regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73
  74 /* Note on debug output:
  75  *
  76  * This is set up so that -Dr turns on debugging like all other flags that are
  77  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
  78  * all regular expressions encountered in a program, and gives a huge amount of
  79  * output for all but the shortest programs.
  80  *
  81  * The ability to output pattern debugging information lexically, and with much
  82  * finer grained control was added, with 'use re qw(Debug ....);' available even
  83  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
  84  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
  85  * Those files are compiled and linked into the perl executable, and they are
  86  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
  87  * to re.pm.
  88  *
  89  * That would normally mean linking errors when two functions of the same name
  90  * are attempted to be placed into the same executable.  That is solved in one
  91  * of four ways:
  92  *  1)  Static functions aren't known outside the file they are in, so for the
  93  *      many functions of that type in this file, it just isn't a problem.
  94  *  2)  Most externally known functions are enclosed in
  95  *          #ifndef PERL_IN_XSUB_RE
  96  *          ...
  97  *          #endif
  98  *      blocks, so there is only one defintion for them in the whole
  99  *      executable, the one in regcomp.c (or regexec.c).  The implication of
 100  *      that is any debugging info that comes from them is controlled only by
 101  *      -Dr.  Further, any static function they call will also be the version
 102  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
 103  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
 104  *      have different names, so that what gets loaded in the executable is
 105  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
 106  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
 107  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
 108  *      versions and their callees are under control of re.pm.   The catch is
 109  *      that references to all these go through the regexp_engine structure,
 110  *      which is initialized in regcomp.h to the Perl_foo versions, and
 111  *      substituted out in lexical scopes where 'use re' is in effect to the
 112  *      'my_foo' ones.   That structure is public API, so it would be a hard
 113  *      sell to add any additional members.
 114  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
 115  *      respectively, regexec.c and re_exec.c, they can have two different
 116  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
 117  *      embed.fnc.
 118  *
 119  * The bottom line is that if you add code to one of the public functions
 120  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
 121  * a new function that needs to do debugging or there is a chain of calls from
 122  * it that need to do debugging, all functions in the chain should use options
 123  * 2) or 4) above.
 124  *
 125  * A function may have to be split so that debugging stuff is static, but it
 126  * calls out to some other function that only gets compiled in regcomp.c to
 127  * access data that we don't want to duplicate.
 128  */
 129
 130 #include "EXTERN.h"
 131 #define PERL_IN_REGCOMP_C
 132 #include "perl.h"
 133
 134 #define REG_COMP_C
 135 #ifdef PERL_IN_XSUB_RE
 136 #  include "re_comp.h"
 137 EXTERN_C const struct regexp_engine my_reg_engine;
 138 EXTERN_C const struct regexp_engine wild_reg_engine;
 139 #else
 140 #  include "regcomp.h"
 141 #endif
 142
 143 #include "invlist_inline.h"
 144 #include "unicode_constants.h"
 145
 146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
 147  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
 149  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 152
 153 #ifndef STATIC
 154 #define STATIC  static
 155 #endif
 156
 157 /* this is a chain of data about sub patterns we are processing that
 158    need to be handled separately/specially in study_chunk. Its so
 159    we can simulate recursion without losing state.  */
 160 struct scan_frame;
 161 typedef struct scan_frame {
 162     regnode *last_regnode;      /* last node to process in this frame */
 163     regnode *next_regnode;      /* next node to process when last is reached */
 164     U32 prev_recursed_depth;
 165     I32 stopparen;              /* what stopparen do we use */
 166     bool in_gosub;              /* this or an outer frame is for GOSUB */
 167
 168     struct scan_frame *this_prev_frame; /* this previous frame */
 169     struct scan_frame *prev_frame;      /* previous frame */
 170     struct scan_frame *next_frame;      /* next frame */
 171 } scan_frame;
 172
 173 /* Certain characters are output as a sequence with the first being a
 174  * backslash. */
 175 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
 176
 177
 178 struct RExC_state_t {
 179     U32         flags;                  /* RXf_* are we folding, multilining? */
 180     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 181     char        *precomp;               /* uncompiled string. */
 182     char        *precomp_end;           /* pointer to end of uncompiled string. */
 183     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 184     regexp      *rx;                    /* perl core regexp structure */
 185     regexp_internal     *rxi;           /* internal data for regexp object
 186                                            pprivate field */
 187     char        *start;                 /* Start of input for compile */
 188     char        *end;                   /* End of input for compile */
 189     char        *parse;                 /* Input-scan pointer. */
 190     char        *copy_start;            /* start of copy of input within
 191                                            constructed parse string */
 192     char        *save_copy_start;       /* Provides one level of saving
 193                                            and restoring 'copy_start' */
 194     char        *copy_start_in_input;   /* Position in input string
 195                                            corresponding to copy_start */
 196     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 197     regnode     *emit_start;            /* Start of emitted-code area */
 198     regnode_offset emit;                /* Code-emit pointer */
 199     I32         naughty;                /* How bad is this pattern? */
 200     I32         sawback;                /* Did we see \1, ...? */
 201     SSize_t     size;                   /* Number of regnode equivalents in
 202                                            pattern */
 203     Size_t      sets_depth;              /* Counts recursion depth of already-
 204                                            compiled regex set patterns */
 205     U32         seen;
 206
 207     I32      parens_buf_size;           /* #slots malloced open/close_parens */
 208     regnode_offset *open_parens;        /* offsets to open parens */
 209     regnode_offset *close_parens;       /* offsets to close parens */
 210     HV          *paren_names;           /* Paren names */
 211
 212     /* position beyond 'precomp' of the warning message furthest away from
 213      * 'precomp'.  During the parse, no warnings are raised for any problems
 214      * earlier in the parse than this position.  This works if warnings are
 215      * raised the first time a given spot is parsed, and if only one
 216      * independent warning is raised for any given spot */
 217     Size_t      latest_warn_offset;
 218
 219     I32         npar;                   /* Capture buffer count so far in the
 220                                            parse, (OPEN) plus one. ("par" 0 is
 221                                            the whole pattern)*/
 222     I32         total_par;              /* During initial parse, is either 0,
 223                                            or -1; the latter indicating a
 224                                            reparse is needed.  After that pass,
 225                                            it is what 'npar' became after the
 226                                            pass.  Hence, it being > 0 indicates
 227                                            we are in a reparse situation */
 228     I32         nestroot;               /* root parens we are in - used by
 229                                            accept */
 230     I32         seen_zerolen;
 231     regnode     *end_op;                /* END node in program */
 232     I32         utf8;           /* whether the pattern is utf8 or not */
 233     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 234                                 /* XXX use this for future optimisation of case
 235                                  * where pattern must be upgraded to utf8. */
 236     I32         uni_semantics;  /* If a d charset modifier should use unicode
 237                                    rules, even if the pattern is not in
 238                                    utf8 */
 239
 240     I32         recurse_count;          /* Number of recurse regops we have generated */
 241     regnode     **recurse;              /* Recurse regops */
 242     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 243                                            through */
 244     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 245     I32         in_lookbehind;
 246     I32         in_lookahead;
 247     I32         contains_locale;
 248     I32         override_recoding;
 249     I32         recode_x_to_native;
 250     I32         in_multi_char_class;
 251     int         code_index;             /* next code_blocks[] slot */
 252     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 253                                             within pattern */
 254     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 255     scan_frame *frame_head;
 256     scan_frame *frame_last;
 257     U32         frame_count;
 258     AV         *warn_text;
 259     HV         *unlexed_names;
 260     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 261 #ifdef DEBUGGING
 262     const char  *lastparse;
 263     I32         lastnum;
 264     U32         study_chunk_recursed_count;
 265     AV          *paren_name_list;       /* idx -> name */
 266     SV          *mysv1;
 267     SV          *mysv2;
 268
 269 #define RExC_lastparse  (pRExC_state->lastparse)
 270 #define RExC_lastnum    (pRExC_state->lastnum)
 271 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 272 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 273 #define RExC_mysv       (pRExC_state->mysv1)
 274 #define RExC_mysv1      (pRExC_state->mysv1)
 275 #define RExC_mysv2      (pRExC_state->mysv2)
 276
 277 #endif
 278     bool        seen_d_op;
 279     bool        strict;
 280     bool        study_started;
 281     bool        in_script_run;
 282     bool        use_BRANCHJ;
 283     bool        sWARN_EXPERIMENTAL__VLB;
 284     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
 285 };
 286
 287 #define RExC_flags      (pRExC_state->flags)
 288 #define RExC_pm_flags   (pRExC_state->pm_flags)
 289 #define RExC_precomp    (pRExC_state->precomp)
 290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 291 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 292 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
 293 #define RExC_precomp_end (pRExC_state->precomp_end)
 294 #define RExC_rx_sv      (pRExC_state->rx_sv)
 295 #define RExC_rx         (pRExC_state->rx)
 296 #define RExC_rxi        (pRExC_state->rxi)
 297 #define RExC_start      (pRExC_state->start)
 298 #define RExC_end        (pRExC_state->end)
 299 #define RExC_parse      (pRExC_state->parse)
 300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 301 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 303                                                    under /d from /u ? */
 304
 305 #ifdef RE_TRACK_PATTERN_OFFSETS
 306 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 307                                                          others */
 308 #endif
 309 #define RExC_emit       (pRExC_state->emit)
 310 #define RExC_emit_start (pRExC_state->emit_start)
 311 #define RExC_sawback    (pRExC_state->sawback)
 312 #define RExC_seen       (pRExC_state->seen)
 313 #define RExC_size       (pRExC_state->size)
 314 #define RExC_maxlen        (pRExC_state->maxlen)
 315 #define RExC_npar       (pRExC_state->npar)
 316 #define RExC_total_parens       (pRExC_state->total_par)
 317 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 318 #define RExC_nestroot   (pRExC_state->nestroot)
 319 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 320 #define RExC_utf8       (pRExC_state->utf8)
 321 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 322 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 323 #define RExC_open_parens        (pRExC_state->open_parens)
 324 #define RExC_close_parens       (pRExC_state->close_parens)
 325 #define RExC_end_op     (pRExC_state->end_op)
 326 #define RExC_paren_names        (pRExC_state->paren_names)
 327 #define RExC_recurse    (pRExC_state->recurse)
 328 #define RExC_recurse_count      (pRExC_state->recurse_count)
 329 #define RExC_sets_depth         (pRExC_state->sets_depth)
 330 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 331 #define RExC_study_chunk_recursed_bytes  \
 332                                    (pRExC_state->study_chunk_recursed_bytes)
 333 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 334 #define RExC_in_lookahead       (pRExC_state->in_lookahead)
 335 #define RExC_contains_locale    (pRExC_state->contains_locale)
 336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 337
 338 #ifdef EBCDIC
 339 #  define SET_recode_x_to_native(x)                                         \
 340                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 341 #else
 342 #  define SET_recode_x_to_native(x) NOOP
 343 #endif
 344
 345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 346 #define RExC_frame_head (pRExC_state->frame_head)
 347 #define RExC_frame_last (pRExC_state->frame_last)
 348 #define RExC_frame_count (pRExC_state->frame_count)
 349 #define RExC_strict (pRExC_state->strict)
 350 #define RExC_study_started      (pRExC_state->study_started)
 351 #define RExC_warn_text (pRExC_state->warn_text)
 352 #define RExC_in_script_run      (pRExC_state->in_script_run)
 353 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
 355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
 356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 357
 358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 359  * a flag to disable back-off on the fixed/floating substrings - if it's
 360  * a high complexity pattern we assume the benefit of avoiding a full match
 361  * is worth the cost of checking for the substrings even if they rarely help.
 362  */
 363 #define RExC_naughty    (pRExC_state->naughty)
 364 #define TOO_NAUGHTY (10)
 365 #define MARK_NAUGHTY(add) \
 366     if (RExC_naughty < TOO_NAUGHTY) \
 367         RExC_naughty += (add)
 368 #define MARK_NAUGHTY_EXP(exp, add) \
 369     if (RExC_naughty < TOO_NAUGHTY) \
 370         RExC_naughty += RExC_naughty / (exp) + (add)
 371
 372 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 373 #define ISMULT2(s)      (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
 374
 375 /*
 376  * Flags to be passed up and down.
 377  */
 378 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 379                                    non-null ones. */
 380 #define SIMPLE          0x02    /* Exactly one character wide */
 381                                 /* (or LNBREAK as a special case) */
 382 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 383 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 384 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 385 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 386                                    calcuate sizes as UTF-8 */
 387
 388 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 389
 390 /* whether trie related optimizations are enabled */
 391 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 392 #define TRIE_STUDY_OPT
 393 #define FULL_TRIE_STUDY
 394 #define TRIE_STCLASS
 395 #endif
 396
 397
 398
 399 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 400 #define PBITVAL(paren) (1 << ((paren) & 7))
 401 #define PAREN_OFFSET(depth) \
 402     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
 403 #define PAREN_TEST(depth, paren) \
 404     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
 405 #define PAREN_SET(depth, paren) \
 406     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
 407 #define PAREN_UNSET(depth, paren) \
 408     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 409
 410 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 411                                      if (!UTF) {                           \
 412                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 413                                          return 0;                         \
 414                                      }                                     \
 415                              } STMT_END
 416
 417 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 418  * a flag that indicates we need to override /d with /u as a result of
 419  * something in the pattern.  It should only be used in regards to calling
 420  * set_regex_charset() or get_regex_charset() */
 421 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 422     STMT_START {                                                            \
 423             if (DEPENDS_SEMANTICS) {                                        \
 424                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 425                 RExC_uni_semantics = 1;                                     \
 426                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 427                     /* No need to restart the parse if we haven't seen      \
 428                      * anything that differs between /u and /d, and no need \
 429                      * to restart immediately if we're going to reparse     \
 430                      * anyway to count parens */                            \
 431                     *flagp |= RESTART_PARSE;                                \
 432                     return restart_retval;                                  \
 433                 }                                                           \
 434             }                                                               \
 435     } STMT_END
 436
 437 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 438     STMT_START {                                                            \
 439                 RExC_use_BRANCHJ = 1;                                       \
 440                 *flagp |= RESTART_PARSE;                                    \
 441                 return restart_retval;                                      \
 442     } STMT_END
 443
 444 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 445  * less.  After that, it must always be positive, because the whole re is
 446  * considered to be surrounded by virtual parens.  Setting it to negative
 447  * indicates there is some construct that needs to know the actual number of
 448  * parens to be properly handled.  And that means an extra pass will be
 449  * required after we've counted them all */
 450 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 451 #define REQUIRE_PARENS_PASS                                                 \
 452     STMT_START {  /* No-op if have completed a pass */                      \
 453                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 454     } STMT_END
 455 #define IN_PARENS_PASS (RExC_total_parens < 0)
 456
 457
 458 /* This is used to return failure (zero) early from the calling function if
 459  * various flags in 'flags' are set.  Two flags always cause a return:
 460  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 461  * additional flags that should cause a return; 0 if none.  If the return will
 462  * be done, '*flagp' is first set to be all of the flags that caused the
 463  * return. */
 464 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 465     STMT_START {                                                            \
 466             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 467                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 468                 return 0;                                                   \
 469             }                                                               \
 470     } STMT_END
 471
 472 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 473
 474 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 475                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 476 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 477                                     if (MUST_RESTART(*(flagp))) return 0
 478
 479 /* This converts the named class defined in regcomp.h to its equivalent class
 480  * number defined in handy.h. */
 481 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 482 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 483
 484 #define _invlist_union_complement_2nd(a, b, output) \
 485                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 486 #define _invlist_intersection_complement_2nd(a, b, output) \
 487                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 488
 489 /* We add a marker if we are deferring expansion of a property that is both
 490  * 1) potentiallly user-defined; and
 491  * 2) could also be an official Unicode property.
 492  *
 493  * Without this marker, any deferred expansion can only be for a user-defined
 494  * one.  This marker shouldn't conflict with any that could be in a legal name,
 495  * and is appended to its name to indicate this.  There is a string and
 496  * character form */
 497 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
 498 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
 499
 500 /* What is infinity for optimization purposes */
 501 #define OPTIMIZE_INFTY  SSize_t_MAX
 502
 503 /* About scan_data_t.
 504
 505   During optimisation we recurse through the regexp program performing
 506   various inplace (keyhole style) optimisations. In addition study_chunk
 507   and scan_commit populate this data structure with information about
 508   what strings MUST appear in the pattern. We look for the longest
 509   string that must appear at a fixed location, and we look for the
 510   longest string that may appear at a floating location. So for instance
 511   in the pattern:
 512
 513     /FOO[xX]A.*B[xX]BAR/
 514
 515   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 516   strings (because they follow a .* construct). study_chunk will identify
 517   both FOO and BAR as being the longest fixed and floating strings respectively.
 518
 519   The strings can be composites, for instance
 520
 521      /(f)(o)(o)/
 522
 523   will result in a composite fixed substring 'foo'.
 524
 525   For each string some basic information is maintained:
 526
 527   - min_offset
 528     This is the position the string must appear at, or not before.
 529     It also implicitly (when combined with minlenp) tells us how many
 530     characters must match before the string we are searching for.
 531     Likewise when combined with minlenp and the length of the string it
 532     tells us how many characters must appear after the string we have
 533     found.
 534
 535   - max_offset
 536     Only used for floating strings. This is the rightmost point that
 537     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
 538     string can occur infinitely far to the right.
 539     For fixed strings, it is equal to min_offset.
 540
 541   - minlenp
 542     A pointer to the minimum number of characters of the pattern that the
 543     string was found inside. This is important as in the case of positive
 544     lookahead or positive lookbehind we can have multiple patterns
 545     involved. Consider
 546
 547     /(?=FOO).*F/
 548
 549     The minimum length of the pattern overall is 3, the minimum length
 550     of the lookahead part is 3, but the minimum length of the part that
 551     will actually match is 1. So 'FOO's minimum length is 3, but the
 552     minimum length for the F is 1. This is important as the minimum length
 553     is used to determine offsets in front of and behind the string being
 554     looked for.  Since strings can be composites this is the length of the
 555     pattern at the time it was committed with a scan_commit. Note that
 556     the length is calculated by study_chunk, so that the minimum lengths
 557     are not known until the full pattern has been compiled, thus the
 558     pointer to the value.
 559
 560   - lookbehind
 561
 562     In the case of lookbehind the string being searched for can be
 563     offset past the start point of the final matching string.
 564     If this value was just blithely removed from the min_offset it would
 565     invalidate some of the calculations for how many chars must match
 566     before or after (as they are derived from min_offset and minlen and
 567     the length of the string being searched for).
 568     When the final pattern is compiled and the data is moved from the
 569     scan_data_t structure into the regexp structure the information
 570     about lookbehind is factored in, with the information that would
 571     have been lost precalculated in the end_shift field for the
 572     associated string.
 573
 574   The fields pos_min and pos_delta are used to store the minimum offset
 575   and the delta to the maximum offset at the current point in the pattern.
 576
 577 */
 578
 579 struct scan_data_substrs {
 580     SV      *str;       /* longest substring found in pattern */
 581     SSize_t min_offset; /* earliest point in string it can appear */
 582     SSize_t max_offset; /* latest point in string it can appear */
 583     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 584     SSize_t lookbehind; /* is the pos of the string modified by LB */
 585     I32 flags;          /* per substring SF_* and SCF_* flags */
 586 };
 587
 588 typedef struct scan_data_t {
 589     /*I32 len_min;      unused */
 590     /*I32 len_delta;    unused */
 591     SSize_t pos_min;
 592     SSize_t pos_delta;
 593     SV *last_found;
 594     SSize_t last_end;       /* min value, <0 unless valid. */
 595     SSize_t last_start_min;
 596     SSize_t last_start_max;
 597     U8      cur_is_floating; /* whether the last_* values should be set as
 598                               * the next fixed (0) or floating (1)
 599                               * substring */
 600
 601     /* [0] is longest fixed substring so far, [1] is longest float so far */
 602     struct scan_data_substrs  substrs[2];
 603
 604     I32 flags;             /* common SF_* and SCF_* flags */
 605     I32 whilem_c;
 606     SSize_t *last_closep;
 607     regnode_ssc *start_class;
 608 } scan_data_t;
 609
 610 /*
 611  * Forward declarations for pregcomp()'s friends.
 612  */
 613
 614 static const scan_data_t zero_scan_data = {
 615     0, 0, NULL, 0, 0, 0, 0,
 616     {
 617         { NULL, 0, 0, 0, 0, 0 },
 618         { NULL, 0, 0, 0, 0, 0 },
 619     },
 620     0, 0, NULL, NULL
 621 };
 622
 623 /* study flags */
 624
 625 #define SF_BEFORE_SEOL          0x0001
 626 #define SF_BEFORE_MEOL          0x0002
 627 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 628
 629 #define SF_IS_INF               0x0040
 630 #define SF_HAS_PAR              0x0080
 631 #define SF_IN_PAR               0x0100
 632 #define SF_HAS_EVAL             0x0200
 633
 634
 635 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 636  * longest substring in the pattern. When it is not set the optimiser keeps
 637  * track of position, but does not keep track of the actual strings seen,
 638  *
 639  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 640  * /foo/i will not.
 641  *
 642  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 643  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 644  * turned off because of the alternation (BRANCH). */
 645 #define SCF_DO_SUBSTR           0x0400
 646
 647 #define SCF_DO_STCLASS_AND      0x0800
 648 #define SCF_DO_STCLASS_OR       0x1000
 649 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 650 #define SCF_WHILEM_VISITED_POS  0x2000
 651
 652 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 653 #define SCF_SEEN_ACCEPT         0x8000
 654 #define SCF_TRIE_DOING_RESTUDY 0x10000
 655 #define SCF_IN_DEFINE          0x20000
 656
 657
 658
 659
 660 #define UTF cBOOL(RExC_utf8)
 661
 662 /* The enums for all these are ordered so things work out correctly */
 663 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 664 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 665                                                      == REGEX_DEPENDS_CHARSET)
 666 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 667 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 668                                                      >= REGEX_UNICODE_CHARSET)
 669 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 670                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 671 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 672                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 673 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 674                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 675
 676 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 677
 678 /* For programs that want to be strictly Unicode compatible by dying if any
 679  * attempt is made to match a non-Unicode code point against a Unicode
 680  * property.  */
 681 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 682
 683 #define OOB_NAMEDCLASS          -1
 684
 685 /* There is no code point that is out-of-bounds, so this is problematic.  But
 686  * its only current use is to initialize a variable that is always set before
 687  * looked at. */
 688 #define OOB_UNICODE             0xDEADBEEF
 689
 690 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 691
 692
 693 /* length of regex to show in messages that don't mark a position within */
 694 #define RegexLengthToShowInErrorMessages 127
 695
 696 /*
 697  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 698  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 699  * op/pragma/warn/regcomp.
 700  */
 701 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 702 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 703
 704 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 705                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 706
 707 /* The code in this file in places uses one level of recursion with parsing
 708  * rebased to an alternate string constructed by us in memory.  This can take
 709  * the form of something that is completely different from the input, or
 710  * something that uses the input as part of the alternate.  In the first case,
 711  * there should be no possibility of an error, as we are in complete control of
 712  * the alternate string.  But in the second case we don't completely control
 713  * the input portion, so there may be errors in that.  Here's an example:
 714  *      /[abc\x{DF}def]/ui
 715  * is handled specially because \x{df} folds to a sequence of more than one
 716  * character: 'ss'.  What is done is to create and parse an alternate string,
 717  * which looks like this:
 718  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 719  * where it uses the input unchanged in the middle of something it constructs,
 720  * which is a branch for the DF outside the character class, and clustering
 721  * parens around the whole thing. (It knows enough to skip the DF inside the
 722  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 723  * need to be reported.  The general situation looks like this:
 724  *
 725  *                                       |<------- identical ------>|
 726  *              sI                       tI               xI       eI
 727  * Input:       ---------------------------------------------------------------
 728  * Constructed:         ---------------------------------------------------
 729  *                      sC               tC               xC       eC     EC
 730  *                                       |<------- identical ------>|
 731  *
 732  * sI..eI   is the portion of the input pattern we are concerned with here.
 733  * sC..EC   is the constructed substitute parse string.
 734  *  sC..tC  is constructed by us
 735  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 736  *          In the diagram, these are vertically aligned.
 737  *  eC..EC  is also constructed by us.
 738  * xC       is the position in the substitute parse string where we found a
 739  *          problem.
 740  * xI       is the position in the original pattern corresponding to xC.
 741  *
 742  * We want to display a message showing the real input string.  Thus we need to
 743  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 744  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 745  * get:
 746  *      xI = tI + (xC - tC)
 747  *
 748  * When the substitute parse is constructed, the code needs to set:
 749  *      RExC_start (sC)
 750  *      RExC_end (eC)
 751  *      RExC_copy_start_in_input  (tI)
 752  *      RExC_copy_start_in_constructed (tC)
 753  * and restore them when done.
 754  *
 755  * During normal processing of the input pattern, both
 756  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 757  * sI, so that xC equals xI.
 758  */
 759
 760 #define sI              RExC_precomp
 761 #define eI              RExC_precomp_end
 762 #define sC              RExC_start
 763 #define eC              RExC_end
 764 #define tI              RExC_copy_start_in_input
 765 #define tC              RExC_copy_start_in_constructed
 766 #define xI(xC)          (tI + (xC - tC))
 767 #define xI_offset(xC)   (xI(xC) - sI)
 768
 769 #define REPORT_LOCATION_ARGS(xC)                                            \
 770     UTF8fARG(UTF,                                                           \
 771              (xI(xC) > eI) /* Don't run off end */                          \
 772               ? eI - sI   /* Length before the <--HERE */                   \
 773               : ((xI_offset(xC) >= 0)                                       \
 774                  ? xI_offset(xC)                                            \
 775                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 776                                     IVdf " trying to output message for "   \
 777                                     " pattern %.*s",                        \
 778                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 779                                     ((int) (eC - sC)), sC), 0)),            \
 780              sI),         /* The input pattern printed up to the <--HERE */ \
 781     UTF8fARG(UTF,                                                           \
 782              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 783              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 784
 785 /* Used to point after bad bytes for an error message, but avoid skipping
 786  * past a nul byte. */
 787 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 788
 789 /* Set up to clean up after our imminent demise */
 790 #define PREPARE_TO_DIE                                                      \
 791     STMT_START {                                                            \
 792         if (RExC_rx_sv)                                                     \
 793             SAVEFREESV(RExC_rx_sv);                                         \
 794         if (RExC_open_parens)                                               \
 795             SAVEFREEPV(RExC_open_parens);                                   \
 796         if (RExC_close_parens)                                              \
 797             SAVEFREEPV(RExC_close_parens);                                  \
 798     } STMT_END
 799
 800 /*
 801  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 802  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 803  * "...".
 804  */
 805 #define _FAIL(code) STMT_START {                                        \
 806     const char *ellipses = "";                                          \
 807     IV len = RExC_precomp_end - RExC_precomp;                           \
 808                                                                         \
 809     PREPARE_TO_DIE;                                                     \
 810     if (len > RegexLengthToShowInErrorMessages) {                       \
 811         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 812         len = RegexLengthToShowInErrorMessages - 10;                    \
 813         ellipses = "...";                                               \
 814     }                                                                   \
 815     code;                                                               \
 816 } STMT_END
 817
 818 #define FAIL(msg) _FAIL(                            \
 819     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 820             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 821
 822 #define FAIL2(msg,arg) _FAIL(                       \
 823     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 824             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 825
 826 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 827     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 828      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 829
 830 /*
 831  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 832  */
 833 #define Simple_vFAIL(m) STMT_START {                                    \
 834     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 835             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 836 } STMT_END
 837
 838 /*
 839  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 840  */
 841 #define vFAIL(m) STMT_START {                           \
 842     PREPARE_TO_DIE;                                     \
 843     Simple_vFAIL(m);                                    \
 844 } STMT_END
 845
 846 /*
 847  * Like Simple_vFAIL(), but accepts two arguments.
 848  */
 849 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 850     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
 851                       REPORT_LOCATION_ARGS(RExC_parse));        \
 852 } STMT_END
 853
 854 /*
 855  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 856  */
 857 #define vFAIL2(m,a1) STMT_START {                       \
 858     PREPARE_TO_DIE;                                     \
 859     Simple_vFAIL2(m, a1);                               \
 860 } STMT_END
 861
 862
 863 /*
 864  * Like Simple_vFAIL(), but accepts three arguments.
 865  */
 866 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 867     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,            \
 868             REPORT_LOCATION_ARGS(RExC_parse));                  \
 869 } STMT_END
 870
 871 /*
 872  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 873  */
 874 #define vFAIL3(m,a1,a2) STMT_START {                    \
 875     PREPARE_TO_DIE;                                     \
 876     Simple_vFAIL3(m, a1, a2);                           \
 877 } STMT_END
 878
 879 /*
 880  * Like Simple_vFAIL(), but accepts four arguments.
 881  */
 882 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 883     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,        \
 884             REPORT_LOCATION_ARGS(RExC_parse));                  \
 885 } STMT_END
 886
 887 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 888     PREPARE_TO_DIE;                                     \
 889     Simple_vFAIL4(m, a1, a2, a3);                       \
 890 } STMT_END
 891
 892 /* A specialized version of vFAIL2 that works with UTF8f */
 893 #define vFAIL2utf8f(m, a1) STMT_START {             \
 894     PREPARE_TO_DIE;                                 \
 895     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
 896             REPORT_LOCATION_ARGS(RExC_parse));      \
 897 } STMT_END
 898
 899 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 900     PREPARE_TO_DIE;                                     \
 901     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
 902             REPORT_LOCATION_ARGS(RExC_parse));          \
 903 } STMT_END
 904
 905 /* Setting this to NULL is a signal to not output warnings */
 906 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 907     STMT_START {                                                            \
 908       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 909       RExC_copy_start_in_constructed = NULL;                                \
 910     } STMT_END
 911 #define RESTORE_WARNINGS                                                    \
 912     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 913
 914 /* Since a warning can be generated multiple times as the input is reparsed, we
 915  * output it the first time we come to that point in the parse, but suppress it
 916  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 917  * generate any warnings */
 918 #define TO_OUTPUT_WARNINGS(loc)                                         \
 919   (   RExC_copy_start_in_constructed                                    \
 920    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 921
 922 /* After we've emitted a warning, we save the position in the input so we don't
 923  * output it again */
 924 #define UPDATE_WARNINGS_LOC(loc)                                        \
 925     STMT_START {                                                        \
 926         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 927             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
 928                                                        - RExC_precomp;  \
 929         }                                                               \
 930     } STMT_END
 931
 932 /* 'warns' is the output of the packWARNx macro used in 'code' */
 933 #define _WARN_HELPER(loc, warns, code)                                  \
 934     STMT_START {                                                        \
 935         if (! RExC_copy_start_in_constructed) {                         \
 936             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 937                               " expected at '%s'",                      \
 938                               __FILE__, __LINE__, loc);                 \
 939         }                                                               \
 940         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 941             if (ckDEAD(warns))                                          \
 942                 PREPARE_TO_DIE;                                         \
 943             code;                                                       \
 944             UPDATE_WARNINGS_LOC(loc);                                   \
 945         }                                                               \
 946     } STMT_END
 947
 948 /* m is not necessarily a "literal string", in this macro */
 949 #define warn_non_literal_string(loc, packed_warn, m)                    \
 950     _WARN_HELPER(loc, packed_warn,                                      \
 951                       Perl_warner(aTHX_ packed_warn,                    \
 952                                        "%s" REPORT_LOCATION,            \
 953                                   m, REPORT_LOCATION_ARGS(loc)))
 954 #define reg_warn_non_literal_string(loc, m)                             \
 955                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
 956
 957 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
 958     STMT_START {                                                            \
 959                 char * format;                                              \
 960                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
 961                 Newx(format, format_size, char);                            \
 962                 my_strlcpy(format, m, format_size);                         \
 963                 my_strlcat(format, REPORT_LOCATION, format_size);           \
 964                 SAVEFREEPV(format);                                         \
 965                 _WARN_HELPER(loc, packwarn,                                 \
 966                       Perl_ck_warner(aTHX_ packwarn,                        \
 967                                         format,                             \
 968                                         a1, REPORT_LOCATION_ARGS(loc)));    \
 969     } STMT_END
 970
 971 #define ckWARNreg(loc,m)                                                \
 972     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 973                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 974                                           m REPORT_LOCATION,            \
 975                                           REPORT_LOCATION_ARGS(loc)))
 976
 977 #define vWARN(loc, m)                                                   \
 978     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 979                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 980                                        m REPORT_LOCATION,               \
 981                                        REPORT_LOCATION_ARGS(loc)))      \
 982
 983 #define vWARN_dep(loc, m)                                               \
 984     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 985                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 986                                        m REPORT_LOCATION,               \
 987                                        REPORT_LOCATION_ARGS(loc)))
 988
 989 #define ckWARNdep(loc,m)                                                \
 990     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 991                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 992                                             m REPORT_LOCATION,          \
 993                                             REPORT_LOCATION_ARGS(loc)))
 994
 995 #define ckWARNregdep(loc,m)                                                 \
 996     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 997                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 998                                                       WARN_REGEXP),         \
 999                                              m REPORT_LOCATION,             \
1000                                              REPORT_LOCATION_ARGS(loc)))
1001
1002 #define ckWARN2reg_d(loc,m, a1)                                             \
1003     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1004                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1005                                             m REPORT_LOCATION,              \
1006                                             a1, REPORT_LOCATION_ARGS(loc)))
1007
1008 #define ckWARN2reg(loc, m, a1)                                              \
1009     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1010                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1011                                           m REPORT_LOCATION,                \
1012                                           a1, REPORT_LOCATION_ARGS(loc)))
1013
1014 #define vWARN3(loc, m, a1, a2)                                              \
1015     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1016                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1017                                        m REPORT_LOCATION,                   \
1018                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
1019
1020 #define ckWARN3reg(loc, m, a1, a2)                                          \
1021     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1022                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1023                                           m REPORT_LOCATION,                \
1024                                           a1, a2,                           \
1025                                           REPORT_LOCATION_ARGS(loc)))
1026
1027 #define vWARN4(loc, m, a1, a2, a3)                                      \
1028     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1029                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1030                                        m REPORT_LOCATION,               \
1031                                        a1, a2, a3,                      \
1032                                        REPORT_LOCATION_ARGS(loc)))
1033
1034 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
1035     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1036                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1037                                           m REPORT_LOCATION,            \
1038                                           a1, a2, a3,                   \
1039                                           REPORT_LOCATION_ARGS(loc)))
1040
1041 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
1042     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1043                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1044                                        m REPORT_LOCATION,               \
1045                                        a1, a2, a3, a4,                  \
1046                                        REPORT_LOCATION_ARGS(loc)))
1047
1048 #define ckWARNexperimental(loc, class, m)                               \
1049     STMT_START {                                                        \
1050         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1051             RExC_warned_ ## class = 1;                                  \
1052             _WARN_HELPER(loc, packWARN(class),                          \
1053                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1054                                             m REPORT_LOCATION,          \
1055                                             REPORT_LOCATION_ARGS(loc)));\
1056         }                                                               \
1057     } STMT_END
1058
1059 /* Convert between a pointer to a node and its offset from the beginning of the
1060  * program */
1061 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1062 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1063
1064 /* Macros for recording node offsets.   20001227 mjd@plover.com
1065  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
1066  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
1067  * Element 0 holds the number n.
1068  * Position is 1 indexed.
1069  */
1070 #ifndef RE_TRACK_PATTERN_OFFSETS
1071 #define Set_Node_Offset_To_R(offset,byte)
1072 #define Set_Node_Offset(node,byte)
1073 #define Set_Cur_Node_Offset
1074 #define Set_Node_Length_To_R(node,len)
1075 #define Set_Node_Length(node,len)
1076 #define Set_Node_Cur_Length(node,start)
1077 #define Node_Offset(n)
1078 #define Node_Length(n)
1079 #define Set_Node_Offset_Length(node,offset,len)
1080 #define ProgLen(ri) ri->u.proglen
1081 #define SetProgLen(ri,x) ri->u.proglen = x
1082 #define Track_Code(code)
1083 #else
1084 #define ProgLen(ri) ri->u.offsets[0]
1085 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1086 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
1087         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
1088                     __LINE__, (int)(offset), (int)(byte)));             \
1089         if((offset) < 0) {                                              \
1090             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1091                                          (int)(offset));                \
1092         } else {                                                        \
1093             RExC_offsets[2*(offset)-1] = (byte);                        \
1094         }                                                               \
1095 } STMT_END
1096
1097 #define Set_Node_Offset(node,byte)                                      \
1098     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1099 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1100
1101 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1102         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1103                 __LINE__, (int)(node), (int)(len)));                    \
1104         if((node) < 0) {                                                \
1105             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1106                                          (int)(node));                  \
1107         } else {                                                        \
1108             RExC_offsets[2*(node)] = (len);                             \
1109         }                                                               \
1110 } STMT_END
1111
1112 #define Set_Node_Length(node,len) \
1113     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1114 #define Set_Node_Cur_Length(node, start)                \
1115     Set_Node_Length(node, RExC_parse - start)
1116
1117 /* Get offsets and lengths */
1118 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1119 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1120
1121 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1122     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1123     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1124 } STMT_END
1125
1126 #define Track_Code(code) STMT_START { code } STMT_END
1127 #endif
1128
1129 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1130 #define EXPERIMENTAL_INPLACESCAN
1131 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1132
1133 #ifdef DEBUGGING
1134 int
1135 Perl_re_printf(pTHX_ const char *fmt, ...)
1136 {
1137     va_list ap;
1138     int result;
1139     PerlIO *f= Perl_debug_log;
1140     PERL_ARGS_ASSERT_RE_PRINTF;
1141     va_start(ap, fmt);
1142     result = PerlIO_vprintf(f, fmt, ap);
1143     va_end(ap);
1144     return result;
1145 }
1146
1147 int
1148 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1149 {
1150     va_list ap;
1151     int result;
1152     PerlIO *f= Perl_debug_log;
1153     PERL_ARGS_ASSERT_RE_INDENTF;
1154     va_start(ap, depth);
1155     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1156     result = PerlIO_vprintf(f, fmt, ap);
1157     va_end(ap);
1158     return result;
1159 }
1160 #endif /* DEBUGGING */
1161
1162 #define DEBUG_RExC_seen()                                                   \
1163         DEBUG_OPTIMISE_MORE_r({                                             \
1164             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1165                                                                             \
1166             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1167                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1168                                                                             \
1169             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1170                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1171                                                                             \
1172             if (RExC_seen & REG_GPOS_SEEN)                                  \
1173                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1174                                                                             \
1175             if (RExC_seen & REG_RECURSE_SEEN)                               \
1176                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1177                                                                             \
1178             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1179                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1180                                                                             \
1181             if (RExC_seen & REG_VERBARG_SEEN)                               \
1182                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1183                                                                             \
1184             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1185                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1186                                                                             \
1187             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1188                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1189                                                                             \
1190             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1191                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1192                                                                             \
1193             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1194                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1195                                                                             \
1196             Perl_re_printf( aTHX_ "\n");                                    \
1197         });
1198
1199 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1200   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1201
1202
1203 #ifdef DEBUGGING
1204 static void
1205 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1206                                     const char *close_str)
1207 {
1208     if (!flags)
1209         return;
1210
1211     Perl_re_printf( aTHX_  "%s", open_str);
1212     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1213     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1214     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1215     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1216     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1217     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1218     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1219     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1220     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1221     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1222     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1223     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1224     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1225     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1226     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1227     Perl_re_printf( aTHX_  "%s", close_str);
1228 }
1229
1230
1231 static void
1232 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1233                     U32 depth, int is_inf)
1234 {
1235     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1236
1237     DEBUG_OPTIMISE_MORE_r({
1238         if (!data)
1239             return;
1240         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1241             depth,
1242             where,
1243             (IV)data->pos_min,
1244             (IV)data->pos_delta,
1245             (UV)data->flags
1246         );
1247
1248         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1249
1250         Perl_re_printf( aTHX_
1251             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1252             (IV)data->whilem_c,
1253             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1254             is_inf ? "INF " : ""
1255         );
1256
1257         if (data->last_found) {
1258             int i;
1259             Perl_re_printf(aTHX_
1260                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1261                     SvPVX_const(data->last_found),
1262                     (IV)data->last_end,
1263                     (IV)data->last_start_min,
1264                     (IV)data->last_start_max
1265             );
1266
1267             for (i = 0; i < 2; i++) {
1268                 Perl_re_printf(aTHX_
1269                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1270                     data->cur_is_floating == i ? "*" : "",
1271                     i ? "Float" : "Fixed",
1272                     SvPVX_const(data->substrs[i].str),
1273                     (IV)data->substrs[i].min_offset,
1274                     (IV)data->substrs[i].max_offset
1275                 );
1276                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1277             }
1278         }
1279
1280         Perl_re_printf( aTHX_ "\n");
1281     });
1282 }
1283
1284
1285 static void
1286 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1287                 regnode *scan, U32 depth, U32 flags)
1288 {
1289     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1290
1291     DEBUG_OPTIMISE_r({
1292         regnode *Next;
1293
1294         if (!scan)
1295             return;
1296         Next = regnext(scan);
1297         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1298         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1299             depth,
1300             str,
1301             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1302             Next ? (REG_NODE_NUM(Next)) : 0 );
1303         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1304         Perl_re_printf( aTHX_  "\n");
1305    });
1306 }
1307
1308
1309 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1310                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1311
1312 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1313                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1314
1315 #else
1316 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1317 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1318 #endif
1319
1320
1321 /* =========================================================
1322  * BEGIN edit_distance stuff.
1323  *
1324  * This calculates how many single character changes of any type are needed to
1325  * transform a string into another one.  It is taken from version 3.1 of
1326  *
1327  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1328  */
1329
1330 /* Our unsorted dictionary linked list.   */
1331 /* Note we use UVs, not chars. */
1332
1333 struct dictionary{
1334   UV key;
1335   UV value;
1336   struct dictionary* next;
1337 };
1338 typedef struct dictionary item;
1339
1340
1341 PERL_STATIC_INLINE item*
1342 push(UV key, item* curr)
1343 {
1344     item* head;
1345     Newx(head, 1, item);
1346     head->key = key;
1347     head->value = 0;
1348     head->next = curr;
1349     return head;
1350 }
1351
1352
1353 PERL_STATIC_INLINE item*
1354 find(item* head, UV key)
1355 {
1356     item* iterator = head;
1357     while (iterator){
1358         if (iterator->key == key){
1359             return iterator;
1360         }
1361         iterator = iterator->next;
1362     }
1363
1364     return NULL;
1365 }
1366
1367 PERL_STATIC_INLINE item*
1368 uniquePush(item* head, UV key)
1369 {
1370     item* iterator = head;
1371
1372     while (iterator){
1373         if (iterator->key == key) {
1374             return head;
1375         }
1376         iterator = iterator->next;
1377     }
1378
1379     return push(key, head);
1380 }
1381
1382 PERL_STATIC_INLINE void
1383 dict_free(item* head)
1384 {
1385     item* iterator = head;
1386
1387     while (iterator) {
1388         item* temp = iterator;
1389         iterator = iterator->next;
1390         Safefree(temp);
1391     }
1392
1393     head = NULL;
1394 }
1395
1396 /* End of Dictionary Stuff */
1397
1398 /* All calculations/work are done here */
1399 STATIC int
1400 S_edit_distance(const UV* src,
1401                 const UV* tgt,
1402                 const STRLEN x,             /* length of src[] */
1403                 const STRLEN y,             /* length of tgt[] */
1404                 const SSize_t maxDistance
1405 )
1406 {
1407     item *head = NULL;
1408     UV swapCount, swapScore, targetCharCount, i, j;
1409     UV *scores;
1410     UV score_ceil = x + y;
1411
1412     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1413
1414     /* intialize matrix start values */
1415     Newx(scores, ( (x + 2) * (y + 2)), UV);
1416     scores[0] = score_ceil;
1417     scores[1 * (y + 2) + 0] = score_ceil;
1418     scores[0 * (y + 2) + 1] = score_ceil;
1419     scores[1 * (y + 2) + 1] = 0;
1420     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1421
1422     /* work loops    */
1423     /* i = src index */
1424     /* j = tgt index */
1425     for (i=1;i<=x;i++) {
1426         if (i < x)
1427             head = uniquePush(head, src[i]);
1428         scores[(i+1) * (y + 2) + 1] = i;
1429         scores[(i+1) * (y + 2) + 0] = score_ceil;
1430         swapCount = 0;
1431
1432         for (j=1;j<=y;j++) {
1433             if (i == 1) {
1434                 if(j < y)
1435                 head = uniquePush(head, tgt[j]);
1436                 scores[1 * (y + 2) + (j + 1)] = j;
1437                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1438             }
1439
1440             targetCharCount = find(head, tgt[j-1])->value;
1441             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1442
1443             if (src[i-1] != tgt[j-1]){
1444                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1445             }
1446             else {
1447                 swapCount = j;
1448                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1449             }
1450         }
1451
1452         find(head, src[i-1])->value = i;
1453     }
1454
1455     {
1456         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1457         dict_free(head);
1458         Safefree(scores);
1459         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1460     }
1461 }
1462
1463 /* END of edit_distance() stuff
1464  * ========================================================= */
1465
1466 /* Mark that we cannot extend a found fixed substring at this point.
1467    Update the longest found anchored substring or the longest found
1468    floating substrings if needed. */
1469
1470 STATIC void
1471 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1472                     SSize_t *minlenp, int is_inf)
1473 {
1474     const STRLEN l = CHR_SVLEN(data->last_found);
1475     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1476     const STRLEN old_l = CHR_SVLEN(longest_sv);
1477     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1478
1479     PERL_ARGS_ASSERT_SCAN_COMMIT;
1480
1481     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1482         const U8 i = data->cur_is_floating;
1483         SvSetMagicSV(longest_sv, data->last_found);
1484         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1485
1486         if (!i) /* fixed */
1487             data->substrs[0].max_offset = data->substrs[0].min_offset;
1488         else { /* float */
1489             data->substrs[1].max_offset =
1490                       (is_inf)
1491                        ? OPTIMIZE_INFTY
1492                        : (l
1493                           ? data->last_start_max
1494                           /* temporary underflow guard for 5.32 */
1495                           : data->pos_delta < 0 ? OPTIMIZE_INFTY
1496                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1497                                          ? OPTIMIZE_INFTY
1498                                          : data->pos_min + data->pos_delta));
1499         }
1500
1501         data->substrs[i].flags &= ~SF_BEFORE_EOL;
1502         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1503         data->substrs[i].minlenp = minlenp;
1504         data->substrs[i].lookbehind = 0;
1505     }
1506
1507     SvCUR_set(data->last_found, 0);
1508     {
1509         SV * const sv = data->last_found;
1510         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1511             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1512             if (mg)
1513                 mg->mg_len = 0;
1514         }
1515     }
1516     data->last_end = -1;
1517     data->flags &= ~SF_BEFORE_EOL;
1518     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1519 }
1520
1521 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1522  * list that describes which code points it matches */
1523
1524 STATIC void
1525 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1526 {
1527     /* Set the SSC 'ssc' to match an empty string or any code point */
1528
1529     PERL_ARGS_ASSERT_SSC_ANYTHING;
1530
1531     assert(is_ANYOF_SYNTHETIC(ssc));
1532
1533     /* mortalize so won't leak */
1534     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1535     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1536 }
1537
1538 STATIC int
1539 S_ssc_is_anything(const regnode_ssc *ssc)
1540 {
1541     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1542      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1543      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1544      * in any way, so there's no point in using it */
1545
1546     UV start, end;
1547     bool ret;
1548
1549     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1550
1551     assert(is_ANYOF_SYNTHETIC(ssc));
1552
1553     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1554         return FALSE;
1555     }
1556
1557     /* See if the list consists solely of the range 0 - Infinity */
1558     invlist_iterinit(ssc->invlist);
1559     ret = invlist_iternext(ssc->invlist, &start, &end)
1560           && start == 0
1561           && end == UV_MAX;
1562
1563     invlist_iterfinish(ssc->invlist);
1564
1565     if (ret) {
1566         return TRUE;
1567     }
1568
1569     /* If e.g., both \w and \W are set, matches everything */
1570     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1571         int i;
1572         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1573             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1574                 return TRUE;
1575             }
1576         }
1577     }
1578
1579     return FALSE;
1580 }
1581
1582 STATIC void
1583 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1584 {
1585     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1586      * string, any code point, or any posix class under locale */
1587
1588     PERL_ARGS_ASSERT_SSC_INIT;
1589
1590     Zero(ssc, 1, regnode_ssc);
1591     set_ANYOF_SYNTHETIC(ssc);
1592     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1593     ssc_anything(ssc);
1594
1595     /* If any portion of the regex is to operate under locale rules that aren't
1596      * fully known at compile time, initialization includes it.  The reason
1597      * this isn't done for all regexes is that the optimizer was written under
1598      * the assumption that locale was all-or-nothing.  Given the complexity and
1599      * lack of documentation in the optimizer, and that there are inadequate
1600      * test cases for locale, many parts of it may not work properly, it is
1601      * safest to avoid locale unless necessary. */
1602     if (RExC_contains_locale) {
1603         ANYOF_POSIXL_SETALL(ssc);
1604     }
1605     else {
1606         ANYOF_POSIXL_ZERO(ssc);
1607     }
1608 }
1609
1610 STATIC int
1611 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1612                         const regnode_ssc *ssc)
1613 {
1614     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1615      * to the list of code points matched, and locale posix classes; hence does
1616      * not check its flags) */
1617
1618     UV start, end;
1619     bool ret;
1620
1621     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1622
1623     assert(is_ANYOF_SYNTHETIC(ssc));
1624
1625     invlist_iterinit(ssc->invlist);
1626     ret = invlist_iternext(ssc->invlist, &start, &end)
1627           && start == 0
1628           && end == UV_MAX;
1629
1630     invlist_iterfinish(ssc->invlist);
1631
1632     if (! ret) {
1633         return FALSE;
1634     }
1635
1636     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1637         return FALSE;
1638     }
1639
1640     return TRUE;
1641 }
1642
1643 #define INVLIST_INDEX 0
1644 #define ONLY_LOCALE_MATCHES_INDEX 1
1645 #define DEFERRED_USER_DEFINED_INDEX 2
1646
1647 STATIC SV*
1648 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1649                                const regnode_charclass* const node)
1650 {
1651     /* Returns a mortal inversion list defining which code points are matched
1652      * by 'node', which is of type ANYOF.  Handles complementing the result if
1653      * appropriate.  If some code points aren't knowable at this time, the
1654      * returned list must, and will, contain every code point that is a
1655      * possibility. */
1656
1657     SV* invlist = NULL;
1658     SV* only_utf8_locale_invlist = NULL;
1659     unsigned int i;
1660     const U32 n = ARG(node);
1661     bool new_node_has_latin1 = FALSE;
1662     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1663                       ? 0
1664                       : ANYOF_FLAGS(node);
1665
1666     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1667
1668     /* Look at the data structure created by S_set_ANYOF_arg() */
1669     if (n != ANYOF_ONLY_HAS_BITMAP) {
1670         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1671         AV * const av = MUTABLE_AV(SvRV(rv));
1672         SV **const ary = AvARRAY(av);
1673         assert(RExC_rxi->data->what[n] == 's');
1674
1675         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1676
1677             /* Here there are things that won't be known until runtime -- we
1678              * have to assume it could be anything */
1679             invlist = sv_2mortal(_new_invlist(1));
1680             return _add_range_to_invlist(invlist, 0, UV_MAX);
1681         }
1682         else if (ary[INVLIST_INDEX]) {
1683
1684             /* Use the node's inversion list */
1685             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1686         }
1687
1688         /* Get the code points valid only under UTF-8 locales */
1689         if (   (flags & ANYOFL_FOLD)
1690             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1691         {
1692             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1693         }
1694     }
1695
1696     if (! invlist) {
1697         invlist = sv_2mortal(_new_invlist(0));
1698     }
1699
1700     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1701      * code points, and an inversion list for the others, but if there are code
1702      * points that should match only conditionally on the target string being
1703      * UTF-8, those are placed in the inversion list, and not the bitmap.
1704      * Since there are circumstances under which they could match, they are
1705      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1706      * to exclude them here, so that when we invert below, the end result
1707      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1708      * have to do this here before we add the unconditionally matched code
1709      * points */
1710     if (flags & ANYOF_INVERT) {
1711         _invlist_intersection_complement_2nd(invlist,
1712                                              PL_UpperLatin1,
1713                                              &invlist);
1714     }
1715
1716     /* Add in the points from the bit map */
1717     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1718         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1719             if (ANYOF_BITMAP_TEST(node, i)) {
1720                 unsigned int start = i++;
1721
1722                 for (;    i < NUM_ANYOF_CODE_POINTS
1723                        && ANYOF_BITMAP_TEST(node, i); ++i)
1724                 {
1725                     /* empty */
1726                 }
1727                 invlist = _add_range_to_invlist(invlist, start, i-1);
1728                 new_node_has_latin1 = TRUE;
1729             }
1730         }
1731     }
1732
1733     /* If this can match all upper Latin1 code points, have to add them
1734      * as well.  But don't add them if inverting, as when that gets done below,
1735      * it would exclude all these characters, including the ones it shouldn't
1736      * that were added just above */
1737     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1738         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1739     {
1740         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1741     }
1742
1743     /* Similarly for these */
1744     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1745         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1746     }
1747
1748     if (flags & ANYOF_INVERT) {
1749         _invlist_invert(invlist);
1750     }
1751     else if (flags & ANYOFL_FOLD) {
1752         if (new_node_has_latin1) {
1753
1754             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1755              * the locale.  We can skip this if there are no 0-255 at all. */
1756             _invlist_union(invlist, PL_Latin1, &invlist);
1757
1758             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1759             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1760         }
1761         else {
1762             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1763                 invlist = add_cp_to_invlist(invlist, 'I');
1764             }
1765             if (_invlist_contains_cp(invlist,
1766                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1767             {
1768                 invlist = add_cp_to_invlist(invlist, 'i');
1769             }
1770         }
1771     }
1772
1773     /* Similarly add the UTF-8 locale possible matches.  These have to be
1774      * deferred until after the non-UTF-8 locale ones are taken care of just
1775      * above, or it leads to wrong results under ANYOF_INVERT */
1776     if (only_utf8_locale_invlist) {
1777         _invlist_union_maybe_complement_2nd(invlist,
1778                                             only_utf8_locale_invlist,
1779                                             flags & ANYOF_INVERT,
1780                                             &invlist);
1781     }
1782
1783     return invlist;
1784 }
1785
1786 /* These two functions currently do the exact same thing */
1787 #define ssc_init_zero           ssc_init
1788
1789 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1790 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1791
1792 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1793  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1794  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1795
1796 STATIC void
1797 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1798                 const regnode_charclass *and_with)
1799 {
1800     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1801      * another SSC or a regular ANYOF class.  Can create false positives. */
1802
1803     SV* anded_cp_list;
1804     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1805                           ? 0
1806                           : ANYOF_FLAGS(and_with);
1807     U8  anded_flags;
1808
1809     PERL_ARGS_ASSERT_SSC_AND;
1810
1811     assert(is_ANYOF_SYNTHETIC(ssc));
1812
1813     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1814      * the code point inversion list and just the relevant flags */
1815     if (is_ANYOF_SYNTHETIC(and_with)) {
1816         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1817         anded_flags = and_with_flags;
1818
1819         /* XXX This is a kludge around what appears to be deficiencies in the
1820          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1821          * there are paths through the optimizer where it doesn't get weeded
1822          * out when it should.  And if we don't make some extra provision for
1823          * it like the code just below, it doesn't get added when it should.
1824          * This solution is to add it only when AND'ing, which is here, and
1825          * only when what is being AND'ed is the pristine, original node
1826          * matching anything.  Thus it is like adding it to ssc_anything() but
1827          * only when the result is to be AND'ed.  Probably the same solution
1828          * could be adopted for the same problem we have with /l matching,
1829          * which is solved differently in S_ssc_init(), and that would lead to
1830          * fewer false positives than that solution has.  But if this solution
1831          * creates bugs, the consequences are only that a warning isn't raised
1832          * that should be; while the consequences for having /l bugs is
1833          * incorrect matches */
1834         if (ssc_is_anything((regnode_ssc *)and_with)) {
1835             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1836         }
1837     }
1838     else {
1839         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1840         if (OP(and_with) == ANYOFD) {
1841             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1842         }
1843         else {
1844             anded_flags = and_with_flags
1845             &( ANYOF_COMMON_FLAGS
1846               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1847               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1848             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1849                 anded_flags &=
1850                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1851             }
1852         }
1853     }
1854
1855     ANYOF_FLAGS(ssc) &= anded_flags;
1856
1857     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1858      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1859      * 'and_with' may be inverted.  When not inverted, we have the situation of
1860      * computing:
1861      *  (C1 | P1) & (C2 | P2)
1862      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1863      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1864      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1865      *                    <=  ((C1 & C2) | P1 | P2)
1866      * Alternatively, the last few steps could be:
1867      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1868      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1869      *                    <=  (C1 | C2 | (P1 & P2))
1870      * We favor the second approach if either P1 or P2 is non-empty.  This is
1871      * because these components are a barrier to doing optimizations, as what
1872      * they match cannot be known until the moment of matching as they are
1873      * dependent on the current locale, 'AND"ing them likely will reduce or
1874      * eliminate them.
1875      * But we can do better if we know that C1,P1 are in their initial state (a
1876      * frequent occurrence), each matching everything:
1877      *  (<everything>) & (C2 | P2) =  C2 | P2
1878      * Similarly, if C2,P2 are in their initial state (again a frequent
1879      * occurrence), the result is a no-op
1880      *  (C1 | P1) & (<everything>) =  C1 | P1
1881      *
1882      * Inverted, we have
1883      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1884      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1885      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1886      * */
1887
1888     if ((and_with_flags & ANYOF_INVERT)
1889         && ! is_ANYOF_SYNTHETIC(and_with))
1890     {
1891         unsigned int i;
1892
1893         ssc_intersection(ssc,
1894                          anded_cp_list,
1895                          FALSE /* Has already been inverted */
1896                          );
1897
1898         /* If either P1 or P2 is empty, the intersection will be also; can skip
1899          * the loop */
1900         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1901             ANYOF_POSIXL_ZERO(ssc);
1902         }
1903         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1904
1905             /* Note that the Posix class component P from 'and_with' actually
1906              * looks like:
1907              *      P = Pa | Pb | ... | Pn
1908              * where each component is one posix class, such as in [\w\s].
1909              * Thus
1910              *      ~P = ~(Pa | Pb | ... | Pn)
1911              *         = ~Pa & ~Pb & ... & ~Pn
1912              *        <= ~Pa | ~Pb | ... | ~Pn
1913              * The last is something we can easily calculate, but unfortunately
1914              * is likely to have many false positives.  We could do better
1915              * in some (but certainly not all) instances if two classes in
1916              * P have known relationships.  For example
1917              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1918              * So
1919              *      :lower: & :print: = :lower:
1920              * And similarly for classes that must be disjoint.  For example,
1921              * since \s and \w can have no elements in common based on rules in
1922              * the POSIX standard,
1923              *      \w & ^\S = nothing
1924              * Unfortunately, some vendor locales do not meet the Posix
1925              * standard, in particular almost everything by Microsoft.
1926              * The loop below just changes e.g., \w into \W and vice versa */
1927
1928             regnode_charclass_posixl temp;
1929             int add = 1;    /* To calculate the index of the complement */
1930
1931             Zero(&temp, 1, regnode_charclass_posixl);
1932             ANYOF_POSIXL_ZERO(&temp);
1933             for (i = 0; i < ANYOF_MAX; i++) {
1934                 assert(i % 2 != 0
1935                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1936                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1937
1938                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1939                     ANYOF_POSIXL_SET(&temp, i + add);
1940                 }
1941                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1942             }
1943             ANYOF_POSIXL_AND(&temp, ssc);
1944
1945         } /* else ssc already has no posixes */
1946     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1947          in its initial state */
1948     else if (! is_ANYOF_SYNTHETIC(and_with)
1949              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1950     {
1951         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1952          * copy it over 'ssc' */
1953         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1954             if (is_ANYOF_SYNTHETIC(and_with)) {
1955                 StructCopy(and_with, ssc, regnode_ssc);
1956             }
1957             else {
1958                 ssc->invlist = anded_cp_list;
1959                 ANYOF_POSIXL_ZERO(ssc);
1960                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1961                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1962                 }
1963             }
1964         }
1965         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1966                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1967         {
1968             /* One or the other of P1, P2 is non-empty. */
1969             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1970                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1971             }
1972             ssc_union(ssc, anded_cp_list, FALSE);
1973         }
1974         else { /* P1 = P2 = empty */
1975             ssc_intersection(ssc, anded_cp_list, FALSE);
1976         }
1977     }
1978 }
1979
1980 STATIC void
1981 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1982                const regnode_charclass *or_with)
1983 {
1984     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1985      * another SSC or a regular ANYOF class.  Can create false positives if
1986      * 'or_with' is to be inverted. */
1987
1988     SV* ored_cp_list;
1989     U8 ored_flags;
1990     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1991                          ? 0
1992                          : ANYOF_FLAGS(or_with);
1993
1994     PERL_ARGS_ASSERT_SSC_OR;
1995
1996     assert(is_ANYOF_SYNTHETIC(ssc));
1997
1998     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1999      * the code point inversion list and just the relevant flags */
2000     if (is_ANYOF_SYNTHETIC(or_with)) {
2001         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2002         ored_flags = or_with_flags;
2003     }
2004     else {
2005         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2006         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2007         if (OP(or_with) != ANYOFD) {
2008             ored_flags
2009             |= or_with_flags
2010              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2011                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2012             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2013                 ored_flags |=
2014                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2015             }
2016         }
2017     }
2018
2019     ANYOF_FLAGS(ssc) |= ored_flags;
2020
2021     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2022      * C2 is the list of code points in 'or-with'; P2, its posix classes.
2023      * 'or_with' may be inverted.  When not inverted, we have the simple
2024      * situation of computing:
2025      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
2026      * If P1|P2 yields a situation with both a class and its complement are
2027      * set, like having both \w and \W, this matches all code points, and we
2028      * can delete these from the P component of the ssc going forward.  XXX We
2029      * might be able to delete all the P components, but I (khw) am not certain
2030      * about this, and it is better to be safe.
2031      *
2032      * Inverted, we have
2033      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
2034      *                         <=  (C1 | P1) | ~C2
2035      *                         <=  (C1 | ~C2) | P1
2036      * (which results in actually simpler code than the non-inverted case)
2037      * */
2038
2039     if ((or_with_flags & ANYOF_INVERT)
2040         && ! is_ANYOF_SYNTHETIC(or_with))
2041     {
2042         /* We ignore P2, leaving P1 going forward */
2043     }   /* else  Not inverted */
2044     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2045         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2046         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2047             unsigned int i;
2048             for (i = 0; i < ANYOF_MAX; i += 2) {
2049                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2050                 {
2051                     ssc_match_all_cp(ssc);
2052                     ANYOF_POSIXL_CLEAR(ssc, i);
2053                     ANYOF_POSIXL_CLEAR(ssc, i+1);
2054                 }
2055             }
2056         }
2057     }
2058
2059     ssc_union(ssc,
2060               ored_cp_list,
2061               FALSE /* Already has been inverted */
2062               );
2063 }
2064
2065 STATIC void
2066 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2067 {
2068     PERL_ARGS_ASSERT_SSC_UNION;
2069
2070     assert(is_ANYOF_SYNTHETIC(ssc));
2071
2072     _invlist_union_maybe_complement_2nd(ssc->invlist,
2073                                         invlist,
2074                                         invert2nd,
2075                                         &ssc->invlist);
2076 }
2077
2078 STATIC void
2079 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2080                          SV* const invlist,
2081                          const bool invert2nd)
2082 {
2083     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2084
2085     assert(is_ANYOF_SYNTHETIC(ssc));
2086
2087     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2088                                                invlist,
2089                                                invert2nd,
2090                                                &ssc->invlist);
2091 }
2092
2093 STATIC void
2094 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2095 {
2096     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2097
2098     assert(is_ANYOF_SYNTHETIC(ssc));
2099
2100     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2101 }
2102
2103 STATIC void
2104 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2105 {
2106     /* AND just the single code point 'cp' into the SSC 'ssc' */
2107
2108     SV* cp_list = _new_invlist(2);
2109
2110     PERL_ARGS_ASSERT_SSC_CP_AND;
2111
2112     assert(is_ANYOF_SYNTHETIC(ssc));
2113
2114     cp_list = add_cp_to_invlist(cp_list, cp);
2115     ssc_intersection(ssc, cp_list,
2116                      FALSE /* Not inverted */
2117                      );
2118     SvREFCNT_dec_NN(cp_list);
2119 }
2120
2121 STATIC void
2122 S_ssc_clear_locale(regnode_ssc *ssc)
2123 {
2124     /* Set the SSC 'ssc' to not match any locale things */
2125     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2126
2127     assert(is_ANYOF_SYNTHETIC(ssc));
2128
2129     ANYOF_POSIXL_ZERO(ssc);
2130     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2131 }
2132
2133 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2134
2135 STATIC bool
2136 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2137 {
2138     /* The synthetic start class is used to hopefully quickly winnow down
2139      * places where a pattern could start a match in the target string.  If it
2140      * doesn't really narrow things down that much, there isn't much point to
2141      * having the overhead of using it.  This function uses some very crude
2142      * heuristics to decide if to use the ssc or not.
2143      *
2144      * It returns TRUE if 'ssc' rules out more than half what it considers to
2145      * be the "likely" possible matches, but of course it doesn't know what the
2146      * actual things being matched are going to be; these are only guesses
2147      *
2148      * For /l matches, it assumes that the only likely matches are going to be
2149      *      in the 0-255 range, uniformly distributed, so half of that is 127
2150      * For /a and /d matches, it assumes that the likely matches will be just
2151      *      the ASCII range, so half of that is 63
2152      * For /u and there isn't anything matching above the Latin1 range, it
2153      *      assumes that that is the only range likely to be matched, and uses
2154      *      half that as the cut-off: 127.  If anything matches above Latin1,
2155      *      it assumes that all of Unicode could match (uniformly), except for
2156      *      non-Unicode code points and things in the General Category "Other"
2157      *      (unassigned, private use, surrogates, controls and formats).  This
2158      *      is a much large number. */
2159
2160     U32 count = 0;      /* Running total of number of code points matched by
2161                            'ssc' */
2162     UV start, end;      /* Start and end points of current range in inversion
2163                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2164     const U32 max_code_points = (LOC)
2165                                 ?  256
2166                                 : ((  ! UNI_SEMANTICS
2167                                     ||  invlist_highest(ssc->invlist) < 256)
2168                                   ? 128
2169                                   : NON_OTHER_COUNT);
2170     const U32 max_match = max_code_points / 2;
2171
2172     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2173
2174     invlist_iterinit(ssc->invlist);
2175     while (invlist_iternext(ssc->invlist, &start, &end)) {
2176         if (start >= max_code_points) {
2177             break;
2178         }
2179         end = MIN(end, max_code_points - 1);
2180         count += end - start + 1;
2181         if (count >= max_match) {
2182             invlist_iterfinish(ssc->invlist);
2183             return FALSE;
2184         }
2185     }
2186
2187     return TRUE;
2188 }
2189
2190
2191 STATIC void
2192 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2193 {
2194     /* The inversion list in the SSC is marked mortal; now we need a more
2195      * permanent copy, which is stored the same way that is done in a regular
2196      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2197      * map */
2198
2199     SV* invlist = invlist_clone(ssc->invlist, NULL);
2200
2201     PERL_ARGS_ASSERT_SSC_FINALIZE;
2202
2203     assert(is_ANYOF_SYNTHETIC(ssc));
2204
2205     /* The code in this file assumes that all but these flags aren't relevant
2206      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2207      * by the time we reach here */
2208     assert(! (ANYOF_FLAGS(ssc)
2209         & ~( ANYOF_COMMON_FLAGS
2210             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2211             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2212
2213     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2214
2215     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2216     SvREFCNT_dec(invlist);
2217
2218     /* Make sure is clone-safe */
2219     ssc->invlist = NULL;
2220
2221     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2222         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2223         OP(ssc) = ANYOFPOSIXL;
2224     }
2225     else if (RExC_contains_locale) {
2226         OP(ssc) = ANYOFL;
2227     }
2228
2229     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2230 }
2231
2232 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2233 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2234 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2235 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2236                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2237                                : 0 )
2238
2239
2240 #ifdef DEBUGGING
2241 /*
2242    dump_trie(trie,widecharmap,revcharmap)
2243    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2244    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2245
2246    These routines dump out a trie in a somewhat readable format.
2247    The _interim_ variants are used for debugging the interim
2248    tables that are used to generate the final compressed
2249    representation which is what dump_trie expects.
2250
2251    Part of the reason for their existence is to provide a form
2252    of documentation as to how the different representations function.
2253
2254 */
2255
2256 /*
2257   Dumps the final compressed table form of the trie to Perl_debug_log.
2258   Used for debugging make_trie().
2259 */
2260
2261 STATIC void
2262 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2263             AV *revcharmap, U32 depth)
2264 {
2265     U32 state;
2266     SV *sv=sv_newmortal();
2267     int colwidth= widecharmap ? 6 : 4;
2268     U16 word;
2269     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2270
2271     PERL_ARGS_ASSERT_DUMP_TRIE;
2272
2273     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2274         depth+1, "Match","Base","Ofs" );
2275
2276     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2277         SV ** const tmp = av_fetch( revcharmap, state, 0);
2278         if ( tmp ) {
2279             Perl_re_printf( aTHX_  "%*s",
2280                 colwidth,
2281                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2282                             PL_colors[0], PL_colors[1],
2283                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2284                             PERL_PV_ESCAPE_FIRSTCHAR
2285                 )
2286             );
2287         }
2288     }
2289     Perl_re_printf( aTHX_  "\n");
2290     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2291
2292     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2293         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2294     Perl_re_printf( aTHX_  "\n");
2295
2296     for( state = 1 ; state < trie->statecount ; state++ ) {
2297         const U32 base = trie->states[ state ].trans.base;
2298
2299         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2300
2301         if ( trie->states[ state ].wordnum ) {
2302             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2303         } else {
2304             Perl_re_printf( aTHX_  "%6s", "" );
2305         }
2306
2307         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2308
2309         if ( base ) {
2310             U32 ofs = 0;
2311
2312             while( ( base + ofs  < trie->uniquecharcount ) ||
2313                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2314                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2315                                                                     != state))
2316                     ofs++;
2317
2318             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2319
2320             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2321                 if ( ( base + ofs >= trie->uniquecharcount )
2322                         && ( base + ofs - trie->uniquecharcount
2323                                                         < trie->lasttrans )
2324                         && trie->trans[ base + ofs
2325                                     - trie->uniquecharcount ].check == state )
2326                 {
2327                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2328                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2329                    );
2330                 } else {
2331                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2332                 }
2333             }
2334
2335             Perl_re_printf( aTHX_  "]");
2336
2337         }
2338         Perl_re_printf( aTHX_  "\n" );
2339     }
2340     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2341                                 depth);
2342     for (word=1; word <= trie->wordcount; word++) {
2343         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2344             (int)word, (int)(trie->wordinfo[word].prev),
2345             (int)(trie->wordinfo[word].len));
2346     }
2347     Perl_re_printf( aTHX_  "\n" );
2348 }
2349 /*
2350   Dumps a fully constructed but uncompressed trie in list form.
2351   List tries normally only are used for construction when the number of
2352   possible chars (trie->uniquecharcount) is very high.
2353   Used for debugging make_trie().
2354 */
2355 STATIC void
2356 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2357                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2358                          U32 depth)
2359 {
2360     U32 state;
2361     SV *sv=sv_newmortal();
2362     int colwidth= widecharmap ? 6 : 4;
2363     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2364
2365     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2366
2367     /* print out the table precompression.  */
2368     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2369             depth+1 );
2370     Perl_re_indentf( aTHX_  "%s",
2371             depth+1, "------:-----+-----------------\n" );
2372
2373     for( state=1 ; state < next_alloc ; state ++ ) {
2374         U16 charid;
2375
2376         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2377             depth+1, (UV)state  );
2378         if ( ! trie->states[ state ].wordnum ) {
2379             Perl_re_printf( aTHX_  "%5s| ","");
2380         } else {
2381             Perl_re_printf( aTHX_  "W%4x| ",
2382                 trie->states[ state ].wordnum
2383             );
2384         }
2385         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2386             SV ** const tmp = av_fetch( revcharmap,
2387                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2388             if ( tmp ) {
2389                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2390                     colwidth,
2391                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2392                               colwidth,
2393                               PL_colors[0], PL_colors[1],
2394                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2395                               | PERL_PV_ESCAPE_FIRSTCHAR
2396                     ) ,
2397                     TRIE_LIST_ITEM(state, charid).forid,
2398                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2399                 );
2400                 if (!(charid % 10))
2401                     Perl_re_printf( aTHX_  "\n%*s| ",
2402                         (int)((depth * 2) + 14), "");
2403             }
2404         }
2405         Perl_re_printf( aTHX_  "\n");
2406     }
2407 }
2408
2409 /*
2410   Dumps a fully constructed but uncompressed trie in table form.
2411   This is the normal DFA style state transition table, with a few
2412   twists to facilitate compression later.
2413   Used for debugging make_trie().
2414 */
2415 STATIC void
2416 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2417                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2418                           U32 depth)
2419 {
2420     U32 state;
2421     U16 charid;
2422     SV *sv=sv_newmortal();
2423     int colwidth= widecharmap ? 6 : 4;
2424     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2425
2426     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2427
2428     /*
2429        print out the table precompression so that we can do a visual check
2430        that they are identical.
2431      */
2432
2433     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2434
2435     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2436         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2437         if ( tmp ) {
2438             Perl_re_printf( aTHX_  "%*s",
2439                 colwidth,
2440                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2441                             PL_colors[0], PL_colors[1],
2442                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2443                             PERL_PV_ESCAPE_FIRSTCHAR
2444                 )
2445             );
2446         }
2447     }
2448
2449     Perl_re_printf( aTHX_ "\n");
2450     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2451
2452     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2453         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2454     }
2455
2456     Perl_re_printf( aTHX_  "\n" );
2457
2458     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2459
2460         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2461             depth+1,
2462             (UV)TRIE_NODENUM( state ) );
2463
2464         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2465             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2466             if (v)
2467                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2468             else
2469                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2470         }
2471         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2472             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2473                                             (UV)trie->trans[ state ].check );
2474         } else {
2475             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2476                                             (UV)trie->trans[ state ].check,
2477             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2478         }
2479     }
2480 }
2481
2482 #endif
2483
2484
2485 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2486   startbranch: the first branch in the whole branch sequence
2487   first      : start branch of sequence of branch-exact nodes.
2488                May be the same as startbranch
2489   last       : Thing following the last branch.
2490                May be the same as tail.
2491   tail       : item following the branch sequence
2492   count      : words in the sequence
2493   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2494   depth      : indent depth
2495
2496 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2497
2498 A trie is an N'ary tree where the branches are determined by digital
2499 decomposition of the key. IE, at the root node you look up the 1st character and
2500 follow that branch repeat until you find the end of the branches. Nodes can be
2501 marked as "accepting" meaning they represent a complete word. Eg:
2502
2503   /he|she|his|hers/
2504
2505 would convert into the following structure. Numbers represent states, letters
2506 following numbers represent valid transitions on the letter from that state, if
2507 the number is in square brackets it represents an accepting state, otherwise it
2508 will be in parenthesis.
2509
2510       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2511       |    |
2512       |   (2)
2513       |    |
2514      (1)   +-i->(6)-+-s->[7]
2515       |
2516       +-s->(3)-+-h->(4)-+-e->[5]
2517
2518       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2519
2520 This shows that when matching against the string 'hers' we will begin at state 1
2521 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2522 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2523 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2524 single traverse. We store a mapping from accepting to state to which word was
2525 matched, and then when we have multiple possibilities we try to complete the
2526 rest of the regex in the order in which they occurred in the alternation.
2527
2528 The only prior NFA like behaviour that would be changed by the TRIE support is
2529 the silent ignoring of duplicate alternations which are of the form:
2530
2531  / (DUPE|DUPE) X? (?{ ... }) Y /x
2532
2533 Thus EVAL blocks following a trie may be called a different number of times with
2534 and without the optimisation. With the optimisations dupes will be silently
2535 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2536 the following demonstrates:
2537
2538  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2539
2540 which prints out 'word' three times, but
2541
2542  'words'=~/(word|word|word)(?{ print $1 })S/
2543
2544 which doesnt print it out at all. This is due to other optimisations kicking in.
2545
2546 Example of what happens on a structural level:
2547
2548 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2549
2550    1: CURLYM[1] {1,32767}(18)
2551    5:   BRANCH(8)
2552    6:     EXACT <ac>(16)
2553    8:   BRANCH(11)
2554    9:     EXACT <ad>(16)
2555   11:   BRANCH(14)
2556   12:     EXACT <ab>(16)
2557   16:   SUCCEED(0)
2558   17:   NOTHING(18)
2559   18: END(0)
2560
2561 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2562 and should turn into:
2563
2564    1: CURLYM[1] {1,32767}(18)
2565    5:   TRIE(16)
2566         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2567           <ac>
2568           <ad>
2569           <ab>
2570   16:   SUCCEED(0)
2571   17:   NOTHING(18)
2572   18: END(0)
2573
2574 Cases where tail != last would be like /(?foo|bar)baz/:
2575
2576    1: BRANCH(4)
2577    2:   EXACT <foo>(8)
2578    4: BRANCH(7)
2579    5:   EXACT <bar>(8)
2580    7: TAIL(8)
2581    8: EXACT <baz>(10)
2582   10: END(0)
2583
2584 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2585 and would end up looking like:
2586
2587     1: TRIE(8)
2588       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2589         <foo>
2590         <bar>
2591    7: TAIL(8)
2592    8: EXACT <baz>(10)
2593   10: END(0)
2594
2595     d = uvchr_to_utf8_flags(d, uv, 0);
2596
2597 is the recommended Unicode-aware way of saying
2598
2599     *(d++) = uv;
2600 */
2601
2602 #define TRIE_STORE_REVCHAR(val)                                            \
2603     STMT_START {                                                           \
2604         if (UTF) {                                                         \
2605             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2606             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2607             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2608             *kapow = '\0';                                                 \
2609             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2610             SvPOK_on(zlopp);                                               \
2611             SvUTF8_on(zlopp);                                              \
2612             av_push(revcharmap, zlopp);                                    \
2613         } else {                                                           \
2614             char ooooff = (char)val;                                           \
2615             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2616         }                                                                  \
2617         } STMT_END
2618
2619 /* This gets the next character from the input, folding it if not already
2620  * folded. */
2621 #define TRIE_READ_CHAR STMT_START {                                           \
2622     wordlen++;                                                                \
2623     if ( UTF ) {                                                              \
2624         /* if it is UTF then it is either already folded, or does not need    \
2625          * folding */                                                         \
2626         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2627     }                                                                         \
2628     else if (folder == PL_fold_latin1) {                                      \
2629         /* This folder implies Unicode rules, which in the range expressible  \
2630          *  by not UTF is the lower case, with the two exceptions, one of     \
2631          *  which should have been taken care of before calling this */       \
2632         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2633         uvc = toLOWER_L1(*uc);                                                \
2634         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2635         len = 1;                                                              \
2636     } else {                                                                  \
2637         /* raw data, will be folded later if needed */                        \
2638         uvc = (U32)*uc;                                                       \
2639         len = 1;                                                              \
2640     }                                                                         \
2641 } STMT_END
2642
2643
2644
2645 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2646     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2647         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2648         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2649         TRIE_LIST_LEN( state ) = ging;                          \
2650     }                                                           \
2651     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2652     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2653     TRIE_LIST_CUR( state )++;                                   \
2654 } STMT_END
2655
2656 #define TRIE_LIST_NEW(state) STMT_START {                       \
2657     Newx( trie->states[ state ].trans.list,                     \
2658         4, reg_trie_trans_le );                                 \
2659      TRIE_LIST_CUR( state ) = 1;                                \
2660      TRIE_LIST_LEN( state ) = 4;                                \
2661 } STMT_END
2662
2663 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2664     U16 dupe= trie->states[ state ].wordnum;                    \
2665     regnode * const noper_next = regnext( noper );              \
2666                                                                 \
2667     DEBUG_r({                                                   \
2668         /* store the word for dumping */                        \
2669         SV* tmp;                                                \
2670         if (OP(noper) != NOTHING)                               \
2671             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2672         else                                                    \
2673             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2674         av_push( trie_words, tmp );                             \
2675     });                                                         \
2676                                                                 \
2677     curword++;                                                  \
2678     trie->wordinfo[curword].prev   = 0;                         \
2679     trie->wordinfo[curword].len    = wordlen;                   \
2680     trie->wordinfo[curword].accept = state;                     \
2681                                                                 \
2682     if ( noper_next < tail ) {                                  \
2683         if (!trie->jump)                                        \
2684             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2685                                                  sizeof(U16) ); \
2686         trie->jump[curword] = (U16)(noper_next - convert);      \
2687         if (!jumper)                                            \
2688             jumper = noper_next;                                \
2689         if (!nextbranch)                                        \
2690             nextbranch= regnext(cur);                           \
2691     }                                                           \
2692                                                                 \
2693     if ( dupe ) {                                               \
2694         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2695         /* chain, so that when the bits of chain are later    */\
2696         /* linked together, the dups appear in the chain      */\
2697         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2698         trie->wordinfo[dupe].prev = curword;                    \
2699     } else {                                                    \
2700         /* we haven't inserted this word yet.                */ \
2701         trie->states[ state ].wordnum = curword;                \
2702     }                                                           \
2703 } STMT_END
2704
2705
2706 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2707      ( ( base + charid >=  ucharcount                                   \
2708          && base + charid < ubound                                      \
2709          && state == trie->trans[ base - ucharcount + charid ].check    \
2710          && trie->trans[ base - ucharcount + charid ].next )            \
2711            ? trie->trans[ base - ucharcount + charid ].next             \
2712            : ( state==1 ? special : 0 )                                 \
2713       )
2714
2715 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2716 STMT_START {                                                \
2717     TRIE_BITMAP_SET(trie, uvc);                             \
2718     /* store the folded codepoint */                        \
2719     if ( folder )                                           \
2720         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2721                                                             \
2722     if ( !UTF ) {                                           \
2723         /* store first byte of utf8 representation of */    \
2724         /* variant codepoints */                            \
2725         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2726             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2727         }                                                   \
2728     }                                                       \
2729 } STMT_END
2730 #define MADE_TRIE       1
2731 #define MADE_JUMP_TRIE  2
2732 #define MADE_EXACT_TRIE 4
2733
2734 STATIC I32
2735 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2736                   regnode *first, regnode *last, regnode *tail,
2737                   U32 word_count, U32 flags, U32 depth)
2738 {
2739     /* first pass, loop through and scan words */
2740     reg_trie_data *trie;
2741     HV *widecharmap = NULL;
2742     AV *revcharmap = newAV();
2743     regnode *cur;
2744     STRLEN len = 0;
2745     UV uvc = 0;
2746     U16 curword = 0;
2747     U32 next_alloc = 0;
2748     regnode *jumper = NULL;
2749     regnode *nextbranch = NULL;
2750     regnode *convert = NULL;
2751     U32 *prev_states; /* temp array mapping each state to previous one */
2752     /* we just use folder as a flag in utf8 */
2753     const U8 * folder = NULL;
2754
2755     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2756      * which stands for one trie structure, one hash, optionally followed
2757      * by two arrays */
2758 #ifdef DEBUGGING
2759     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2760     AV *trie_words = NULL;
2761     /* along with revcharmap, this only used during construction but both are
2762      * useful during debugging so we store them in the struct when debugging.
2763      */
2764 #else
2765     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2766     STRLEN trie_charcount=0;
2767 #endif
2768     SV *re_trie_maxbuff;
2769     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2770
2771     PERL_ARGS_ASSERT_MAKE_TRIE;
2772 #ifndef DEBUGGING
2773     PERL_UNUSED_ARG(depth);
2774 #endif
2775
2776     switch (flags) {
2777         case EXACT: case EXACT_REQ8: case EXACTL: break;
2778         case EXACTFAA:
2779         case EXACTFUP:
2780         case EXACTFU:
2781         case EXACTFLU8: folder = PL_fold_latin1; break;
2782         case EXACTF:  folder = PL_fold; break;
2783         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2784     }
2785
2786     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2787     trie->refcount = 1;
2788     trie->startstate = 1;
2789     trie->wordcount = word_count;
2790     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2791     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2792     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2793         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2794     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2795                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2796
2797     DEBUG_r({
2798         trie_words = newAV();
2799     });
2800
2801     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2802     assert(re_trie_maxbuff);
2803     if (!SvIOK(re_trie_maxbuff)) {
2804         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2805     }
2806     DEBUG_TRIE_COMPILE_r({
2807         Perl_re_indentf( aTHX_
2808           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2809           depth+1,
2810           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2811           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2812     });
2813
2814    /* Find the node we are going to overwrite */
2815     if ( first == startbranch && OP( last ) != BRANCH ) {
2816         /* whole branch chain */
2817         convert = first;
2818     } else {
2819         /* branch sub-chain */
2820         convert = NEXTOPER( first );
2821     }
2822
2823     /*  -- First loop and Setup --
2824
2825        We first traverse the branches and scan each word to determine if it
2826        contains widechars, and how many unique chars there are, this is
2827        important as we have to build a table with at least as many columns as we
2828        have unique chars.
2829
2830        We use an array of integers to represent the character codes 0..255
2831        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2832        the native representation of the character value as the key and IV's for
2833        the coded index.
2834
2835        *TODO* If we keep track of how many times each character is used we can
2836        remap the columns so that the table compression later on is more
2837        efficient in terms of memory by ensuring the most common value is in the
2838        middle and the least common are on the outside.  IMO this would be better
2839        than a most to least common mapping as theres a decent chance the most
2840        common letter will share a node with the least common, meaning the node
2841        will not be compressible. With a middle is most common approach the worst
2842        case is when we have the least common nodes twice.
2843
2844      */
2845
2846     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2847         regnode *noper = NEXTOPER( cur );
2848         const U8 *uc;
2849         const U8 *e;
2850         int foldlen = 0;
2851         U32 wordlen      = 0;         /* required init */
2852         STRLEN minchars = 0;
2853         STRLEN maxchars = 0;
2854         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2855                                                bitmap?*/
2856
2857         if (OP(noper) == NOTHING) {
2858             /* skip past a NOTHING at the start of an alternation
2859              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2860              *
2861              * If the next node is not something we are supposed to process
2862              * we will just ignore it due to the condition guarding the
2863              * next block.
2864              */
2865
2866             regnode *noper_next= regnext(noper);
2867             if (noper_next < tail)
2868                 noper= noper_next;
2869         }
2870
2871         if (    noper < tail
2872             && (    OP(noper) == flags
2873                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2874                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2875                                          || OP(noper) == EXACTFUP))))
2876         {
2877             uc= (U8*)STRING(noper);
2878             e= uc + STR_LEN(noper);
2879         } else {
2880             trie->minlen= 0;
2881             continue;
2882         }
2883
2884
2885         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2886             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2887                                           regardless of encoding */
2888             if (OP( noper ) == EXACTFUP) {
2889                 /* false positives are ok, so just set this */
2890                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2891             }
2892         }
2893
2894         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2895                                            branch */
2896             TRIE_CHARCOUNT(trie)++;
2897             TRIE_READ_CHAR;
2898
2899             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2900              * is in effect.  Under /i, this character can match itself, or
2901              * anything that folds to it.  If not under /i, it can match just
2902              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2903              * all fold to k, and all are single characters.   But some folds
2904              * expand to more than one character, so for example LATIN SMALL
2905              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2906              * the string beginning at 'uc' is 'ffi', it could be matched by
2907              * three characters, or just by the one ligature character. (It
2908              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2909              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2910              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2911              * match.)  The trie needs to know the minimum and maximum number
2912              * of characters that could match so that it can use size alone to
2913              * quickly reject many match attempts.  The max is simple: it is
2914              * the number of folded characters in this branch (since a fold is
2915              * never shorter than what folds to it. */
2916
2917             maxchars++;
2918
2919             /* And the min is equal to the max if not under /i (indicated by
2920              * 'folder' being NULL), or there are no multi-character folds.  If
2921              * there is a multi-character fold, the min is incremented just
2922              * once, for the character that folds to the sequence.  Each
2923              * character in the sequence needs to be added to the list below of
2924              * characters in the trie, but we count only the first towards the
2925              * min number of characters needed.  This is done through the
2926              * variable 'foldlen', which is returned by the macros that look
2927              * for these sequences as the number of bytes the sequence
2928              * occupies.  Each time through the loop, we decrement 'foldlen' by
2929              * how many bytes the current char occupies.  Only when it reaches
2930              * 0 do we increment 'minchars' or look for another multi-character
2931              * sequence. */
2932             if (folder == NULL) {
2933                 minchars++;
2934             }
2935             else if (foldlen > 0) {
2936                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2937             }
2938             else {
2939                 minchars++;
2940
2941                 /* See if *uc is the beginning of a multi-character fold.  If
2942                  * so, we decrement the length remaining to look at, to account
2943                  * for the current character this iteration.  (We can use 'uc'
2944                  * instead of the fold returned by TRIE_READ_CHAR because for
2945                  * non-UTF, the latin1_safe macro is smart enough to account
2946                  * for all the unfolded characters, and because for UTF, the
2947                  * string will already have been folded earlier in the
2948                  * compilation process */
2949                 if (UTF) {
2950                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2951                         foldlen -= UTF8SKIP(uc);
2952                     }
2953                 }
2954                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2955                     foldlen--;
2956                 }
2957             }
2958
2959             /* The current character (and any potential folds) should be added
2960              * to the possible matching characters for this position in this
2961              * branch */
2962             if ( uvc < 256 ) {
2963                 if ( folder ) {
2964                     U8 folded= folder[ (U8) uvc ];
2965                     if ( !trie->charmap[ folded ] ) {
2966                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2967                         TRIE_STORE_REVCHAR( folded );
2968                     }
2969                 }
2970                 if ( !trie->charmap[ uvc ] ) {
2971                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2972                     TRIE_STORE_REVCHAR( uvc );
2973                 }
2974                 if ( set_bit ) {
2975                     /* store the codepoint in the bitmap, and its folded
2976                      * equivalent. */
2977                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2978                     set_bit = 0; /* We've done our bit :-) */
2979                 }
2980             } else {
2981
2982                 /* XXX We could come up with the list of code points that fold
2983                  * to this using PL_utf8_foldclosures, except not for
2984                  * multi-char folds, as there may be multiple combinations
2985                  * there that could work, which needs to wait until runtime to
2986                  * resolve (The comment about LIGATURE FFI above is such an
2987                  * example */
2988
2989                 SV** svpp;
2990                 if ( !widecharmap )
2991                     widecharmap = newHV();
2992
2993                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2994
2995                 if ( !svpp )
2996                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2997
2998                 if ( !SvTRUE( *svpp ) ) {
2999                     sv_setiv( *svpp, ++trie->uniquecharcount );
3000                     TRIE_STORE_REVCHAR(uvc);
3001                 }
3002             }
3003         } /* end loop through characters in this branch of the trie */
3004
3005         /* We take the min and max for this branch and combine to find the min
3006          * and max for all branches processed so far */
3007         if( cur == first ) {
3008             trie->minlen = minchars;
3009             trie->maxlen = maxchars;
3010         } else if (minchars < trie->minlen) {
3011             trie->minlen = minchars;
3012         } else if (maxchars > trie->maxlen) {
3013             trie->maxlen = maxchars;
3014         }
3015     } /* end first pass */
3016     DEBUG_TRIE_COMPILE_r(
3017         Perl_re_indentf( aTHX_
3018                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3019                 depth+1,
3020                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3021                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3022                 (int)trie->minlen, (int)trie->maxlen )
3023     );
3024
3025     /*
3026         We now know what we are dealing with in terms of unique chars and
3027         string sizes so we can calculate how much memory a naive
3028         representation using a flat table  will take. If it's over a reasonable
3029         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3030         conservative but potentially much slower representation using an array
3031         of lists.
3032
3033         At the end we convert both representations into the same compressed
3034         form that will be used in regexec.c for matching with. The latter
3035         is a form that cannot be used to construct with but has memory
3036         properties similar to the list form and access properties similar
3037         to the table form making it both suitable for fast searches and
3038         small enough that its feasable to store for the duration of a program.
3039
3040         See the comment in the code where the compressed table is produced
3041         inplace from the flat tabe representation for an explanation of how
3042         the compression works.
3043
3044     */
3045
3046
3047     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3048     prev_states[1] = 0;
3049
3050     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3051                                                     > SvIV(re_trie_maxbuff) )
3052     {
3053         /*
3054             Second Pass -- Array Of Lists Representation
3055
3056             Each state will be represented by a list of charid:state records
3057             (reg_trie_trans_le) the first such element holds the CUR and LEN
3058             points of the allocated array. (See defines above).
3059
3060             We build the initial structure using the lists, and then convert
3061             it into the compressed table form which allows faster lookups
3062             (but cant be modified once converted).
3063         */
3064
3065         STRLEN transcount = 1;
3066
3067         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3068             depth+1));
3069
3070         trie->states = (reg_trie_state *)
3071             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3072                                   sizeof(reg_trie_state) );
3073         TRIE_LIST_NEW(1);
3074         next_alloc = 2;
3075
3076         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3077
3078             regnode *noper   = NEXTOPER( cur );
3079             U32 state        = 1;         /* required init */
3080             U16 charid       = 0;         /* sanity init */
3081             U32 wordlen      = 0;         /* required init */
3082
3083             if (OP(noper) == NOTHING) {
3084                 regnode *noper_next= regnext(noper);
3085                 if (noper_next < tail)
3086                     noper= noper_next;
3087                 /* we will undo this assignment if noper does not
3088                  * point at a trieable type in the else clause of
3089                  * the following statement. */
3090             }
3091
3092             if (    noper < tail
3093                 && (    OP(noper) == flags
3094                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3095                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3096                                              || OP(noper) == EXACTFUP))))
3097             {
3098                 const U8 *uc= (U8*)STRING(noper);
3099                 const U8 *e= uc + STR_LEN(noper);
3100
3101                 for ( ; uc < e ; uc += len ) {
3102
3103                     TRIE_READ_CHAR;
3104
3105                     if ( uvc < 256 ) {
3106                         charid = trie->charmap[ uvc ];
3107                     } else {
3108                         SV** const svpp = hv_fetch( widecharmap,
3109                                                     (char*)&uvc,
3110                                                     sizeof( UV ),
3111                                                     0);
3112                         if ( !svpp ) {
3113                             charid = 0;
3114                         } else {
3115                             charid=(U16)SvIV( *svpp );
3116                         }
3117                     }
3118                     /* charid is now 0 if we dont know the char read, or
3119                      * nonzero if we do */
3120                     if ( charid ) {
3121
3122                         U16 check;
3123                         U32 newstate = 0;
3124
3125                         charid--;
3126                         if ( !trie->states[ state ].trans.list ) {
3127                             TRIE_LIST_NEW( state );
3128                         }
3129                         for ( check = 1;
3130                               check <= TRIE_LIST_USED( state );
3131                               check++ )
3132                         {
3133                             if ( TRIE_LIST_ITEM( state, check ).forid
3134                                                                     == charid )
3135                             {
3136                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3137                                 break;
3138                             }
3139                         }
3140                         if ( ! newstate ) {
3141                             newstate = next_alloc++;
3142                             prev_states[newstate] = state;
3143                             TRIE_LIST_PUSH( state, charid, newstate );
3144                             transcount++;
3145                         }
3146                         state = newstate;
3147                     } else {
3148                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3149                     }
3150                 }
3151             } else {
3152                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3153                  * on a trieable type. So we need to reset noper back to point at the first regop
3154                  * in the branch before we call TRIE_HANDLE_WORD()
3155                 */
3156                 noper= NEXTOPER(cur);
3157             }
3158             TRIE_HANDLE_WORD(state);
3159
3160         } /* end second pass */
3161
3162         /* next alloc is the NEXT state to be allocated */
3163         trie->statecount = next_alloc;
3164         trie->states = (reg_trie_state *)
3165             PerlMemShared_realloc( trie->states,
3166                                    next_alloc
3167                                    * sizeof(reg_trie_state) );
3168
3169         /* and now dump it out before we compress it */
3170         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3171                                                          revcharmap, next_alloc,
3172                                                          depth+1)
3173         );
3174
3175         trie->trans = (reg_trie_trans *)
3176             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3177         {
3178             U32 state;
3179             U32 tp = 0;
3180             U32 zp = 0;
3181
3182
3183             for( state=1 ; state < next_alloc ; state ++ ) {
3184                 U32 base=0;
3185
3186                 /*
3187                 DEBUG_TRIE_COMPILE_MORE_r(
3188                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3189                 );
3190                 */
3191
3192                 if (trie->states[state].trans.list) {
3193                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3194                     U16 maxid=minid;
3195                     U16 idx;
3196
3197                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3198                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3199                         if ( forid < minid ) {
3200                             minid=forid;
3201                         } else if ( forid > maxid ) {
3202                             maxid=forid;
3203                         }
3204                     }
3205                     if ( transcount < tp + maxid - minid + 1) {
3206                         transcount *= 2;
3207                         trie->trans = (reg_trie_trans *)
3208                             PerlMemShared_realloc( trie->trans,
3209                                                      transcount
3210                                                      * sizeof(reg_trie_trans) );
3211                         Zero( trie->trans + (transcount / 2),
3212                               transcount / 2,
3213                               reg_trie_trans );
3214                     }
3215                     base = trie->uniquecharcount + tp - minid;
3216                     if ( maxid == minid ) {
3217                         U32 set = 0;
3218                         for ( ; zp < tp ; zp++ ) {
3219                             if ( ! trie->trans[ zp ].next ) {
3220                                 base = trie->uniquecharcount + zp - minid;
3221                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3222                                                                    1).newstate;
3223                                 trie->trans[ zp ].check = state;
3224                                 set = 1;
3225                                 break;
3226                             }
3227                         }
3228                         if ( !set ) {
3229                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3230                                                                    1).newstate;
3231                             trie->trans[ tp ].check = state;
3232                             tp++;
3233                             zp = tp;
3234                         }
3235                     } else {
3236                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3237                             const U32 tid = base
3238                                            - trie->uniquecharcount
3239                                            + TRIE_LIST_ITEM( state, idx ).forid;
3240                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3241                                                                 idx ).newstate;
3242                             trie->trans[ tid ].check = state;
3243                         }
3244                         tp += ( maxid - minid + 1 );
3245                     }
3246                     Safefree(trie->states[ state ].trans.list);
3247                 }
3248                 /*
3249                 DEBUG_TRIE_COMPILE_MORE_r(
3250                     Perl_re_printf( aTHX_  " base: %d\n",base);
3251                 );
3252                 */
3253                 trie->states[ state ].trans.base=base;
3254             }
3255             trie->lasttrans = tp + 1;
3256         }
3257     } else {
3258         /*
3259            Second Pass -- Flat Table Representation.
3260
3261            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3262            each.  We know that we will need Charcount+1 trans at most to store
3263            the data (one row per char at worst case) So we preallocate both
3264            structures assuming worst case.
3265
3266            We then construct the trie using only the .next slots of the entry
3267            structs.
3268
3269            We use the .check field of the first entry of the node temporarily
3270            to make compression both faster and easier by keeping track of how
3271            many non zero fields are in the node.
3272
3273            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3274            transition.
3275
3276            There are two terms at use here: state as a TRIE_NODEIDX() which is
3277            a number representing the first entry of the node, and state as a
3278            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3279            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3280            if there are 2 entrys per node. eg:
3281
3282              A B       A B
3283           1. 2 4    1. 3 7
3284           2. 0 3    3. 0 5
3285           3. 0 0    5. 0 0
3286           4. 0 0    7. 0 0
3287
3288            The table is internally in the right hand, idx form. However as we
3289            also have to deal with the states array which is indexed by nodenum
3290            we have to use TRIE_NODENUM() to convert.
3291
3292         */
3293         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3294             depth+1));
3295
3296         trie->trans = (reg_trie_trans *)
3297             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3298                                   * trie->uniquecharcount + 1,
3299                                   sizeof(reg_trie_trans) );
3300         trie->states = (reg_trie_state *)
3301             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3302                                   sizeof(reg_trie_state) );
3303         next_alloc = trie->uniquecharcount + 1;
3304
3305
3306         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3307
3308             regnode *noper   = NEXTOPER( cur );
3309
3310             U32 state        = 1;         /* required init */
3311
3312             U16 charid       = 0;         /* sanity init */
3313             U32 accept_state = 0;         /* sanity init */
3314
3315             U32 wordlen      = 0;         /* required init */
3316
3317             if (OP(noper) == NOTHING) {
3318                 regnode *noper_next= regnext(noper);
3319                 if (noper_next < tail)
3320                     noper= noper_next;
3321                 /* we will undo this assignment if noper does not
3322                  * point at a trieable type in the else clause of
3323                  * the following statement. */
3324             }
3325
3326             if (    noper < tail
3327                 && (    OP(noper) == flags
3328                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3329                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3330                                              || OP(noper) == EXACTFUP))))
3331             {
3332                 const U8 *uc= (U8*)STRING(noper);
3333                 const U8 *e= uc + STR_LEN(noper);
3334
3335                 for ( ; uc < e ; uc += len ) {
3336
3337                     TRIE_READ_CHAR;
3338
3339                     if ( uvc < 256 ) {
3340                         charid = trie->charmap[ uvc ];
3341                     } else {
3342                         SV* const * const svpp = hv_fetch( widecharmap,
3343                                                            (char*)&uvc,
3344                                                            sizeof( UV ),
3345                                                            0);
3346                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3347                     }
3348                     if ( charid ) {
3349                         charid--;
3350                         if ( !trie->trans[ state + charid ].next ) {
3351                             trie->trans[ state + charid ].next = next_alloc;
3352                             trie->trans[ state ].check++;
3353                             prev_states[TRIE_NODENUM(next_alloc)]
3354                                     = TRIE_NODENUM(state);
3355                             next_alloc += trie->uniquecharcount;
3356                         }
3357                         state = trie->trans[ state + charid ].next;
3358                     } else {
3359                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3360                     }
3361                     /* charid is now 0 if we dont know the char read, or
3362                      * nonzero if we do */
3363                 }
3364             } else {
3365                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3366                  * on a trieable type. So we need to reset noper back to point at the first regop
3367                  * in the branch before we call TRIE_HANDLE_WORD().
3368                 */
3369                 noper= NEXTOPER(cur);
3370             }
3371             accept_state = TRIE_NODENUM( state );
3372             TRIE_HANDLE_WORD(accept_state);
3373
3374         } /* end second pass */
3375
3376         /* and now dump it out before we compress it */
3377         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3378                                                           revcharmap,
3379                                                           next_alloc, depth+1));
3380
3381         {
3382         /*
3383            * Inplace compress the table.*
3384
3385            For sparse data sets the table constructed by the trie algorithm will
3386            be mostly 0/FAIL transitions or to put it another way mostly empty.
3387            (Note that leaf nodes will not contain any transitions.)
3388
3389            This algorithm compresses the tables by eliminating most such
3390            transitions, at the cost of a modest bit of extra work during lookup:
3391
3392            - Each states[] entry contains a .base field which indicates the
3393            index in the state[] array wheres its transition data is stored.
3394
3395            - If .base is 0 there are no valid transitions from that node.
3396
3397            - If .base is nonzero then charid is added to it to find an entry in
3398            the trans array.
3399
3400            -If trans[states[state].base+charid].check!=state then the
3401            transition is taken to be a 0/Fail transition. Thus if there are fail
3402            transitions at the front of the node then the .base offset will point
3403            somewhere inside the previous nodes data (or maybe even into a node
3404            even earlier), but the .check field determines if the transition is
3405            valid.
3406
3407            XXX - wrong maybe?
3408            The following process inplace converts the table to the compressed
3409            table: We first do not compress the root node 1,and mark all its
3410            .check pointers as 1 and set its .base pointer as 1 as well. This
3411            allows us to do a DFA construction from the compressed table later,
3412            and ensures that any .base pointers we calculate later are greater
3413            than 0.
3414
3415            - We set 'pos' to indicate the first entry of the second node.
3416
3417            - We then iterate over the columns of the node, finding the first and
3418            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3419            and set the .check pointers accordingly, and advance pos
3420            appropriately and repreat for the next node. Note that when we copy
3421            the next pointers we have to convert them from the original
3422            NODEIDX form to NODENUM form as the former is not valid post
3423            compression.
3424
3425            - If a node has no transitions used we mark its base as 0 and do not
3426            advance the pos pointer.
3427
3428            - If a node only has one transition we use a second pointer into the
3429            structure to fill in allocated fail transitions from other states.
3430            This pointer is independent of the main pointer and scans forward
3431            looking for null transitions that are allocated to a state. When it
3432            finds one it writes the single transition into the "hole".  If the
3433            pointer doesnt find one the single transition is appended as normal.
3434
3435            - Once compressed we can Renew/realloc the structures to release the
3436            excess space.
3437
3438            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3439            specifically Fig 3.47 and the associated pseudocode.
3440
3441            demq
3442         */
3443         const U32 laststate = TRIE_NODENUM( next_alloc );
3444         U32 state, charid;
3445         U32 pos = 0, zp=0;
3446         trie->statecount = laststate;
3447
3448         for ( state = 1 ; state < laststate ; state++ ) {
3449             U8 flag = 0;
3450             const U32 stateidx = TRIE_NODEIDX( state );
3451             const U32 o_used = trie->trans[ stateidx ].check;
3452             U32 used = trie->trans[ stateidx ].check;
3453             trie->trans[ stateidx ].check = 0;
3454
3455             for ( charid = 0;
3456                   used && charid < trie->uniquecharcount;
3457                   charid++ )
3458             {
3459                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3460                     if ( trie->trans[ stateidx + charid ].next ) {
3461                         if (o_used == 1) {
3462                             for ( ; zp < pos ; zp++ ) {
3463                                 if ( ! trie->trans[ zp ].next ) {
3464                                     break;
3465                                 }
3466                             }
3467                             trie->states[ state ].trans.base
3468                                                     = zp
3469                                                       + trie->uniquecharcount
3470                                                       - charid ;
3471                             trie->trans[ zp ].next
3472                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3473                                                              + charid ].next );
3474                             trie->trans[ zp ].check = state;
3475                             if ( ++zp > pos ) pos = zp;
3476                             break;
3477                         }
3478                         used--;
3479                     }
3480                     if ( !flag ) {
3481                         flag = 1;
3482                         trie->states[ state ].trans.base
3483                                        = pos + trie->uniquecharcount - charid ;
3484                     }
3485                     trie->trans[ pos ].next
3486                         = SAFE_TRIE_NODENUM(
3487                                        trie->trans[ stateidx + charid ].next );
3488                     trie->trans[ pos ].check = state;
3489                     pos++;
3490                 }
3491             }
3492         }
3493         trie->lasttrans = pos + 1;
3494         trie->states = (reg_trie_state *)
3495             PerlMemShared_realloc( trie->states, laststate
3496                                    * sizeof(reg_trie_state) );
3497         DEBUG_TRIE_COMPILE_MORE_r(
3498             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3499                 depth+1,
3500                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3501                        + 1 ),
3502                 (IV)next_alloc,
3503                 (IV)pos,
3504                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3505             );
3506
3507         } /* end table compress */
3508     }
3509     DEBUG_TRIE_COMPILE_MORE_r(
3510             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3511                 depth+1,
3512                 (UV)trie->statecount,
3513                 (UV)trie->lasttrans)
3514     );
3515     /* resize the trans array to remove unused space */
3516     trie->trans = (reg_trie_trans *)
3517         PerlMemShared_realloc( trie->trans, trie->lasttrans
3518                                * sizeof(reg_trie_trans) );
3519
3520     {   /* Modify the program and insert the new TRIE node */
3521         U8 nodetype =(U8)(flags & 0xFF);
3522         char *str=NULL;
3523
3524 #ifdef DEBUGGING
3525         regnode *optimize = NULL;
3526 #ifdef RE_TRACK_PATTERN_OFFSETS
3527
3528         U32 mjd_offset = 0;
3529         U32 mjd_nodelen = 0;
3530 #endif /* RE_TRACK_PATTERN_OFFSETS */
3531 #endif /* DEBUGGING */
3532         /*
3533            This means we convert either the first branch or the first Exact,
3534            depending on whether the thing following (in 'last') is a branch
3535            or not and whther first is the startbranch (ie is it a sub part of
3536            the alternation or is it the whole thing.)
3537            Assuming its a sub part we convert the EXACT otherwise we convert
3538            the whole branch sequence, including the first.
3539          */
3540         /* Find the node we are going to overwrite */
3541         if ( first != startbranch || OP( last ) == BRANCH ) {
3542             /* branch sub-chain */
3543             NEXT_OFF( first ) = (U16)(last - first);
3544 #ifdef RE_TRACK_PATTERN_OFFSETS
3545             DEBUG_r({
3546                 mjd_offset= Node_Offset((convert));
3547                 mjd_nodelen= Node_Length((convert));
3548             });
3549 #endif
3550             /* whole branch chain */
3551         }
3552 #ifdef RE_TRACK_PATTERN_OFFSETS
3553         else {
3554             DEBUG_r({
3555                 const  regnode *nop = NEXTOPER( convert );
3556                 mjd_offset= Node_Offset((nop));
3557                 mjd_nodelen= Node_Length((nop));
3558             });
3559         }
3560         DEBUG_OPTIMISE_r(
3561             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3562                 depth+1,
3563                 (UV)mjd_offset, (UV)mjd_nodelen)
3564         );
3565 #endif
3566         /* But first we check to see if there is a common prefix we can
3567            split out as an EXACT and put in front of the TRIE node.  */
3568         trie->startstate= 1;
3569         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3570             /* we want to find the first state that has more than
3571              * one transition, if that state is not the first state
3572              * then we have a common prefix which we can remove.
3573              */
3574             U32 state;
3575             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3576                 U32 ofs = 0;
3577                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3578                                        transition, -1 means none */
3579                 U32 count = 0;
3580                 const U32 base = trie->states[ state ].trans.base;
3581
3582                 /* does this state terminate an alternation? */
3583                 if ( trie->states[state].wordnum )
3584                         count = 1;
3585
3586                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3587                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3588                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3589                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3590                     {
3591                         if ( ++count > 1 ) {
3592                             /* we have more than one transition */
3593                             SV **tmp;
3594                             U8 *ch;
3595                             /* if this is the first state there is no common prefix
3596                              * to extract, so we can exit */
3597                             if ( state == 1 ) break;
3598                             tmp = av_fetch( revcharmap, ofs, 0);
3599                             ch = (U8*)SvPV_nolen_const( *tmp );
3600
3601                             /* if we are on count 2 then we need to initialize the
3602                              * bitmap, and store the previous char if there was one
3603                              * in it*/
3604                             if ( count == 2 ) {
3605                                 /* clear the bitmap */
3606                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3607                                 DEBUG_OPTIMISE_r(
3608                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3609                                         depth+1,
3610                                         (UV)state));
3611                                 if (first_ofs >= 0) {
3612                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3613                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3614
3615                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3616                                     DEBUG_OPTIMISE_r(
3617                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3618                                     );
3619                                 }
3620                             }
3621                             /* store the current firstchar in the bitmap */
3622                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3623                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3624                         }
3625                         first_ofs = ofs;
3626                     }
3627                 }
3628                 if ( count == 1 ) {
3629                     /* This state has only one transition, its transition is part
3630                      * of a common prefix - we need to concatenate the char it
3631                      * represents to what we have so far. */
3632                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3633                     STRLEN len;
3634                     char *ch = SvPV( *tmp, len );
3635                     DEBUG_OPTIMISE_r({
3636                         SV *sv=sv_newmortal();
3637                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3638                             depth+1,
3639                             (UV)state, (UV)first_ofs,
3640                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3641                                 PL_colors[0], PL_colors[1],
3642                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3643                                 PERL_PV_ESCAPE_FIRSTCHAR
3644                             )
3645                         );
3646                     });
3647                     if ( state==1 ) {
3648                         OP( convert ) = nodetype;
3649                         str=STRING(convert);
3650                         setSTR_LEN(convert, 0);
3651                     }
3652                     assert( ( STR_LEN(convert) + len ) < 256 );
3653                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3654                     while (len--)
3655                         *str++ = *ch++;
3656                 } else {
3657 #ifdef DEBUGGING
3658                     if (state>1)
3659                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3660 #endif
3661                     break;
3662                 }
3663             }
3664             trie->prefixlen = (state-1);
3665             if (str) {
3666                 regnode *n = convert+NODE_SZ_STR(convert);
3667                 assert( NODE_SZ_STR(convert) <= U16_MAX );
3668                 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3669                 trie->startstate = state;
3670                 trie->minlen -= (state - 1);
3671                 trie->maxlen -= (state - 1);
3672 #ifdef DEBUGGING
3673                /* At least the UNICOS C compiler choked on this
3674                 * being argument to DEBUG_r(), so let's just have
3675                 * it right here. */
3676                if (
3677 #ifdef PERL_EXT_RE_BUILD
3678                    1
3679 #else
3680                    DEBUG_r_TEST
3681 #endif
3682                    ) {
3683                    regnode *fix = convert;
3684                    U32 word = trie->wordcount;
3685 #ifdef RE_TRACK_PATTERN_OFFSETS
3686                    mjd_nodelen++;
3687 #endif
3688                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3689                    while( ++fix < n ) {
3690                        Set_Node_Offset_Length(fix, 0, 0);
3691                    }
3692                    while (word--) {
3693                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3694                        if (tmp) {
3695                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3696                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3697                            else
3698                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3699                        }
3700                    }
3701                }
3702 #endif
3703                 if (trie->maxlen) {
3704                     convert = n;
3705                 } else {
3706                     NEXT_OFF(convert) = (U16)(tail - convert);
3707                     DEBUG_r(optimize= n);
3708                 }
3709             }
3710         }
3711         if (!jumper)
3712             jumper = last;
3713         if ( trie->maxlen ) {
3714             NEXT_OFF( convert ) = (U16)(tail - convert);
3715             ARG_SET( convert, data_slot );
3716             /* Store the offset to the first unabsorbed branch in
3717                jump[0], which is otherwise unused by the jump logic.
3718                We use this when dumping a trie and during optimisation. */
3719             if (trie->jump)
3720                 trie->jump[0] = (U16)(nextbranch - convert);
3721
3722             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3723              *   and there is a bitmap
3724              *   and the first "jump target" node we found leaves enough room
3725              * then convert the TRIE node into a TRIEC node, with the bitmap
3726              * embedded inline in the opcode - this is hypothetically faster.
3727              */
3728             if ( !trie->states[trie->startstate].wordnum
3729                  && trie->bitmap
3730                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3731             {
3732                 OP( convert ) = TRIEC;
3733                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3734                 PerlMemShared_free(trie->bitmap);
3735                 trie->bitmap= NULL;
3736             } else
3737                 OP( convert ) = TRIE;
3738
3739             /* store the type in the flags */
3740             convert->flags = nodetype;
3741             DEBUG_r({
3742             optimize = convert
3743                       + NODE_STEP_REGNODE
3744                       + regarglen[ OP( convert ) ];
3745             });
3746             /* XXX We really should free up the resource in trie now,
3747                    as we won't use them - (which resources?) dmq */
3748         }
3749         /* needed for dumping*/
3750         DEBUG_r(if (optimize) {
3751             regnode *opt = convert;
3752
3753             while ( ++opt < optimize) {
3754                 Set_Node_Offset_Length(opt, 0, 0);
3755             }
3756             /*
3757                 Try to clean up some of the debris left after the
3758                 optimisation.
3759              */
3760             while( optimize < jumper ) {
3761                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3762                 OP( optimize ) = OPTIMIZED;
3763                 Set_Node_Offset_Length(optimize, 0, 0);
3764                 optimize++;
3765             }
3766             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3767         });
3768     } /* end node insert */
3769
3770     /*  Finish populating the prev field of the wordinfo array.  Walk back
3771      *  from each accept state until we find another accept state, and if
3772      *  so, point the first word's .prev field at the second word. If the
3773      *  second already has a .prev field set, stop now. This will be the
3774      *  case either if we've already processed that word's accept state,
3775      *  or that state had multiple words, and the overspill words were
3776      *  already linked up earlier.
3777      */
3778     {
3779         U16 word;
3780         U32 state;
3781         U16 prev;
3782
3783         for (word=1; word <= trie->wordcount; word++) {
3784             prev = 0;
3785             if (trie->wordinfo[word].prev)
3786                 continue;
3787             state = trie->wordinfo[word].accept;
3788             while (state) {
3789                 state = prev_states[state];
3790                 if (!state)
3791                     break;
3792                 prev = trie->states[state].wordnum;
3793                 if (prev)
3794                     break;
3795             }
3796             trie->wordinfo[word].prev = prev;
3797         }
3798         Safefree(prev_states);
3799     }
3800
3801
3802     /* and now dump out the compressed format */
3803     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3804
3805     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3806 #ifdef DEBUGGING
3807     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3808     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3809 #else
3810     SvREFCNT_dec_NN(revcharmap);
3811 #endif
3812     return trie->jump
3813            ? MADE_JUMP_TRIE
3814            : trie->startstate>1
3815              ? MADE_EXACT_TRIE
3816              : MADE_TRIE;
3817 }
3818
3819 STATIC regnode *
3820 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3821 {
3822 /* The Trie is constructed and compressed now so we can build a fail array if
3823  * it's needed
3824
3825    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3826    3.32 in the
3827    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3828    Ullman 1985/88
3829    ISBN 0-201-10088-6
3830
3831    We find the fail state for each state in the trie, this state is the longest
3832    proper suffix of the current state's 'word' that is also a proper prefix of
3833    another word in our trie. State 1 represents the word '' and is thus the
3834    default fail state. This allows the DFA not to have to restart after its
3835    tried and failed a word at a given point, it simply continues as though it
3836    had been matching the other word in the first place.
3837    Consider
3838       'abcdgu'=~/abcdefg|cdgu/
3839    When we get to 'd' we are still matching the first word, we would encounter
3840    'g' which would fail, which would bring us to the state representing 'd' in
3841    the second word where we would try 'g' and succeed, proceeding to match
3842    'cdgu'.
3843  */
3844  /* add a fail transition */
3845     const U32 trie_offset = ARG(source);
3846     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3847     U32 *q;
3848     const U32 ucharcount = trie->uniquecharcount;
3849     const U32 numstates = trie->statecount;
3850     const U32 ubound = trie->lasttrans + ucharcount;
3851     U32 q_read = 0;
3852     U32 q_write = 0;
3853     U32 charid;
3854     U32 base = trie->states[ 1 ].trans.base;
3855     U32 *fail;
3856     reg_ac_data *aho;
3857     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3858     regnode *stclass;
3859     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3860
3861     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3862     PERL_UNUSED_CONTEXT;
3863 #ifndef DEBUGGING
3864     PERL_UNUSED_ARG(depth);
3865 #endif
3866
3867     if ( OP(source) == TRIE ) {
3868         struct regnode_1 *op = (struct regnode_1 *)
3869             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3870         StructCopy(source, op, struct regnode_1);
3871         stclass = (regnode *)op;
3872     } else {
3873         struct regnode_charclass *op = (struct regnode_charclass *)
3874             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3875         StructCopy(source, op, struct regnode_charclass);
3876         stclass = (regnode *)op;
3877     }
3878     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3879
3880     ARG_SET( stclass, data_slot );
3881     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3882     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3883     aho->trie=trie_offset;
3884     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3885     Copy( trie->states, aho->states, numstates, reg_trie_state );
3886     Newx( q, numstates, U32);
3887     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3888     aho->refcount = 1;
3889     fail = aho->fail;
3890     /* initialize fail[0..1] to be 1 so that we always have
3891        a valid final fail state */
3892     fail[ 0 ] = fail[ 1 ] = 1;
3893
3894     for ( charid = 0; charid < ucharcount ; charid++ ) {
3895         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3896         if ( newstate ) {
3897             q[ q_write ] = newstate;
3898             /* set to point at the root */
3899             fail[ q[ q_write++ ] ]=1;
3900         }
3901     }
3902     while ( q_read < q_write) {
3903         const U32 cur = q[ q_read++ % numstates ];
3904         base = trie->states[ cur ].trans.base;
3905
3906         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3907             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3908             if (ch_state) {
3909                 U32 fail_state = cur;
3910                 U32 fail_base;
3911                 do {
3912                     fail_state = fail[ fail_state ];
3913                     fail_base = aho->states[ fail_state ].trans.base;
3914                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3915
3916                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3917                 fail[ ch_state ] = fail_state;
3918                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3919                 {
3920                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3921                 }
3922                 q[ q_write++ % numstates] = ch_state;
3923             }
3924         }
3925     }
3926     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3927        when we fail in state 1, this allows us to use the
3928        charclass scan to find a valid start char. This is based on the principle
3929        that theres a good chance the string being searched contains lots of stuff
3930        that cant be a start char.
3931      */
3932     fail[ 0 ] = fail[ 1 ] = 0;
3933     DEBUG_TRIE_COMPILE_r({
3934         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3935                       depth, (UV)numstates
3936         );
3937         for( q_read=1; q_read<numstates; q_read++ ) {
3938             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3939         }
3940         Perl_re_printf( aTHX_  "\n");
3941     });
3942     Safefree(q);
3943     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3944     return stclass;
3945 }
3946
3947
3948 /* The below joins as many adjacent EXACTish nodes as possible into a single
3949  * one.  The regop may be changed if the node(s) contain certain sequences that
3950  * require special handling.  The joining is only done if:
3951  * 1) there is room in the current conglomerated node to entirely contain the
3952  *    next one.
3953  * 2) they are compatible node types
3954  *
3955  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3956  * these get optimized out
3957  *
3958  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3959  * as possible, even if that means splitting an existing node so that its first
3960  * part is moved to the preceeding node.  This would maximise the efficiency of
3961  * memEQ during matching.
3962  *
3963  * If a node is to match under /i (folded), the number of characters it matches
3964  * can be different than its character length if it contains a multi-character
3965  * fold.  *min_subtract is set to the total delta number of characters of the
3966  * input nodes.
3967  *
3968  * And *unfolded_multi_char is set to indicate whether or not the node contains
3969  * an unfolded multi-char fold.  This happens when it won't be known until
3970  * runtime whether the fold is valid or not; namely
3971  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3972  *      target string being matched against turns out to be UTF-8 is that fold
3973  *      valid; or
3974  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3975  *      runtime.
3976  * (Multi-char folds whose components are all above the Latin1 range are not
3977  * run-time locale dependent, and have already been folded by the time this
3978  * function is called.)
3979  *
3980  * This is as good a place as any to discuss the design of handling these
3981  * multi-character fold sequences.  It's been wrong in Perl for a very long
3982  * time.  There are three code points in Unicode whose multi-character folds
3983  * were long ago discovered to mess things up.  The previous designs for
3984  * dealing with these involved assigning a special node for them.  This
3985  * approach doesn't always work, as evidenced by this example:
3986  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3987  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3988  * would match just the \xDF, it won't be able to handle the case where a
3989  * successful match would have to cross the node's boundary.  The new approach
3990  * that hopefully generally solves the problem generates an EXACTFUP node
3991  * that is "sss" in this case.
3992  *
3993  * It turns out that there are problems with all multi-character folds, and not
3994  * just these three.  Now the code is general, for all such cases.  The
3995  * approach taken is:
3996  * 1)   This routine examines each EXACTFish node that could contain multi-
3997  *      character folded sequences.  Since a single character can fold into
3998  *      such a sequence, the minimum match length for this node is less than
3999  *      the number of characters in the node.  This routine returns in
4000  *      *min_subtract how many characters to subtract from the actual
4001  *      length of the string to get a real minimum match length; it is 0 if
4002  *      there are no multi-char foldeds.  This delta is used by the caller to
4003  *      adjust the min length of the match, and the delta between min and max,
4004  *      so that the optimizer doesn't reject these possibilities based on size
4005  *      constraints.
4006  *
4007  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4008  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
4009  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4010  *      EXACTFU nodes.  The node type of such nodes is then changed to
4011  *      EXACTFUP, indicating it is problematic, and needs careful handling.
4012  *      (The procedures in step 1) above are sufficient to handle this case in
4013  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
4014  *      the only case where there is a possible fold length change in non-UTF-8
4015  *      patterns.  By reserving a special node type for problematic cases, the
4016  *      far more common regular EXACTFU nodes can be processed faster.
4017  *      regexec.c takes advantage of this.
4018  *
4019  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4020  *      problematic cases.   These all only occur when the pattern is not
4021  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
4022  *      length change, it handles the situation where the string cannot be
4023  *      entirely folded.  The strings in an EXACTFish node are folded as much
4024  *      as possible during compilation in regcomp.c.  This saves effort in
4025  *      regex matching.  By using an EXACTFUP node when it is not possible to
4026  *      fully fold at compile time, regexec.c can know that everything in an
4027  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
4028  *      case where folding in EXACTFU nodes can't be done at compile time is
4029  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
4030  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
4031  *      handle two very different cases.  Alternatively, there could have been
4032  *      a node type where there are length changes, one for unfolded, and one
4033  *      for both.  If yet another special case needed to be created, the number
4034  *      of required node types would have to go to 7.  khw figures that even
4035  *      though there are plenty of node types to spare, that the maintenance
4036  *      cost wasn't worth the small speedup of doing it that way, especially
4037  *      since he thinks the MICRO SIGN is rarely encountered in practice.
4038  *
4039  *      There are other cases where folding isn't done at compile time, but
4040  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
4041  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
4042  *      changes.  Some folds in EXACTF depend on if the runtime target string
4043  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
4044  *      when no fold in it depends on the UTF-8ness of the target string.)
4045  *
4046  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
4047  *      validity of the fold won't be known until runtime, and so must remain
4048  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
4049  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
4050  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
4051  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4052  *      The reason this is a problem is that the optimizer part of regexec.c
4053  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4054  *      that a character in the pattern corresponds to at most a single
4055  *      character in the target string.  (And I do mean character, and not byte
4056  *      here, unlike other parts of the documentation that have never been
4057  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
4058  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4059  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
4060  *      EXACTFL nodes, violate the assumption, and they are the only instances
4061  *      where it is violated.  I'm reluctant to try to change the assumption,
4062  *      as the code involved is impenetrable to me (khw), so instead the code
4063  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
4064  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4065  *      boolean indicating whether or not the node contains such a fold.  When
4066  *      it is true, the caller sets a flag that later causes the optimizer in
4067  *      this file to not set values for the floating and fixed string lengths,
4068  *      and thus avoids the optimizer code in regexec.c that makes the invalid
4069  *      assumption.  Thus, there is no optimization based on string lengths for
4070  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4071  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
4072  *      assumption is wrong only in these cases is that all other non-UTF-8
4073  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4074  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
4075  *      EXACTF nodes because we don't know at compile time if it actually
4076  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
4077  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
4078  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
4079  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4080  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4081  *      string would require the pattern to be forced into UTF-8, the overhead
4082  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
4083  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4084  *      locale.)
4085  *
4086  *      Similarly, the code that generates tries doesn't currently handle
4087  *      not-already-folded multi-char folds, and it looks like a pain to change
4088  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4089  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4090  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4091  *      using /iaa matching will be doing so almost entirely with ASCII
4092  *      strings, so this should rarely be encountered in practice */
4093
4094 STATIC U32
4095 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4096                    UV *min_subtract, bool *unfolded_multi_char,
4097                    U32 flags, regnode *val, U32 depth)
4098 {
4099     /* Merge several consecutive EXACTish nodes into one. */
4100
4101     regnode *n = regnext(scan);
4102     U32 stringok = 1;
4103     regnode *next = scan + NODE_SZ_STR(scan);
4104     U32 merged = 0;
4105     U32 stopnow = 0;
4106 #ifdef DEBUGGING
4107     regnode *stop = scan;
4108     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4109 #else
4110     PERL_UNUSED_ARG(depth);
4111 #endif
4112
4113     PERL_ARGS_ASSERT_JOIN_EXACT;
4114 #ifndef EXPERIMENTAL_INPLACESCAN
4115     PERL_UNUSED_ARG(flags);
4116     PERL_UNUSED_ARG(val);
4117 #endif
4118     DEBUG_PEEP("join", scan, depth, 0);
4119
4120     assert(PL_regkind[OP(scan)] == EXACT);
4121
4122     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4123      * EXACT ones that are mergeable to the current one. */
4124     while (    n
4125            && (    PL_regkind[OP(n)] == NOTHING
4126                || (stringok && PL_regkind[OP(n)] == EXACT))
4127            && NEXT_OFF(n)
4128            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4129     {
4130
4131         if (OP(n) == TAIL || n > next)
4132             stringok = 0;
4133         if (PL_regkind[OP(n)] == NOTHING) {
4134             DEBUG_PEEP("skip:", n, depth, 0);
4135             NEXT_OFF(scan) += NEXT_OFF(n);
4136             next = n + NODE_STEP_REGNODE;
4137 #ifdef DEBUGGING
4138             if (stringok)
4139                 stop = n;
4140 #endif
4141             n = regnext(n);
4142         }
4143         else if (stringok) {
4144             const unsigned int oldl = STR_LEN(scan);
4145             regnode * const nnext = regnext(n);
4146
4147             /* XXX I (khw) kind of doubt that this works on platforms (should
4148              * Perl ever run on one) where U8_MAX is above 255 because of lots
4149              * of other assumptions */
4150             /* Don't join if the sum can't fit into a single node */
4151             if (oldl + STR_LEN(n) > U8_MAX)
4152                 break;
4153
4154             /* Joining something that requires UTF-8 with something that
4155              * doesn't, means the result requires UTF-8. */
4156             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4157                 OP(scan) = EXACT_REQ8;
4158             }
4159             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4160                 ;   /* join is compatible, no need to change OP */
4161             }
4162             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4163                 OP(scan) = EXACTFU_REQ8;
4164             }
4165             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4166                 ;   /* join is compatible, no need to change OP */
4167             }
4168             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4169                 ;   /* join is compatible, no need to change OP */
4170             }
4171             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4172
4173                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4174                   * which can join with EXACTFU ones.  We check for this case
4175                   * here.  These need to be resolved to either EXACTFU or
4176                   * EXACTF at joining time.  They have nothing in them that
4177                   * would forbid them from being the more desirable EXACTFU
4178                   * nodes except that they begin and/or end with a single [Ss].
4179                   * The reason this is problematic is because they could be
4180                   * joined in this loop with an adjacent node that ends and/or
4181                   * begins with [Ss] which would then form the sequence 'ss',
4182                   * which matches differently under /di than /ui, in which case
4183                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4184                   * formed, the nodes get absorbed into any adjacent EXACTFU
4185                   * node.  And if the only adjacent node is EXACTF, they get
4186                   * absorbed into that, under the theory that a longer node is
4187                   * better than two shorter ones, even if one is EXACTFU.  Note
4188                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4189                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4190
4191                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4192
4193                     /* Here the joined node would end with 's'.  If the node
4194                      * following the combination is an EXACTF one, it's better to
4195                      * join this trailing edge 's' node with that one, leaving the
4196                      * current one in 'scan' be the more desirable EXACTFU */
4197                     if (OP(nnext) == EXACTF) {
4198                         break;
4199                     }
4200
4201                     OP(scan) = EXACTFU_S_EDGE;
4202
4203                 }   /* Otherwise, the beginning 's' of the 2nd node just
4204                        becomes an interior 's' in 'scan' */
4205             }
4206             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4207                 ;   /* join is compatible, no need to change OP */
4208             }
4209             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4210
4211                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4212                  * nodes.  But the latter nodes can be also joined with EXACTFU
4213                  * ones, and that is a better outcome, so if the node following
4214                  * 'n' is EXACTFU, quit now so that those two can be joined
4215                  * later */
4216                 if (OP(nnext) == EXACTFU) {
4217                     break;
4218                 }
4219
4220                 /* The join is compatible, and the combined node will be
4221                  * EXACTF.  (These don't care if they begin or end with 's' */
4222             }
4223             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4224                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4225                     && STRING(n)[0] == 's')
4226                 {
4227                     /* When combined, we have the sequence 'ss', which means we
4228                      * have to remain /di */
4229                     OP(scan) = EXACTF;
4230                 }
4231             }
4232             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4233                 if (STRING(n)[0] == 's') {
4234                     ;   /* Here the join is compatible and the combined node
4235                            starts with 's', no need to change OP */
4236                 }
4237                 else {  /* Now the trailing 's' is in the interior */
4238                     OP(scan) = EXACTFU;
4239                 }
4240             }
4241             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4242
4243                 /* The join is compatible, and the combined node will be
4244                  * EXACTF.  (These don't care if they begin or end with 's' */
4245                 OP(scan) = EXACTF;
4246             }
4247             else if (OP(scan) != OP(n)) {
4248
4249                 /* The only other compatible joinings are the same node type */
4250                 break;
4251             }
4252
4253             DEBUG_PEEP("merg", n, depth, 0);
4254             merged++;
4255
4256             NEXT_OFF(scan) += NEXT_OFF(n);
4257             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4258             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4259             next = n + NODE_SZ_STR(n);
4260             /* Now we can overwrite *n : */
4261             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4262 #ifdef DEBUGGING
4263             stop = next - 1;
4264 #endif
4265             n = nnext;
4266             if (stopnow) break;
4267         }
4268
4269 #ifdef EXPERIMENTAL_INPLACESCAN
4270         if (flags && !NEXT_OFF(n)) {
4271             DEBUG_PEEP("atch", val, depth, 0);
4272             if (reg_off_by_arg[OP(n)]) {
4273                 ARG_SET(n, val - n);
4274             }
4275             else {
4276                 NEXT_OFF(n) = val - n;
4277             }
4278             stopnow = 1;
4279         }
4280 #endif
4281     }
4282
4283     /* This temporary node can now be turned into EXACTFU, and must, as
4284      * regexec.c doesn't handle it */
4285     if (OP(scan) == EXACTFU_S_EDGE) {
4286         OP(scan) = EXACTFU;
4287     }
4288
4289     *min_subtract = 0;
4290     *unfolded_multi_char = FALSE;
4291
4292     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4293      * can now analyze for sequences of problematic code points.  (Prior to
4294      * this final joining, sequences could have been split over boundaries, and
4295      * hence missed).  The sequences only happen in folding, hence for any
4296      * non-EXACT EXACTish node */
4297     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4298         U8* s0 = (U8*) STRING(scan);
4299         U8* s = s0;
4300         U8* s_end = s0 + STR_LEN(scan);
4301
4302         int total_count_delta = 0;  /* Total delta number of characters that
4303                                        multi-char folds expand to */
4304
4305         /* One pass is made over the node's string looking for all the
4306          * possibilities.  To avoid some tests in the loop, there are two main
4307          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4308          * non-UTF-8 */
4309         if (UTF) {
4310             U8* folded = NULL;
4311
4312             if (OP(scan) == EXACTFL) {
4313                 U8 *d;
4314
4315                 /* An EXACTFL node would already have been changed to another
4316                  * node type unless there is at least one character in it that
4317                  * is problematic; likely a character whose fold definition
4318                  * won't be known until runtime, and so has yet to be folded.
4319                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4320                  * to handle the UTF-8 case, we need to create a temporary
4321                  * folded copy using UTF-8 locale rules in order to analyze it.
4322                  * This is because our macros that look to see if a sequence is
4323                  * a multi-char fold assume everything is folded (otherwise the
4324                  * tests in those macros would be too complicated and slow).
4325                  * Note that here, the non-problematic folds will have already
4326                  * been done, so we can just copy such characters.  We actually
4327                  * don't completely fold the EXACTFL string.  We skip the
4328                  * unfolded multi-char folds, as that would just create work
4329                  * below to figure out the size they already are */
4330
4331                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4332                 d = folded;
4333                 while (s < s_end) {
4334                     STRLEN s_len = UTF8SKIP(s);
4335                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4336                         Copy(s, d, s_len, U8);
4337                         d += s_len;
4338                     }
4339                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4340                         *unfolded_multi_char = TRUE;
4341                         Copy(s, d, s_len, U8);
4342                         d += s_len;
4343                     }
4344                     else if (isASCII(*s)) {
4345                         *(d++) = toFOLD(*s);
4346                     }
4347                     else {
4348                         STRLEN len;
4349                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4350                         d += len;
4351                     }
4352                     s += s_len;
4353                 }
4354
4355                 /* Point the remainder of the routine to look at our temporary
4356                  * folded copy */
4357                 s = folded;
4358                 s_end = d;
4359             } /* End of creating folded copy of EXACTFL string */
4360
4361             /* Examine the string for a multi-character fold sequence.  UTF-8
4362              * patterns have all characters pre-folded by the time this code is
4363              * executed */
4364             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4365                                      length sequence we are looking for is 2 */
4366             {
4367                 int count = 0;  /* How many characters in a multi-char fold */
4368                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4369                 if (! len) {    /* Not a multi-char fold: get next char */
4370                     s += UTF8SKIP(s);
4371                     continue;
4372                 }
4373
4374                 { /* Here is a generic multi-char fold. */
4375                     U8* multi_end  = s + len;
4376
4377                     /* Count how many characters are in it.  In the case of
4378                      * /aa, no folds which contain ASCII code points are
4379                      * allowed, so check for those, and skip if found. */
4380                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4381                         count = utf8_length(s, multi_end);
4382                         s = multi_end;
4383                     }
4384                     else {
4385                         while (s < multi_end) {
4386                             if (isASCII(*s)) {
4387                                 s++;
4388                                 goto next_iteration;
4389                             }
4390                             else {
4391                                 s += UTF8SKIP(s);
4392                             }
4393                             count++;
4394                         }
4395                     }
4396                 }
4397
4398                 /* The delta is how long the sequence is minus 1 (1 is how long
4399                  * the character that folds to the sequence is) */
4400                 total_count_delta += count - 1;
4401               next_iteration: ;
4402             }
4403
4404             /* We created a temporary folded copy of the string in EXACTFL
4405              * nodes.  Therefore we need to be sure it doesn't go below zero,
4406              * as the real string could be shorter */
4407             if (OP(scan) == EXACTFL) {
4408                 int total_chars = utf8_length((U8*) STRING(scan),
4409                                            (U8*) STRING(scan) + STR_LEN(scan));
4410                 if (total_count_delta > total_chars) {
4411                     total_count_delta = total_chars;
4412                 }
4413             }
4414
4415             *min_subtract += total_count_delta;
4416             Safefree(folded);
4417         }
4418         else if (OP(scan) == EXACTFAA) {
4419
4420             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4421              * fold to the ASCII range (and there are no existing ones in the
4422              * upper latin1 range).  But, as outlined in the comments preceding
4423              * this function, we need to flag any occurrences of the sharp s.
4424              * This character forbids trie formation (because of added
4425              * complexity) */
4426 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4427    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4428                                       || UNICODE_DOT_DOT_VERSION > 0)
4429             while (s < s_end) {
4430                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4431                     OP(scan) = EXACTFAA_NO_TRIE;
4432                     *unfolded_multi_char = TRUE;
4433                     break;
4434                 }
4435                 s++;
4436             }
4437         }
4438         else if (OP(scan) != EXACTFAA_NO_TRIE) {
4439
4440             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4441              * folds that are all Latin1.  As explained in the comments
4442              * preceding this function, we look also for the sharp s in EXACTF
4443              * and EXACTFL nodes; it can be in the final position.  Otherwise
4444              * we can stop looking 1 byte earlier because have to find at least
4445              * two characters for a multi-fold */
4446             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4447                               ? s_end
4448                               : s_end -1;
4449
4450             while (s < upper) {
4451                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4452                 if (! len) {    /* Not a multi-char fold. */
4453                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4454                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4455                     {
4456                         *unfolded_multi_char = TRUE;
4457                     }
4458                     s++;
4459                     continue;
4460                 }
4461
4462                 if (len == 2
4463                     && isALPHA_FOLD_EQ(*s, 's')
4464                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4465                 {
4466
4467                     /* EXACTF nodes need to know that the minimum length
4468                      * changed so that a sharp s in the string can match this
4469                      * ss in the pattern, but they remain EXACTF nodes, as they
4470                      * won't match this unless the target string is in UTF-8,
4471                      * which we don't know until runtime.  EXACTFL nodes can't
4472                      * transform into EXACTFU nodes */
4473                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4474                         OP(scan) = EXACTFUP;
4475                     }
4476                 }
4477
4478                 *min_subtract += len - 1;
4479                 s += len;
4480             }
4481 #endif
4482         }
4483     }
4484
4485 #ifdef DEBUGGING
4486     /* Allow dumping but overwriting the collection of skipped
4487      * ops and/or strings with fake optimized ops */
4488     n = scan + NODE_SZ_STR(scan);
4489     while (n <= stop) {
4490         OP(n) = OPTIMIZED;
4491         FLAGS(n) = 0;
4492         NEXT_OFF(n) = 0;
4493         n++;
4494     }
4495 #endif
4496     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4497     return stopnow;
4498 }
4499
4500 /* REx optimizer.  Converts nodes into quicker variants "in place".
4501    Finds fixed substrings.  */
4502
4503 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4504    to the position after last scanned or to NULL. */
4505
4506 #define INIT_AND_WITHP \
4507     assert(!and_withp); \
4508     Newx(and_withp, 1, regnode_ssc); \
4509     SAVEFREEPV(and_withp)
4510
4511
4512 static void
4513 S_unwind_scan_frames(pTHX_ const void *p)
4514 {
4515     scan_frame *f= (scan_frame *)p;
4516     do {
4517         scan_frame *n= f->next_frame;
4518         Safefree(f);
4519         f= n;
4520     } while (f);
4521 }
4522
4523 /* Follow the next-chain of the current node and optimize away
4524    all the NOTHINGs from it.
4525  */
4526 STATIC void
4527 S_rck_elide_nothing(pTHX_ regnode *node)
4528 {
4529     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4530
4531     if (OP(node) != CURLYX) {
4532         const int max = (reg_off_by_arg[OP(node)]
4533                         ? I32_MAX
4534                           /* I32 may be smaller than U16 on CRAYs! */
4535                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4536         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4537         int noff;
4538         regnode *n = node;
4539
4540         /* Skip NOTHING and LONGJMP. */
4541         while (
4542             (n = regnext(n))
4543             && (
4544                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4545                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4546             )
4547             && off + noff < max
4548         ) {
4549             off += noff;
4550         }
4551         if (reg_off_by_arg[OP(node)])
4552             ARG(node) = off;
4553         else
4554             NEXT_OFF(node) = off;
4555     }
4556     return;
4557 }
4558
4559 /* the return from this sub is the minimum length that could possibly match */
4560 STATIC SSize_t
4561 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4562                         SSize_t *minlenp, SSize_t *deltap,
4563                         regnode *last,
4564                         scan_data_t *data,
4565                         I32 stopparen,
4566                         U32 recursed_depth,
4567                         regnode_ssc *and_withp,
4568                         U32 flags, U32 depth, bool was_mutate_ok)
4569                         /* scanp: Start here (read-write). */
4570                         /* deltap: Write maxlen-minlen here. */
4571                         /* last: Stop before this one. */
4572                         /* data: string data about the pattern */
4573                         /* stopparen: treat close N as END */
4574                         /* recursed: which subroutines have we recursed into */
4575                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4576 {
4577     SSize_t final_minlen;
4578     /* There must be at least this number of characters to match */
4579     SSize_t min = 0;
4580     I32 pars = 0, code;
4581     regnode *scan = *scanp, *next;
4582     SSize_t delta = 0;
4583     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4584     int is_inf_internal = 0;            /* The studied chunk is infinite */
4585     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4586     scan_data_t data_fake;
4587     SV *re_trie_maxbuff = NULL;
4588     regnode *first_non_open = scan;
4589     SSize_t stopmin = OPTIMIZE_INFTY;
4590     scan_frame *frame = NULL;
4591     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4592
4593     PERL_ARGS_ASSERT_STUDY_CHUNK;
4594     RExC_study_started= 1;
4595
4596     Zero(&data_fake, 1, scan_data_t);
4597
4598     if ( depth == 0 ) {
4599         while (first_non_open && OP(first_non_open) == OPEN)
4600             first_non_open=regnext(first_non_open);
4601     }
4602
4603
4604   fake_study_recurse:
4605     DEBUG_r(
4606         RExC_study_chunk_recursed_count++;
4607     );
4608     DEBUG_OPTIMISE_MORE_r(
4609     {
4610         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4611             depth, (long)stopparen,
4612             (unsigned long)RExC_study_chunk_recursed_count,
4613             (unsigned long)depth, (unsigned long)recursed_depth,
4614             scan,
4615             last);
4616         if (recursed_depth) {
4617             U32 i;
4618             U32 j;
4619             for ( j = 0 ; j < recursed_depth ; j++ ) {
4620                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4621                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4622                         Perl_re_printf( aTHX_ " %d",(int)i);
4623                         break;
4624                     }
4625                 }
4626                 if ( j + 1 < recursed_depth ) {
4627                     Perl_re_printf( aTHX_  ",");
4628                 }
4629             }
4630         }
4631         Perl_re_printf( aTHX_ "\n");
4632     }
4633     );
4634     while ( scan && OP(scan) != END && scan < last ){
4635         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4636                                    node length to get a real minimum (because
4637                                    the folded version may be shorter) */
4638         bool unfolded_multi_char = FALSE;
4639         /* avoid mutating ops if we are anywhere within the recursed or
4640          * enframed handling for a GOSUB: the outermost level will handle it.
4641          */
4642         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4643         /* Peephole optimizer: */
4644         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4645         DEBUG_PEEP("Peep", scan, depth, flags);
4646
4647
4648         /* The reason we do this here is that we need to deal with things like
4649          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4650          * parsing code, as each (?:..) is handled by a different invocation of
4651          * reg() -- Yves
4652          */
4653         if (PL_regkind[OP(scan)] == EXACT
4654             && OP(scan) != LEXACT
4655             && OP(scan) != LEXACT_REQ8
4656             && mutate_ok
4657         ) {
4658             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4659                     0, NULL, depth + 1);
4660         }
4661
4662         /* Follow the next-chain of the current node and optimize
4663            away all the NOTHINGs from it.
4664          */
4665         rck_elide_nothing(scan);
4666
4667         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4668          * several different things.  */
4669         if ( OP(scan) == DEFINEP ) {
4670             SSize_t minlen = 0;
4671             SSize_t deltanext = 0;
4672             SSize_t fake_last_close = 0;
4673             I32 f = SCF_IN_DEFINE;
4674
4675             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4676             scan = regnext(scan);
4677             assert( OP(scan) == IFTHEN );
4678             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4679
4680             data_fake.last_closep= &fake_last_close;
4681             minlen = *minlenp;
4682             next = regnext(scan);
4683             scan = NEXTOPER(NEXTOPER(scan));
4684             DEBUG_PEEP("scan", scan, depth, flags);
4685             DEBUG_PEEP("next", next, depth, flags);
4686
4687             /* we suppose the run is continuous, last=next...
4688              * NOTE we dont use the return here! */
4689             /* DEFINEP study_chunk() recursion */
4690             (void)study_chunk(pRExC_state, &scan, &minlen,
4691                               &deltanext, next, &data_fake, stopparen,
4692                               recursed_depth, NULL, f, depth+1, mutate_ok);
4693
4694             scan = next;
4695         } else
4696         if (
4697             OP(scan) == BRANCH  ||
4698             OP(scan) == BRANCHJ ||
4699             OP(scan) == IFTHEN
4700         ) {
4701             next = regnext(scan);
4702             code = OP(scan);
4703
4704             /* The op(next)==code check below is to see if we
4705              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4706              * IFTHEN is special as it might not appear in pairs.
4707              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4708              * we dont handle it cleanly. */
4709             if (OP(next) == code || code == IFTHEN) {
4710                 /* NOTE - There is similar code to this block below for
4711                  * handling TRIE nodes on a re-study.  If you change stuff here
4712                  * check there too. */
4713                 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4714                 regnode_ssc accum;
4715                 regnode * const startbranch=scan;
4716
4717                 if (flags & SCF_DO_SUBSTR) {
4718                     /* Cannot merge strings after this. */
4719                     scan_commit(pRExC_state, data, minlenp, is_inf);
4720                 }
4721
4722                 if (flags & SCF_DO_STCLASS)
4723                     ssc_init_zero(pRExC_state, &accum);
4724
4725                 while (OP(scan) == code) {
4726                     SSize_t deltanext, minnext, fake;
4727                     I32 f = 0;
4728                     regnode_ssc this_class;
4729
4730                     DEBUG_PEEP("Branch", scan, depth, flags);
4731
4732                     num++;
4733                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4734                     if (data) {
4735                         data_fake.whilem_c = data->whilem_c;
4736                         data_fake.last_closep = data->last_closep;
4737                     }
4738                     else
4739                         data_fake.last_closep = &fake;
4740
4741                     data_fake.pos_delta = delta;
4742                     next = regnext(scan);
4743
4744                     scan = NEXTOPER(scan); /* everything */
4745                     if (code != BRANCH)    /* everything but BRANCH */
4746                         scan = NEXTOPER(scan);
4747
4748                     if (flags & SCF_DO_STCLASS) {
4749                         ssc_init(pRExC_state, &this_class);
4750                         data_fake.start_class = &this_class;
4751                         f = SCF_DO_STCLASS_AND;
4752                     }
4753                     if (flags & SCF_WHILEM_VISITED_POS)
4754                         f |= SCF_WHILEM_VISITED_POS;
4755
4756                     /* we suppose the run is continuous, last=next...*/
4757                     /* recurse study_chunk() for each BRANCH in an alternation */
4758                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4759                                       &deltanext, next, &data_fake, stopparen,
4760                                       recursed_depth, NULL, f, depth+1,
4761                                       mutate_ok);
4762
4763                     if (min1 > minnext)
4764                         min1 = minnext;
4765                     if (deltanext == OPTIMIZE_INFTY) {
4766                         is_inf = is_inf_internal = 1;
4767                         max1 = OPTIMIZE_INFTY;
4768                     } else if (max1 < minnext + deltanext)
4769                         max1 = minnext + deltanext;
4770                     scan = next;
4771                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4772                         pars++;
4773                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4774                         if ( stopmin > minnext)
4775                             stopmin = min + min1;
4776                         flags &= ~SCF_DO_SUBSTR;
4777                         if (data)
4778                             data->flags |= SCF_SEEN_ACCEPT;
4779                     }
4780                     if (data) {
4781                         if (data_fake.flags & SF_HAS_EVAL)
4782                             data->flags |= SF_HAS_EVAL;
4783                         data->whilem_c = data_fake.whilem_c;
4784                     }
4785                     if (flags & SCF_DO_STCLASS)
4786                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4787                 }
4788                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4789                     min1 = 0;
4790                 if (flags & SCF_DO_SUBSTR) {
4791                     data->pos_min += min1;
4792                     if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4793                         data->pos_delta = OPTIMIZE_INFTY;
4794                     else
4795                         data->pos_delta += max1 - min1;
4796                     if (max1 != min1 || is_inf)
4797                         data->cur_is_floating = 1;
4798                 }
4799                 min += min1;
4800                 if (delta == OPTIMIZE_INFTY
4801                  || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4802                     delta = OPTIMIZE_INFTY;
4803                 else
4804                     delta += max1 - min1;
4805                 if (flags & SCF_DO_STCLASS_OR) {
4806                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4807                     if (min1) {
4808                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4809                         flags &= ~SCF_DO_STCLASS;
4810                     }
4811                 }
4812                 else if (flags & SCF_DO_STCLASS_AND) {
4813                     if (min1) {
4814                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4815                         flags &= ~SCF_DO_STCLASS;
4816                     }
4817                     else {
4818                         /* Switch to OR mode: cache the old value of
4819                          * data->start_class */
4820                         INIT_AND_WITHP;
4821                         StructCopy(data->start_class, and_withp, regnode_ssc);
4822                         flags &= ~SCF_DO_STCLASS_AND;
4823                         StructCopy(&accum, data->start_class, regnode_ssc);
4824                         flags |= SCF_DO_STCLASS_OR;
4825                     }
4826                 }
4827
4828                 if (PERL_ENABLE_TRIE_OPTIMISATION
4829                     && OP(startbranch) == BRANCH
4830                     && mutate_ok
4831                 ) {
4832                 /* demq.
4833
4834                    Assuming this was/is a branch we are dealing with: 'scan'
4835                    now points at the item that follows the branch sequence,
4836                    whatever it is. We now start at the beginning of the
4837                    sequence and look for subsequences of
4838
4839                    BRANCH->EXACT=>x1
4840                    BRANCH->EXACT=>x2
4841                    tail
4842
4843                    which would be constructed from a pattern like
4844                    /A|LIST|OF|WORDS/
4845
4846                    If we can find such a subsequence we need to turn the first
4847                    element into a trie and then add the subsequent branch exact
4848                    strings to the trie.
4849
4850                    We have two cases
4851
4852                      1. patterns where the whole set of branches can be
4853                         converted.
4854
4855                      2. patterns where only a subset can be converted.
4856
4857                    In case 1 we can replace the whole set with a single regop
4858                    for the trie. In case 2 we need to keep the start and end
4859                    branches so
4860
4861                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4862                      becomes BRANCH TRIE; BRANCH X;
4863
4864                   There is an additional case, that being where there is a
4865                   common prefix, which gets split out into an EXACT like node
4866                   preceding the TRIE node.
4867
4868                   If x(1..n)==tail then we can do a simple trie, if not we make
4869                   a "jump" trie, such that when we match the appropriate word
4870                   we "jump" to the appropriate tail node. Essentially we turn
4871                   a nested if into a case structure of sorts.
4872
4873                 */
4874
4875                     int made=0;
4876                     if (!re_trie_maxbuff) {
4877                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4878                         if (!SvIOK(re_trie_maxbuff))
4879                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4880                     }
4881                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4882                         regnode *cur;
4883                         regnode *first = (regnode *)NULL;
4884                         regnode *prev = (regnode *)NULL;
4885                         regnode *tail = scan;
4886                         U8 trietype = 0;
4887                         U32 count=0;
4888
4889                         /* var tail is used because there may be a TAIL
4890                            regop in the way. Ie, the exacts will point to the
4891                            thing following the TAIL, but the last branch will
4892                            point at the TAIL. So we advance tail. If we
4893                            have nested (?:) we may have to move through several
4894                            tails.
4895                          */
4896
4897                         while ( OP( tail ) == TAIL ) {
4898                             /* this is the TAIL generated by (?:) */
4899                             tail = regnext( tail );
4900                         }
4901
4902
4903                         DEBUG_TRIE_COMPILE_r({
4904                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4905                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4906                               depth+1,
4907                               "Looking for TRIE'able sequences. Tail node is ",
4908                               (UV) REGNODE_OFFSET(tail),
4909                               SvPV_nolen_const( RExC_mysv )
4910                             );
4911                         });
4912
4913                         /*
4914
4915                             Step through the branches
4916                                 cur represents each branch,
4917                                 noper is the first thing to be matched as part
4918                                       of that branch
4919                                 noper_next is the regnext() of that node.
4920
4921                             We normally handle a case like this
4922                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4923                             support building with NOJUMPTRIE, which restricts
4924                             the trie logic to structures like /FOO|BAR/.
4925
4926                             If noper is a trieable nodetype then the branch is
4927                             a possible optimization target. If we are building
4928                             under NOJUMPTRIE then we require that noper_next is
4929                             the same as scan (our current position in the regex
4930                             program).
4931
4932                             Once we have two or more consecutive such branches
4933                             we can create a trie of the EXACT's contents and
4934                             stitch it in place into the program.
4935
4936                             If the sequence represents all of the branches in
4937                             the alternation we replace the entire thing with a
4938                             single TRIE node.
4939
4940                             Otherwise when it is a subsequence we need to
4941                             stitch it in place and replace only the relevant
4942                             branches. This means the first branch has to remain
4943                             as it is used by the alternation logic, and its
4944                             next pointer, and needs to be repointed at the item
4945                             on the branch chain following the last branch we
4946                             have optimized away.
4947
4948                             This could be either a BRANCH, in which case the
4949                             subsequence is internal, or it could be the item
4950                             following the branch sequence in which case the
4951                             subsequence is at the end (which does not
4952                             necessarily mean the first node is the start of the
4953                             alternation).
4954
4955                             TRIE_TYPE(X) is a define which maps the optype to a
4956                             trietype.
4957
4958                                 optype          |  trietype
4959                                 ----------------+-----------
4960                                 NOTHING         | NOTHING
4961                                 EXACT           | EXACT
4962                                 EXACT_REQ8     | EXACT
4963                                 EXACTFU         | EXACTFU
4964                                 EXACTFU_REQ8   | EXACTFU
4965                                 EXACTFUP        | EXACTFU
4966                                 EXACTFAA        | EXACTFAA
4967                                 EXACTL          | EXACTL
4968                                 EXACTFLU8       | EXACTFLU8
4969
4970
4971                         */
4972 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4973                        ? NOTHING                                            \
4974                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4975                          ? EXACT                                            \
4976                          : (     EXACTFU == (X)                             \
4977                               || EXACTFU_REQ8 == (X)                       \
4978                               || EXACTFUP == (X) )                          \
4979                            ? EXACTFU                                        \
4980                            : ( EXACTFAA == (X) )                            \
4981                              ? EXACTFAA                                     \
4982                              : ( EXACTL == (X) )                            \
4983                                ? EXACTL                                     \
4984                                : ( EXACTFLU8 == (X) )                       \
4985                                  ? EXACTFLU8                                \
4986                                  : 0 )
4987
4988                         /* dont use tail as the end marker for this traverse */
4989                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4990                             regnode * const noper = NEXTOPER( cur );
4991                             U8 noper_type = OP( noper );
4992                             U8 noper_trietype = TRIE_TYPE( noper_type );
4993 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4994                             regnode * const noper_next = regnext( noper );
4995                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4996                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4997 #endif
4998
4999                             DEBUG_TRIE_COMPILE_r({
5000                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5001                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
5002                                    depth+1,
5003                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5004
5005                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5006                                 Perl_re_printf( aTHX_  " -> %d:%s",
5007                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5008
5009                                 if ( noper_next ) {
5010                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5011                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5012                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5013                                 }
5014                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5015                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5016                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5017                                 );
5018                             });
5019
5020                             /* Is noper a trieable nodetype that can be merged
5021                              * with the current trie (if there is one)? */
5022                             if ( noper_trietype
5023                                   &&
5024                                   (
5025                                         ( noper_trietype == NOTHING )
5026                                         || ( trietype == NOTHING )
5027                                         || ( trietype == noper_trietype )
5028                                   )
5029 #ifdef NOJUMPTRIE
5030                                   && noper_next >= tail
5031 #endif
5032                                   && count < U16_MAX)
5033                             {
5034                                 /* Handle mergable triable node Either we are
5035                                  * the first node in a new trieable sequence,
5036                                  * in which case we do some bookkeeping,
5037                                  * otherwise we update the end pointer. */
5038                                 if ( !first ) {
5039                                     first = cur;
5040                                     if ( noper_trietype == NOTHING ) {
5041 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5042                                         regnode * const noper_next = regnext( noper );
5043                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5044                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5045 #endif
5046
5047                                         if ( noper_next_trietype ) {
5048                                             trietype = noper_next_trietype;
5049                                         } else if (noper_next_type)  {
5050                                             /* a NOTHING regop is 1 regop wide.
5051                                              * We need at least two for a trie
5052                                              * so we can't merge this in */
5053                                             first = NULL;
5054                                         }
5055                                     } else {
5056                                         trietype = noper_trietype;
5057                                     }
5058                                 } else {
5059                                     if ( trietype == NOTHING )
5060                                         trietype = noper_trietype;
5061                                     prev = cur;
5062                                 }
5063                                 if (first)
5064                                     count++;
5065                             } /* end handle mergable triable node */
5066                             else {
5067                                 /* handle unmergable node -
5068                                  * noper may either be a triable node which can
5069                                  * not be tried together with the current trie,
5070                                  * or a non triable node */
5071                                 if ( prev ) {
5072                                     /* If last is set and trietype is not
5073                                      * NOTHING then we have found at least two
5074                                      * triable branch sequences in a row of a
5075                                      * similar trietype so we can turn them
5076                                      * into a trie. If/when we allow NOTHING to
5077                                      * start a trie sequence this condition
5078                                      * will be required, and it isn't expensive
5079                                      * so we leave it in for now. */
5080                                     if ( trietype && trietype != NOTHING )
5081                                         make_trie( pRExC_state,
5082                                                 startbranch, first, cur, tail,
5083                                                 count, trietype, depth+1 );
5084                                     prev = NULL; /* note: we clear/update
5085                                                     first, trietype etc below,
5086                                                     so we dont do it here */
5087                                 }
5088                                 if ( noper_trietype
5089 #ifdef NOJUMPTRIE
5090                                      && noper_next >= tail
5091 #endif
5092                                 ){
5093                                     /* noper is triable, so we can start a new
5094                                      * trie sequence */
5095                                     count = 1;
5096                                     first = cur;
5097                                     trietype = noper_trietype;
5098                                 } else if (first) {
5099                                     /* if we already saw a first but the
5100                                      * current node is not triable then we have
5101                                      * to reset the first information. */
5102                                     count = 0;
5103                                     first = NULL;
5104                                     trietype = 0;
5105                                 }
5106                             } /* end handle unmergable node */
5107                         } /* loop over branches */
5108                         DEBUG_TRIE_COMPILE_r({
5109                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5110                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5111                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5112                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5113                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5114                                PL_reg_name[trietype]
5115                             );
5116
5117                         });
5118                         if ( prev && trietype ) {
5119                             if ( trietype != NOTHING ) {
5120                                 /* the last branch of the sequence was part of
5121                                  * a trie, so we have to construct it here
5122                                  * outside of the loop */
5123                                 made= make_trie( pRExC_state, startbranch,
5124                                                  first, scan, tail, count,
5125                                                  trietype, depth+1 );
5126 #ifdef TRIE_STUDY_OPT
5127                                 if ( ((made == MADE_EXACT_TRIE &&
5128                                      startbranch == first)
5129                                      || ( first_non_open == first )) &&
5130                                      depth==0 ) {
5131                                     flags |= SCF_TRIE_RESTUDY;
5132                                     if ( startbranch == first
5133                                          && scan >= tail )
5134                                     {
5135                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5136                                     }
5137                                 }
5138 #endif
5139                             } else {
5140                                 /* at this point we know whatever we have is a
5141                                  * NOTHING sequence/branch AND if 'startbranch'
5142                                  * is 'first' then we can turn the whole thing
5143                                  * into a NOTHING
5144                                  */
5145                                 if ( startbranch == first ) {
5146                                     regnode *opt;
5147                                     /* the entire thing is a NOTHING sequence,
5148                                      * something like this: (?:|) So we can
5149                                      * turn it into a plain NOTHING op. */
5150                                     DEBUG_TRIE_COMPILE_r({
5151                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5152                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5153                                           depth+1,
5154                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5155
5156                                     });
5157                                     OP(startbranch)= NOTHING;
5158                                     NEXT_OFF(startbranch)= tail - startbranch;
5159                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5160                                         OP(opt)= OPTIMIZED;
5161                                 }
5162                             }
5163                         } /* end if ( prev) */
5164                     } /* TRIE_MAXBUF is non zero */
5165                 } /* do trie */
5166
5167             }
5168             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5169                 scan = NEXTOPER(NEXTOPER(scan));
5170             } else                      /* single branch is optimized. */
5171                 scan = NEXTOPER(scan);
5172             continue;
5173         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5174             I32 paren = 0;
5175             regnode *start = NULL;
5176             regnode *end = NULL;
5177             U32 my_recursed_depth= recursed_depth;
5178
5179             if (OP(scan) != SUSPEND) { /* GOSUB */
5180                 /* Do setup, note this code has side effects beyond
5181                  * the rest of this block. Specifically setting
5182                  * RExC_recurse[] must happen at least once during
5183                  * study_chunk(). */
5184                 paren = ARG(scan);
5185                 RExC_recurse[ARG2L(scan)] = scan;
5186                 start = REGNODE_p(RExC_open_parens[paren]);
5187                 end   = REGNODE_p(RExC_close_parens[paren]);
5188
5189                 /* NOTE we MUST always execute the above code, even
5190                  * if we do nothing with a GOSUB */
5191                 if (
5192                     ( flags & SCF_IN_DEFINE )
5193                     ||
5194                     (
5195                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5196                         &&
5197                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5198                     )
5199                 ) {
5200                     /* no need to do anything here if we are in a define. */
5201                     /* or we are after some kind of infinite construct
5202                      * so we can skip recursing into this item.
5203                      * Since it is infinite we will not change the maxlen
5204                      * or delta, and if we miss something that might raise
5205                      * the minlen it will merely pessimise a little.
5206                      *
5207                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5208                      * might result in a minlen of 1 and not of 4,
5209                      * but this doesn't make us mismatch, just try a bit
5210                      * harder than we should.
5211                      *
5212                      * However we must assume this GOSUB is infinite, to
5213                      * avoid wrongly applying other optimizations in the
5214                      * enclosing scope - see GH 18096, for example.
5215                      */
5216                     is_inf = is_inf_internal = 1;
5217                     scan= regnext(scan);
5218                     continue;
5219                 }
5220
5221                 if (
5222                     !recursed_depth
5223                     || !PAREN_TEST(recursed_depth - 1, paren)
5224                 ) {
5225                     /* it is quite possible that there are more efficient ways
5226                      * to do this. We maintain a bitmap per level of recursion
5227                      * of which patterns we have entered so we can detect if a
5228                      * pattern creates a possible infinite loop. When we
5229                      * recurse down a level we copy the previous levels bitmap
5230                      * down. When we are at recursion level 0 we zero the top
5231                      * level bitmap. It would be nice to implement a different
5232                      * more efficient way of doing this. In particular the top
5233                      * level bitmap may be unnecessary.
5234                      */
5235                     if (!recursed_depth) {
5236                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5237                     } else {
5238                         Copy(PAREN_OFFSET(recursed_depth - 1),
5239                              PAREN_OFFSET(recursed_depth),
5240                              RExC_study_chunk_recursed_bytes, U8);
5241                     }
5242                     /* we havent recursed into this paren yet, so recurse into it */
5243                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5244                     PAREN_SET(recursed_depth, paren);
5245                     my_recursed_depth= recursed_depth + 1;
5246                 } else {
5247                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5248                     /* some form of infinite recursion, assume infinite length
5249                      * */
5250                     if (flags & SCF_DO_SUBSTR) {
5251                         scan_commit(pRExC_state, data, minlenp, is_inf);
5252                         data->cur_is_floating = 1;
5253                     }
5254                     is_inf = is_inf_internal = 1;
5255                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5256                         ssc_anything(data->start_class);
5257                     flags &= ~SCF_DO_STCLASS;
5258
5259                     start= NULL; /* reset start so we dont recurse later on. */
5260                 }
5261             } else {
5262                 paren = stopparen;
5263                 start = scan + 2;
5264                 end = regnext(scan);
5265             }
5266             if (start) {
5267                 scan_frame *newframe;
5268                 assert(end);
5269                 if (!RExC_frame_last) {
5270                     Newxz(newframe, 1, scan_frame);
5271                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5272                     RExC_frame_head= newframe;
5273                     RExC_frame_count++;
5274                 } else if (!RExC_frame_last->next_frame) {
5275                     Newxz(newframe, 1, scan_frame);
5276                     RExC_frame_last->next_frame= newframe;
5277                     newframe->prev_frame= RExC_frame_last;
5278                     RExC_frame_count++;
5279                 } else {
5280                     newframe= RExC_frame_last->next_frame;
5281                 }
5282                 RExC_frame_last= newframe;
5283
5284                 newframe->next_regnode = regnext(scan);
5285                 newframe->last_regnode = last;
5286                 newframe->stopparen = stopparen;
5287                 newframe->prev_recursed_depth = recursed_depth;
5288                 newframe->this_prev_frame= frame;
5289                 newframe->in_gosub = (
5290                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5291                 );
5292
5293                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5294                 DEBUG_PEEP("fnew", scan, depth, flags);
5295
5296                 frame = newframe;
5297                 scan =  start;
5298                 stopparen = paren;
5299                 last = end;
5300                 depth = depth + 1;
5301                 recursed_depth= my_recursed_depth;
5302
5303                 continue;
5304             }
5305         }
5306         else if (   OP(scan) == EXACT
5307                  || OP(scan) == LEXACT
5308                  || OP(scan) == EXACT_REQ8
5309                  || OP(scan) == LEXACT_REQ8
5310                  || OP(scan) == EXACTL)
5311         {
5312             SSize_t bytelen = STR_LEN(scan), charlen;
5313             UV uc;
5314             assert(bytelen);
5315             if (UTF) {
5316                 const U8 * const s = (U8*)STRING(scan);
5317                 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5318                 charlen = utf8_length(s, s + bytelen);
5319             } else {
5320                 uc = *((U8*)STRING(scan));
5321                 charlen = bytelen;
5322             }
5323             min += charlen;
5324             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5325                 /* The code below prefers earlier match for fixed
5326                    offset, later match for variable offset.  */
5327                 if (data->last_end == -1) { /* Update the start info. */
5328                     data->last_start_min = data->pos_min;
5329                     data->last_start_max =
5330                         is_inf ? OPTIMIZE_INFTY
5331                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5332                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5333                 }
5334                 sv_catpvn(data->last_found, STRING(scan), bytelen);
5335                 if (UTF)
5336                     SvUTF8_on(data->last_found);
5337                 {
5338                     SV * const sv = data->last_found;
5339                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5340                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5341                     if (mg && mg->mg_len >= 0)
5342                         mg->mg_len += charlen;
5343                 }
5344                 data->last_end = data->pos_min + charlen;
5345                 data->pos_min += charlen; /* As in the first entry. */
5346                 data->flags &= ~SF_BEFORE_EOL;
5347             }
5348
5349             /* ANDing the code point leaves at most it, and not in locale, and
5350              * can't match null string */
5351             if (flags & SCF_DO_STCLASS_AND) {
5352                 ssc_cp_and(data->start_class, uc);
5353                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5354                 ssc_clear_locale(data->start_class);
5355             }
5356             else if (flags & SCF_DO_STCLASS_OR) {
5357                 ssc_add_cp(data->start_class, uc);
5358                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5359
5360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5361                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5362             }
5363             flags &= ~SCF_DO_STCLASS;
5364         }
5365         else if (PL_regkind[OP(scan)] == EXACT) {
5366             /* But OP != EXACT!, so is EXACTFish */
5367             SSize_t bytelen = STR_LEN(scan), charlen;
5368             const U8 * s = (U8*)STRING(scan);
5369
5370             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5371              * with the mask set to the complement of the bit that differs
5372              * between upper and lower case, and the lowest code point of the
5373              * pair (which the '&' forces) */
5374             if (     bytelen == 1
5375                 &&   isALPHA_A(*s)
5376                 &&  (         OP(scan) == EXACTFAA
5377                      || (     OP(scan) == EXACTFU
5378                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5379                 &&   mutate_ok
5380             ) {
5381                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5382
5383                 OP(scan) = ANYOFM;
5384                 ARG_SET(scan, *s & mask);
5385                 FLAGS(scan) = mask;
5386                 /* we're not EXACTFish any more, so restudy */
5387                 continue;
5388             }
5389
5390             /* Search for fixed substrings supports EXACT only. */
5391             if (flags & SCF_DO_SUBSTR) {
5392                 assert(data);
5393                 scan_commit(pRExC_state, data, minlenp, is_inf);
5394             }
5395             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5396             if (unfolded_multi_char) {
5397                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5398             }
5399             min += charlen - min_subtract;
5400             assert (min >= 0);
5401             delta += min_subtract;
5402             if (flags & SCF_DO_SUBSTR) {
5403                 data->pos_min += charlen - min_subtract;
5404                 if (data->pos_min < 0) {
5405                     data->pos_min = 0;
5406                 }
5407                 data->pos_delta += min_subtract;
5408                 if (min_subtract) {
5409                     data->cur_is_floating = 1; /* float */
5410                 }
5411             }
5412
5413             if (flags & SCF_DO_STCLASS) {
5414                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5415
5416                 assert(EXACTF_invlist);
5417                 if (flags & SCF_DO_STCLASS_AND) {
5418                     if (OP(scan) != EXACTFL)
5419                         ssc_clear_locale(data->start_class);
5420                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5421                     ANYOF_POSIXL_ZERO(data->start_class);
5422                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5423                 }
5424                 else {  /* SCF_DO_STCLASS_OR */
5425                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5426                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5427
5428                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5429                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5430                 }
5431                 flags &= ~SCF_DO_STCLASS;
5432                 SvREFCNT_dec(EXACTF_invlist);
5433             }
5434         }
5435         else if (REGNODE_VARIES(OP(scan))) {
5436             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5437             I32 fl = 0, f = flags;
5438             regnode * const oscan = scan;
5439             regnode_ssc this_class;
5440             regnode_ssc *oclass = NULL;
5441             I32 next_is_eval = 0;
5442
5443             switch (PL_regkind[OP(scan)]) {
5444             case WHILEM:                /* End of (?:...)* . */
5445                 scan = NEXTOPER(scan);
5446                 goto finish;
5447             case PLUS:
5448                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5449                     next = NEXTOPER(scan);
5450                     if (   OP(next) == EXACT
5451                         || OP(next) == LEXACT
5452                         || OP(next) == EXACT_REQ8
5453                         || OP(next) == LEXACT_REQ8
5454                         || OP(next) == EXACTL
5455                         || (flags & SCF_DO_STCLASS))
5456                     {
5457                         mincount = 1;
5458                         maxcount = REG_INFTY;
5459                         next = regnext(scan);
5460                         scan = NEXTOPER(scan);
5461                         goto do_curly;
5462                     }
5463                 }
5464                 if (flags & SCF_DO_SUBSTR)
5465                     data->pos_min++;
5466                 /* This will bypass the formal 'min += minnext * mincount'
5467                  * calculation in the do_curly path, so assumes min width
5468                  * of the PLUS payload is exactly one. */
5469                 min++;
5470                 /* FALLTHROUGH */
5471             case STAR:
5472                 next = NEXTOPER(scan);
5473
5474                 /* This temporary node can now be turned into EXACTFU, and
5475                  * must, as regexec.c doesn't handle it */
5476                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5477                     OP(next) = EXACTFU;
5478                 }
5479
5480                 if (     STR_LEN(next) == 1
5481                     &&   isALPHA_A(* STRING(next))
5482                     && (         OP(next) == EXACTFAA
5483                         || (     OP(next) == EXACTFU
5484                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5485                     &&   mutate_ok
5486                 ) {
5487                     /* These differ in just one bit */
5488                     U8 mask = ~ ('A' ^ 'a');
5489
5490                     assert(isALPHA_A(* STRING(next)));
5491
5492                     /* Then replace it by an ANYOFM node, with
5493                     * the mask set to the complement of the
5494                     * bit that differs between upper and lower
5495                     * case, and the lowest code point of the
5496                     * pair (which the '&' forces) */
5497                     OP(next) = ANYOFM;
5498                     ARG_SET(next, *STRING(next) & mask);
5499                     FLAGS(next) = mask;
5500                 }
5501
5502                 if (flags & SCF_DO_STCLASS) {
5503                     mincount = 0;
5504                     maxcount = REG_INFTY;
5505                     next = regnext(scan);
5506                     scan = NEXTOPER(scan);
5507                     goto do_curly;
5508                 }
5509                 if (flags & SCF_DO_SUBSTR) {
5510                     scan_commit(pRExC_state, data, minlenp, is_inf);
5511                     /* Cannot extend fixed substrings */
5512                     data->cur_is_floating = 1; /* float */
5513                 }
5514                 is_inf = is_inf_internal = 1;
5515                 scan = regnext(scan);
5516                 goto optimize_curly_tail;
5517             case CURLY:
5518                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5519                     && (scan->flags == stopparen))
5520                 {
5521                     mincount = 1;
5522                     maxcount = 1;
5523                 } else {
5524                     mincount = ARG1(scan);
5525                     maxcount = ARG2(scan);
5526                 }
5527                 next = regnext(scan);
5528                 if (OP(scan) == CURLYX) {
5529                     I32 lp = (data ? *(data->last_closep) : 0);
5530                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5531                 }
5532                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5533                 next_is_eval = (OP(scan) == EVAL);
5534               do_curly:
5535                 if (flags & SCF_DO_SUBSTR) {
5536                     if (mincount == 0)
5537                         scan_commit(pRExC_state, data, minlenp, is_inf);
5538                     /* Cannot extend fixed substrings */
5539                     pos_before = data->pos_min;
5540                 }
5541                 if (data) {
5542                     fl = data->flags;
5543                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5544                     if (is_inf)
5545                         data->flags |= SF_IS_INF;
5546                 }
5547                 if (flags & SCF_DO_STCLASS) {
5548                     ssc_init(pRExC_state, &this_class);
5549                     oclass = data->start_class;
5550                     data->start_class = &this_class;
5551                     f |= SCF_DO_STCLASS_AND;
5552                     f &= ~SCF_DO_STCLASS_OR;
5553                 }
5554                 /* Exclude from super-linear cache processing any {n,m}
5555                    regops for which the combination of input pos and regex
5556                    pos is not enough information to determine if a match
5557                    will be possible.
5558
5559                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5560                    regex pos at the \s*, the prospects for a match depend not
5561                    only on the input position but also on how many (bar\s*)
5562                    repeats into the {4,8} we are. */
5563                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5564                     f &= ~SCF_WHILEM_VISITED_POS;
5565
5566                 /* This will finish on WHILEM, setting scan, or on NULL: */
5567                 /* recurse study_chunk() on loop bodies */
5568                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5569                                   last, data, stopparen, recursed_depth, NULL,
5570                                   (mincount == 0
5571                                    ? (f & ~SCF_DO_SUBSTR)
5572                                    : f)
5573                                   , depth+1, mutate_ok);
5574
5575                 if (flags & SCF_DO_STCLASS)
5576                     data->start_class = oclass;
5577                 if (mincount == 0 || minnext == 0) {
5578                     if (flags & SCF_DO_STCLASS_OR) {
5579                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5580                     }
5581                     else if (flags & SCF_DO_STCLASS_AND) {
5582                         /* Switch to OR mode: cache the old value of
5583                          * data->start_class */
5584                         INIT_AND_WITHP;
5585                         StructCopy(data->start_class, and_withp, regnode_ssc);
5586                         flags &= ~SCF_DO_STCLASS_AND;
5587                         StructCopy(&this_class, data->start_class, regnode_ssc);
5588                         flags |= SCF_DO_STCLASS_OR;
5589                         ANYOF_FLAGS(data->start_class)
5590                                                 |= SSC_MATCHES_EMPTY_STRING;
5591                     }
5592                 } else {                /* Non-zero len */
5593                     if (flags & SCF_DO_STCLASS_OR) {
5594                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5595                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5596                     }
5597                     else if (flags & SCF_DO_STCLASS_AND)
5598                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5599                     flags &= ~SCF_DO_STCLASS;
5600                 }
5601                 if (!scan)              /* It was not CURLYX, but CURLY. */
5602                     scan = next;
5603                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5604                     /* ? quantifier ok, except for (?{ ... }) */
5605                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5606                     && (minnext == 0) && (deltanext == 0)
5607                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5608                     && maxcount <= REG_INFTY/3) /* Complement check for big
5609                                                    count */
5610                 {
5611                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5612                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5613                             "Quantifier unexpected on zero-length expression "
5614                             "in regex m/%" UTF8f "/",
5615                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5616                                   RExC_precomp)));
5617                 }
5618
5619                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5620                     || min >= SSize_t_MAX - minnext * mincount )
5621                 {
5622                     FAIL("Regexp out of space");
5623                 }
5624
5625                 min += minnext * mincount;
5626                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5627                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5628                 is_inf |= is_inf_internal;
5629                 if (is_inf) {
5630                     delta = OPTIMIZE_INFTY;
5631                 } else {
5632                     delta += (minnext + deltanext) * maxcount
5633                              - minnext * mincount;
5634                 }
5635                 /* Try powerful optimization CURLYX => CURLYN. */
5636                 if (  OP(oscan) == CURLYX && data
5637                       && data->flags & SF_IN_PAR
5638                       && !(data->flags & SF_HAS_EVAL)
5639                       && !deltanext && minnext == 1
5640                       && mutate_ok
5641                 ) {
5642                     /* Try to optimize to CURLYN.  */
5643                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5644                     regnode * const nxt1 = nxt;
5645 #ifdef DEBUGGING
5646                     regnode *nxt2;
5647 #endif
5648
5649                     /* Skip open. */
5650                     nxt = regnext(nxt);
5651                     if (!REGNODE_SIMPLE(OP(nxt))
5652                         && !(PL_regkind[OP(nxt)] == EXACT
5653                              && STR_LEN(nxt) == 1))
5654                         goto nogo;
5655 #ifdef DEBUGGING
5656                     nxt2 = nxt;
5657 #endif
5658                     nxt = regnext(nxt);
5659                     if (OP(nxt) != CLOSE)
5660                         goto nogo;
5661                     if (RExC_open_parens) {
5662
5663                         /*open->CURLYM*/
5664                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5665
5666                         /*close->while*/
5667                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5668                     }
5669                     /* Now we know that nxt2 is the only contents: */
5670                     oscan->flags = (U8)ARG(nxt);
5671                     OP(oscan) = CURLYN;
5672                     OP(nxt1) = NOTHING; /* was OPEN. */
5673
5674 #ifdef DEBUGGING
5675                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5676                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5677                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5678                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5679                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5680                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5681 #endif
5682                 }
5683               nogo:
5684
5685                 /* Try optimization CURLYX => CURLYM. */
5686                 if (  OP(oscan) == CURLYX && data
5687                       && !(data->flags & SF_HAS_PAR)
5688                       && !(data->flags & SF_HAS_EVAL)
5689                       && !deltanext     /* atom is fixed width */
5690                       && minnext != 0   /* CURLYM can't handle zero width */
5691                          /* Nor characters whose fold at run-time may be
5692                           * multi-character */
5693                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5694                       && mutate_ok
5695                 ) {
5696                     /* XXXX How to optimize if data == 0? */
5697                     /* Optimize to a simpler form.  */
5698                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5699                     regnode *nxt2;
5700
5701                     OP(oscan) = CURLYM;
5702                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5703                             && (OP(nxt2) != WHILEM))
5704                         nxt = nxt2;
5705                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5706                     /* Need to optimize away parenths. */
5707                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5708                         /* Set the parenth number.  */
5709                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5710
5711                         oscan->flags = (U8)ARG(nxt);
5712                         if (RExC_open_parens) {
5713                              /*open->CURLYM*/
5714                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5715
5716                             /*close->NOTHING*/
5717                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5718                                                          + 1;
5719                         }
5720                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5721                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5722
5723 #ifdef DEBUGGING
5724                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5725                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5726                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5727                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5728 #endif
5729 #if 0
5730                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5731                             regnode *nnxt = regnext(nxt1);
5732                             if (nnxt == nxt) {
5733                                 if (reg_off_by_arg[OP(nxt1)])
5734                                     ARG_SET(nxt1, nxt2 - nxt1);
5735                                 else if (nxt2 - nxt1 < U16_MAX)
5736                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5737                                 else
5738                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5739                             }
5740                             nxt1 = nnxt;
5741                         }
5742 #endif
5743                         /* Optimize again: */
5744                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5745                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5746                                     NULL, stopparen, recursed_depth, NULL, 0,
5747                                     depth+1, mutate_ok);
5748                     }
5749                     else
5750                         oscan->flags = 0;
5751                 }
5752                 else if ((OP(oscan) == CURLYX)
5753                          && (flags & SCF_WHILEM_VISITED_POS)
5754                          /* See the comment on a similar expression above.
5755                             However, this time it's not a subexpression
5756                             we care about, but the expression itself. */
5757                          && (maxcount == REG_INFTY)
5758                          && data) {
5759                     /* This stays as CURLYX, we can put the count/of pair. */
5760                     /* Find WHILEM (as in regexec.c) */
5761                     regnode *nxt = oscan + NEXT_OFF(oscan);
5762
5763                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5764                         nxt += ARG(nxt);
5765                     nxt = PREVOPER(nxt);
5766                     if (nxt->flags & 0xf) {
5767                         /* we've already set whilem count on this node */
5768                     } else if (++data->whilem_c < 16) {
5769                         assert(data->whilem_c <= RExC_whilem_seen);
5770                         nxt->flags = (U8)(data->whilem_c
5771                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5772                     }
5773                 }
5774                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5775                     pars++;
5776                 if (flags & SCF_DO_SUBSTR) {
5777                     SV *last_str = NULL;
5778                     STRLEN last_chrs = 0;
5779                     int counted = mincount != 0;
5780
5781                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5782                                                                   string. */
5783                         SSize_t b = pos_before >= data->last_start_min
5784                             ? pos_before : data->last_start_min;
5785                         STRLEN l;
5786                         const char * const s = SvPV_const(data->last_found, l);
5787                         SSize_t old = b - data->last_start_min;
5788                         assert(old >= 0);
5789
5790                         if (UTF)
5791                             old = utf8_hop_forward((U8*)s, old,
5792                                                (U8 *) SvEND(data->last_found))
5793                                 - (U8*)s;
5794                         l -= old;
5795                         /* Get the added string: */
5796                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5797                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5798                                             (U8*)(s + old + l)) : l;
5799                         if (deltanext == 0 && pos_before == b) {
5800                             /* What was added is a constant string */
5801                             if (mincount > 1) {
5802
5803                                 SvGROW(last_str, (mincount * l) + 1);
5804                                 repeatcpy(SvPVX(last_str) + l,
5805                                           SvPVX_const(last_str), l,
5806                                           mincount - 1);
5807                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5808                                 /* Add additional parts. */
5809                                 SvCUR_set(data->last_found,
5810                                           SvCUR(data->last_found) - l);
5811                                 sv_catsv(data->last_found, last_str);
5812                                 {
5813                                     SV * sv = data->last_found;
5814                                     MAGIC *mg =
5815                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5816                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5817                                     if (mg && mg->mg_len >= 0)
5818                                         mg->mg_len += last_chrs * (mincount-1);
5819                                 }
5820                                 last_chrs *= mincount;
5821                                 data->last_end += l * (mincount - 1);
5822                             }
5823                         } else {
5824                             /* start offset must point into the last copy */
5825                             data->last_start_min += minnext * (mincount - 1);
5826                             data->last_start_max =
5827                               is_inf
5828                                ? OPTIMIZE_INFTY
5829                                : data->last_start_max +
5830                                  (maxcount - 1) * (minnext + data->pos_delta);
5831                         }
5832                     }
5833                     /* It is counted once already... */
5834                     data->pos_min += minnext * (mincount - counted);
5835 #if 0
5836 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5837                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5838                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5839     (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5840     (UV)mincount);
5841 if (deltanext != OPTIMIZE_INFTY)
5842 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5843     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5844           - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5845 #endif
5846                     if (deltanext == OPTIMIZE_INFTY
5847                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5848                         data->pos_delta = OPTIMIZE_INFTY;
5849                     else
5850                         data->pos_delta += - counted * deltanext +
5851                         (minnext + deltanext) * maxcount - minnext * mincount;
5852                     if (mincount != maxcount) {
5853                          /* Cannot extend fixed substrings found inside
5854                             the group.  */
5855                         scan_commit(pRExC_state, data, minlenp, is_inf);
5856                         if (mincount && last_str) {
5857                             SV * const sv = data->last_found;
5858                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5859                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5860
5861                             if (mg)
5862                                 mg->mg_len = -1;
5863                             sv_setsv(sv, last_str);
5864                             data->last_end = data->pos_min;
5865                             data->last_start_min = data->pos_min - last_chrs;
5866                             data->last_start_max = is_inf
5867                                 ? OPTIMIZE_INFTY
5868                                 : data->pos_min + data->pos_delta - last_chrs;
5869                         }
5870                         data->cur_is_floating = 1; /* float */
5871                     }
5872                     SvREFCNT_dec(last_str);
5873                 }
5874                 if (data && (fl & SF_HAS_EVAL))
5875                     data->flags |= SF_HAS_EVAL;
5876               optimize_curly_tail:
5877                 rck_elide_nothing(oscan);
5878                 continue;
5879
5880             default:
5881                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5882                                                                     OP(scan));
5883             case REF:
5884             case CLUMP:
5885                 if (flags & SCF_DO_SUBSTR) {
5886                     /* Cannot expect anything... */
5887                     scan_commit(pRExC_state, data, minlenp, is_inf);
5888                     data->cur_is_floating = 1; /* float */
5889                 }
5890                 is_inf = is_inf_internal = 1;
5891                 if (flags & SCF_DO_STCLASS_OR) {
5892                     if (OP(scan) == CLUMP) {
5893                         /* Actually is any start char, but very few code points
5894                          * aren't start characters */
5895                         ssc_match_all_cp(data->start_class);
5896                     }
5897                     else {
5898                         ssc_anything(data->start_class);
5899                     }
5900                 }
5901                 flags &= ~SCF_DO_STCLASS;
5902                 break;
5903             }
5904         }
5905         else if (OP(scan) == LNBREAK) {
5906             if (flags & SCF_DO_STCLASS) {
5907                 if (flags & SCF_DO_STCLASS_AND) {
5908                     ssc_intersection(data->start_class,
5909                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5910                     ssc_clear_locale(data->start_class);
5911                     ANYOF_FLAGS(data->start_class)
5912                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5913                 }
5914                 else if (flags & SCF_DO_STCLASS_OR) {
5915                     ssc_union(data->start_class,
5916                               PL_XPosix_ptrs[_CC_VERTSPACE],
5917                               FALSE);
5918                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5919
5920                     /* See commit msg for
5921                      * 749e076fceedeb708a624933726e7989f2302f6a */
5922                     ANYOF_FLAGS(data->start_class)
5923                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5924                 }
5925                 flags &= ~SCF_DO_STCLASS;
5926             }
5927             min++;
5928             if (delta != OPTIMIZE_INFTY)
5929                 delta++;    /* Because of the 2 char string cr-lf */
5930             if (flags & SCF_DO_SUBSTR) {
5931                 /* Cannot expect anything... */
5932                 scan_commit(pRExC_state, data, minlenp, is_inf);
5933                 data->pos_min += 1;
5934                 if (data->pos_delta != OPTIMIZE_INFTY) {
5935                     data->pos_delta += 1;
5936                 }
5937                 data->cur_is_floating = 1; /* float */
5938             }
5939         }
5940         else if (REGNODE_SIMPLE(OP(scan))) {
5941
5942             if (flags & SCF_DO_SUBSTR) {
5943                 scan_commit(pRExC_state, data, minlenp, is_inf);
5944                 data->pos_min++;
5945             }
5946             min++;
5947             if (flags & SCF_DO_STCLASS) {
5948                 bool invert = 0;
5949                 SV* my_invlist = NULL;
5950                 U8 namedclass;
5951
5952                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5953                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5954
5955                 /* Some of the logic below assumes that switching
5956                    locale on will only add false positives. */
5957                 switch (OP(scan)) {
5958
5959                 default:
5960 #ifdef DEBUGGING
5961                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5962                                                                      OP(scan));
5963 #endif
5964                 case SANY:
5965                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5966                         ssc_match_all_cp(data->start_class);
5967                     break;
5968
5969                 case REG_ANY:
5970                     {
5971                         SV* REG_ANY_invlist = _new_invlist(2);
5972                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5973                                                             '\n');
5974                         if (flags & SCF_DO_STCLASS_OR) {
5975                             ssc_union(data->start_class,
5976                                       REG_ANY_invlist,
5977                                       TRUE /* TRUE => invert, hence all but \n
5978                                             */
5979                                       );
5980                         }
5981                         else if (flags & SCF_DO_STCLASS_AND) {
5982                             ssc_intersection(data->start_class,
5983                                              REG_ANY_invlist,
5984                                              TRUE  /* TRUE => invert */
5985                                              );
5986                             ssc_clear_locale(data->start_class);
5987                         }
5988                         SvREFCNT_dec_NN(REG_ANY_invlist);
5989                     }
5990                     break;
5991
5992                 case ANYOFD:
5993                 case ANYOFL:
5994                 case ANYOFPOSIXL:
5995                 case ANYOFH:
5996                 case ANYOFHb:
5997                 case ANYOFHr:
5998                 case ANYOFHs:
5999                 case ANYOF:
6000                     if (flags & SCF_DO_STCLASS_AND)
6001                         ssc_and(pRExC_state, data->start_class,
6002                                 (regnode_charclass *) scan);
6003                     else
6004                         ssc_or(pRExC_state, data->start_class,
6005                                                           (regnode_charclass *) scan);
6006                     break;
6007
6008                 case NANYOFM: /* NANYOFM already contains the inversion of the
6009                                  input ANYOF data, so, unlike things like
6010                                  NPOSIXA, don't change 'invert' to TRUE */
6011                     /* FALLTHROUGH */
6012                 case ANYOFM:
6013                   {
6014                     SV* cp_list = get_ANYOFM_contents(scan);
6015
6016                     if (flags & SCF_DO_STCLASS_OR) {
6017                         ssc_union(data->start_class, cp_list, invert);
6018                     }
6019                     else if (flags & SCF_DO_STCLASS_AND) {
6020                         ssc_intersection(data->start_class, cp_list, invert);
6021                     }
6022
6023                     SvREFCNT_dec_NN(cp_list);
6024                     break;
6025                   }
6026
6027                 case ANYOFR:
6028                 case ANYOFRb:
6029                   {
6030                     SV* cp_list = NULL;
6031
6032                     cp_list = _add_range_to_invlist(cp_list,
6033                                         ANYOFRbase(scan),
6034                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
6035
6036                     if (flags & SCF_DO_STCLASS_OR) {
6037                         ssc_union(data->start_class, cp_list, invert);
6038                     }
6039                     else if (flags & SCF_DO_STCLASS_AND) {
6040                         ssc_intersection(data->start_class, cp_list, invert);
6041                     }
6042
6043                     SvREFCNT_dec_NN(cp_list);
6044                     break;
6045                   }
6046
6047                 case NPOSIXL:
6048                     invert = 1;
6049                     /* FALLTHROUGH */
6050
6051                 case POSIXL:
6052                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6053                     if (flags & SCF_DO_STCLASS_AND) {
6054                         bool was_there = cBOOL(
6055                                           ANYOF_POSIXL_TEST(data->start_class,
6056                                                                  namedclass));
6057                         ANYOF_POSIXL_ZERO(data->start_class);
6058                         if (was_there) {    /* Do an AND */
6059                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6060                         }
6061                         /* No individual code points can now match */
6062                         data->start_class->invlist
6063                                                 = sv_2mortal(_new_invlist(0));
6064                     }
6065                     else {
6066                         int complement = namedclass + ((invert) ? -1 : 1);
6067
6068                         assert(flags & SCF_DO_STCLASS_OR);
6069
6070                         /* If the complement of this class was already there,
6071                          * the result is that they match all code points,
6072                          * (\d + \D == everything).  Remove the classes from
6073                          * future consideration.  Locale is not relevant in
6074                          * this case */
6075                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6076                             ssc_match_all_cp(data->start_class);
6077                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6078                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
6079                         }
6080                         else {  /* The usual case; just add this class to the
6081                                    existing set */
6082                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6083                         }
6084                     }
6085                     break;
6086
6087                 case NPOSIXA:   /* For these, we always know the exact set of
6088                                    what's matched */
6089                     invert = 1;
6090                     /* FALLTHROUGH */
6091                 case POSIXA:
6092                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6093                     goto join_posix_and_ascii;
6094
6095                 case NPOSIXD:
6096                 case NPOSIXU:
6097                     invert = 1;
6098                     /* FALLTHROUGH */
6099                 case POSIXD:
6100                 case POSIXU:
6101                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6102
6103                     /* NPOSIXD matches all upper Latin1 code points unless the
6104                      * target string being matched is UTF-8, which is
6105                      * unknowable until match time.  Since we are going to
6106                      * invert, we want to get rid of all of them so that the
6107                      * inversion will match all */
6108                     if (OP(scan) == NPOSIXD) {
6109                         _invlist_subtract(my_invlist, PL_UpperLatin1,
6110                                           &my_invlist);
6111                     }
6112
6113                   join_posix_and_ascii:
6114
6115                     if (flags & SCF_DO_STCLASS_AND) {
6116                         ssc_intersection(data->start_class, my_invlist, invert);
6117                         ssc_clear_locale(data->start_class);
6118                     }
6119                     else {
6120                         assert(flags & SCF_DO_STCLASS_OR);
6121                         ssc_union(data->start_class, my_invlist, invert);
6122                     }
6123                     SvREFCNT_dec(my_invlist);
6124                 }
6125                 if (flags & SCF_DO_STCLASS_OR)
6126                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6127                 flags &= ~SCF_DO_STCLASS;
6128             }
6129         }
6130         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6131             data->flags |= (OP(scan) == MEOL
6132                             ? SF_BEFORE_MEOL
6133                             : SF_BEFORE_SEOL);
6134             scan_commit(pRExC_state, data, minlenp, is_inf);
6135
6136         }
6137         else if (  PL_regkind[OP(scan)] == BRANCHJ
6138                  /* Lookbehind, or need to calculate parens/evals/stclass: */
6139                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
6140                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6141         {
6142             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6143                 || OP(scan) == UNLESSM )
6144             {
6145                 /* Negative Lookahead/lookbehind
6146                    In this case we can't do fixed string optimisation.
6147                 */
6148
6149                 SSize_t deltanext, minnext, fake = 0;
6150                 regnode *nscan;
6151                 regnode_ssc intrnl;
6152                 int f = 0;
6153
6154                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6155                 if (data) {
6156                     data_fake.whilem_c = data->whilem_c;
6157                     data_fake.last_closep = data->last_closep;
6158                 }
6159                 else
6160                     data_fake.last_closep = &fake;
6161                 data_fake.pos_delta = delta;
6162                 if ( flags & SCF_DO_STCLASS && !scan->flags
6163                      && OP(scan) == IFMATCH ) { /* Lookahead */
6164                     ssc_init(pRExC_state, &intrnl);
6165                     data_fake.start_class = &intrnl;
6166                     f |= SCF_DO_STCLASS_AND;
6167                 }
6168                 if (flags & SCF_WHILEM_VISITED_POS)
6169                     f |= SCF_WHILEM_VISITED_POS;
6170                 next = regnext(scan);
6171                 nscan = NEXTOPER(NEXTOPER(scan));
6172
6173                 /* recurse study_chunk() for lookahead body */
6174                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6175                                       last, &data_fake, stopparen,
6176                                       recursed_depth, NULL, f, depth+1,
6177                                       mutate_ok);
6178                 if (scan->flags) {
6179                     if (   deltanext < 0
6180                         || deltanext > (I32) U8_MAX
6181                         || minnext > (I32)U8_MAX
6182                         || minnext + deltanext > (I32)U8_MAX)
6183                     {
6184                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6185                               (UV)U8_MAX);
6186                     }
6187
6188                     /* The 'next_off' field has been repurposed to count the
6189                      * additional starting positions to try beyond the initial
6190                      * one.  (This leaves it at 0 for non-variable length
6191                      * matches to avoid breakage for those not using this
6192                      * extension) */
6193                     if (deltanext) {
6194                         scan->next_off = deltanext;
6195                         ckWARNexperimental(RExC_parse,
6196                             WARN_EXPERIMENTAL__VLB,
6197                             "Variable length lookbehind is experimental");
6198                     }
6199                     scan->flags = (U8)minnext + deltanext;
6200                 }
6201                 if (data) {
6202                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6203                         pars++;
6204                     if (data_fake.flags & SF_HAS_EVAL)
6205                         data->flags |= SF_HAS_EVAL;
6206                     data->whilem_c = data_fake.whilem_c;
6207                 }
6208                 if (f & SCF_DO_STCLASS_AND) {
6209                     if (flags & SCF_DO_STCLASS_OR) {
6210                         /* OR before, AND after: ideally we would recurse with
6211                          * data_fake to get the AND applied by study of the
6212                          * remainder of the pattern, and then derecurse;
6213                          * *** HACK *** for now just treat as "no information".
6214                          * See [perl #56690].
6215                          */
6216                         ssc_init(pRExC_state, data->start_class);
6217                     }  else {
6218                         /* AND before and after: combine and continue.  These
6219                          * assertions are zero-length, so can match an EMPTY
6220                          * string */
6221                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6222                         ANYOF_FLAGS(data->start_class)
6223                                                    |= SSC_MATCHES_EMPTY_STRING;
6224                     }
6225                 }
6226             }
6227 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6228             else {
6229                 /* Positive Lookahead/lookbehind
6230                    In this case we can do fixed string optimisation,
6231                    but we must be careful about it. Note in the case of
6232                    lookbehind the positions will be offset by the minimum
6233                    length of the pattern, something we won't know about
6234                    until after the recurse.
6235                 */
6236                 SSize_t deltanext, fake = 0;
6237                 regnode *nscan;
6238                 regnode_ssc intrnl;
6239                 int f = 0;
6240                 /* We use SAVEFREEPV so that when the full compile
6241                     is finished perl will clean up the allocated
6242                     minlens when it's all done. This way we don't
6243                     have to worry about freeing them when we know
6244                     they wont be used, which would be a pain.
6245                  */
6246                 SSize_t *minnextp;
6247                 Newx( minnextp, 1, SSize_t );
6248                 SAVEFREEPV(minnextp);
6249
6250                 if (data) {
6251                     StructCopy(data, &data_fake, scan_data_t);
6252                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6253                         f |= SCF_DO_SUBSTR;
6254                         if (scan->flags)
6255                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6256                         data_fake.last_found=newSVsv(data->last_found);
6257                     }
6258                 }
6259                 else
6260                     data_fake.last_closep = &fake;
6261                 data_fake.flags = 0;
6262                 data_fake.substrs[0].flags = 0;
6263                 data_fake.substrs[1].flags = 0;
6264                 data_fake.pos_delta = delta;
6265                 if (is_inf)
6266                     data_fake.flags |= SF_IS_INF;
6267                 if ( flags & SCF_DO_STCLASS && !scan->flags
6268                      && OP(scan) == IFMATCH ) { /* Lookahead */
6269                     ssc_init(pRExC_state, &intrnl);
6270                     data_fake.start_class = &intrnl;
6271                     f |= SCF_DO_STCLASS_AND;
6272                 }
6273                 if (flags & SCF_WHILEM_VISITED_POS)
6274                     f |= SCF_WHILEM_VISITED_POS;
6275                 next = regnext(scan);
6276                 nscan = NEXTOPER(NEXTOPER(scan));
6277
6278                 /* positive lookahead study_chunk() recursion */
6279                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6280                                         &deltanext, last, &data_fake,
6281                                         stopparen, recursed_depth, NULL,
6282                                         f, depth+1, mutate_ok);
6283                 if (scan->flags) {
6284                     assert(0);  /* This code has never been tested since this
6285                                    is normally not compiled */
6286                     if (   deltanext < 0
6287                         || deltanext > (I32) U8_MAX
6288                         || *minnextp > (I32)U8_MAX
6289                         || *minnextp + deltanext > (I32)U8_MAX)
6290                     {
6291                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6292                               (UV)U8_MAX);
6293                     }
6294
6295                     if (deltanext) {
6296                         scan->next_off = deltanext;
6297                     }
6298                     scan->flags = (U8)*minnextp + deltanext;
6299                 }
6300
6301                 *minnextp += min;
6302
6303                 if (f & SCF_DO_STCLASS_AND) {
6304                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6305                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6306                 }
6307                 if (data) {
6308                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6309                         pars++;
6310                     if (data_fake.flags & SF_HAS_EVAL)
6311                         data->flags |= SF_HAS_EVAL;
6312                     data->whilem_c = data_fake.whilem_c;
6313                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6314                         int i;
6315                         if (RExC_rx->minlen<*minnextp)
6316                             RExC_rx->minlen=*minnextp;
6317                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6318                         SvREFCNT_dec_NN(data_fake.last_found);
6319
6320                         for (i = 0; i < 2; i++) {
6321                             if (data_fake.substrs[i].minlenp != minlenp) {
6322                                 data->substrs[i].min_offset =
6323                                             data_fake.substrs[i].min_offset;
6324                                 data->substrs[i].max_offset =
6325                                             data_fake.substrs[i].max_offset;
6326                                 data->substrs[i].minlenp =
6327                                             data_fake.substrs[i].minlenp;
6328                                 data->substrs[i].lookbehind += scan->flags;
6329                             }
6330                         }
6331                     }
6332                 }
6333             }
6334 #endif
6335         }
6336         else if (OP(scan) == OPEN) {
6337             if (stopparen != (I32)ARG(scan))
6338                 pars++;
6339         }
6340         else if (OP(scan) == CLOSE) {
6341             if (stopparen == (I32)ARG(scan)) {
6342                 break;
6343             }
6344             if ((I32)ARG(scan) == is_par) {
6345                 next = regnext(scan);
6346
6347                 if ( next && (OP(next) != WHILEM) && next < last)
6348                     is_par = 0;         /* Disable optimization */
6349             }
6350             if (data)
6351                 *(data->last_closep) = ARG(scan);
6352         }
6353         else if (OP(scan) == EVAL) {
6354                 if (data)
6355                     data->flags |= SF_HAS_EVAL;
6356         }
6357         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6358             if (flags & SCF_DO_SUBSTR) {
6359                 scan_commit(pRExC_state, data, minlenp, is_inf);
6360                 flags &= ~SCF_DO_SUBSTR;
6361             }
6362             if (data && OP(scan)==ACCEPT) {
6363                 data->flags |= SCF_SEEN_ACCEPT;
6364                 if (stopmin > min)
6365                     stopmin = min;
6366             }
6367         }
6368         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6369         {
6370                 if (flags & SCF_DO_SUBSTR) {
6371                     scan_commit(pRExC_state, data, minlenp, is_inf);
6372                     data->cur_is_floating = 1; /* float */
6373                 }
6374                 is_inf = is_inf_internal = 1;
6375                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6376                     ssc_anything(data->start_class);
6377                 flags &= ~SCF_DO_STCLASS;
6378         }
6379         else if (OP(scan) == GPOS) {
6380             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6381                 !(delta || is_inf || (data && data->pos_delta)))
6382             {
6383                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6384                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6385                 if (RExC_rx->gofs < (STRLEN)min)
6386                     RExC_rx->gofs = min;
6387             } else {
6388                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6389                 RExC_rx->gofs = 0;
6390             }
6391         }
6392 #ifdef TRIE_STUDY_OPT
6393 #ifdef FULL_TRIE_STUDY
6394         else if (PL_regkind[OP(scan)] == TRIE) {
6395             /* NOTE - There is similar code to this block above for handling
6396                BRANCH nodes on the initial study.  If you change stuff here
6397                check there too. */
6398             regnode *trie_node= scan;
6399             regnode *tail= regnext(scan);
6400             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6401             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6402             regnode_ssc accum;
6403
6404             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6405                 /* Cannot merge strings after this. */
6406                 scan_commit(pRExC_state, data, minlenp, is_inf);
6407             }
6408             if (flags & SCF_DO_STCLASS)
6409                 ssc_init_zero(pRExC_state, &accum);
6410
6411             if (!trie->jump) {
6412                 min1= trie->minlen;
6413                 max1= trie->maxlen;
6414             } else {
6415                 const regnode *nextbranch= NULL;
6416                 U32 word;
6417
6418                 for ( word=1 ; word <= trie->wordcount ; word++)
6419                 {
6420                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6421                     regnode_ssc this_class;
6422
6423                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6424                     if (data) {
6425                         data_fake.whilem_c = data->whilem_c;
6426                         data_fake.last_closep = data->last_closep;
6427                     }
6428                     else
6429                         data_fake.last_closep = &fake;
6430                     data_fake.pos_delta = delta;
6431                     if (flags & SCF_DO_STCLASS) {
6432                         ssc_init(pRExC_state, &this_class);
6433                         data_fake.start_class = &this_class;
6434                         f = SCF_DO_STCLASS_AND;
6435                     }
6436                     if (flags & SCF_WHILEM_VISITED_POS)
6437                         f |= SCF_WHILEM_VISITED_POS;
6438
6439                     if (trie->jump[word]) {
6440                         if (!nextbranch)
6441                             nextbranch = trie_node + trie->jump[0];
6442                         scan= trie_node + trie->jump[word];
6443                         /* We go from the jump point to the branch that follows
6444                            it. Note this means we need the vestigal unused
6445                            branches even though they arent otherwise used. */
6446                         /* optimise study_chunk() for TRIE */
6447                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6448                             &deltanext, (regnode *)nextbranch, &data_fake,
6449                             stopparen, recursed_depth, NULL, f, depth+1,
6450                             mutate_ok);
6451                     }
6452                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6453                         nextbranch= regnext((regnode*)nextbranch);
6454
6455                     if (min1 > (SSize_t)(minnext + trie->minlen))
6456                         min1 = minnext + trie->minlen;
6457                     if (deltanext == OPTIMIZE_INFTY) {
6458                         is_inf = is_inf_internal = 1;
6459                         max1 = OPTIMIZE_INFTY;
6460                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6461                         max1 = minnext + deltanext + trie->maxlen;
6462
6463                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6464                         pars++;
6465                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6466                         if ( stopmin > min + min1)
6467                             stopmin = min + min1;
6468                         flags &= ~SCF_DO_SUBSTR;
6469                         if (data)
6470                             data->flags |= SCF_SEEN_ACCEPT;
6471                     }
6472                     if (data) {
6473                         if (data_fake.flags & SF_HAS_EVAL)
6474                             data->flags |= SF_HAS_EVAL;
6475                         data->whilem_c = data_fake.whilem_c;
6476                     }
6477                     if (flags & SCF_DO_STCLASS)
6478                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6479                 }
6480             }
6481             if (flags & SCF_DO_SUBSTR) {
6482                 data->pos_min += min1;
6483                 data->pos_delta += max1 - min1;
6484                 if (max1 != min1 || is_inf)
6485                     data->cur_is_floating = 1; /* float */
6486             }
6487             min += min1;
6488             if (delta != OPTIMIZE_INFTY) {
6489                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6490                     delta += max1 - min1;
6491                 else
6492                     delta = OPTIMIZE_INFTY;
6493             }
6494             if (flags & SCF_DO_STCLASS_OR) {
6495                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6496                 if (min1) {
6497                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6498                     flags &= ~SCF_DO_STCLASS;
6499                 }
6500             }
6501             else if (flags & SCF_DO_STCLASS_AND) {
6502                 if (min1) {
6503                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6504                     flags &= ~SCF_DO_STCLASS;
6505                 }
6506                 else {
6507                     /* Switch to OR mode: cache the old value of
6508                      * data->start_class */
6509                     INIT_AND_WITHP;
6510                     StructCopy(data->start_class, and_withp, regnode_ssc);
6511                     flags &= ~SCF_DO_STCLASS_AND;
6512                     StructCopy(&accum, data->start_class, regnode_ssc);
6513                     flags |= SCF_DO_STCLASS_OR;
6514                 }
6515             }
6516             scan= tail;
6517             continue;
6518         }
6519 #else
6520         else if (PL_regkind[OP(scan)] == TRIE) {
6521             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6522             U8*bang=NULL;
6523
6524             min += trie->minlen;
6525             delta += (trie->maxlen - trie->minlen);
6526             flags &= ~SCF_DO_STCLASS; /* xxx */
6527             if (flags & SCF_DO_SUBSTR) {
6528                 /* Cannot expect anything... */
6529                 scan_commit(pRExC_state, data, minlenp, is_inf);
6530                 data->pos_min += trie->minlen;
6531                 data->pos_delta += (trie->maxlen - trie->minlen);
6532                 if (trie->maxlen != trie->minlen)
6533                     data->cur_is_floating = 1; /* float */
6534             }
6535             if (trie->jump) /* no more substrings -- for now /grr*/
6536                flags &= ~SCF_DO_SUBSTR;
6537         }
6538         else if (OP(scan) == REGEX_SET) {
6539             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6540                              " before optimization", reg_name[REGEX_SET]);
6541         }
6542
6543 #endif /* old or new */
6544 #endif /* TRIE_STUDY_OPT */
6545
6546         /* Else: zero-length, ignore. */
6547         scan = regnext(scan);
6548     }
6549
6550   finish:
6551     if (frame) {
6552         /* we need to unwind recursion. */
6553         depth = depth - 1;
6554
6555         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6556         DEBUG_PEEP("fend", scan, depth, flags);
6557
6558         /* restore previous context */
6559         last = frame->last_regnode;
6560         scan = frame->next_regnode;
6561         stopparen = frame->stopparen;
6562         recursed_depth = frame->prev_recursed_depth;
6563
6564         RExC_frame_last = frame->prev_frame;
6565         frame = frame->this_prev_frame;
6566         goto fake_study_recurse;
6567     }
6568
6569     assert(!frame);
6570     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6571
6572     *scanp = scan;
6573     *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6574
6575     if (flags & SCF_DO_SUBSTR && is_inf)
6576         data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6577     if (is_par > (I32)U8_MAX)
6578         is_par = 0;
6579     if (is_par && pars==1 && data) {
6580         data->flags |= SF_IN_PAR;
6581         data->flags &= ~SF_HAS_PAR;
6582     }
6583     else if (pars && data) {
6584         data->flags |= SF_HAS_PAR;
6585         data->flags &= ~SF_IN_PAR;
6586     }
6587     if (flags & SCF_DO_STCLASS_OR)
6588         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6589     if (flags & SCF_TRIE_RESTUDY)
6590         data->flags |=  SCF_TRIE_RESTUDY;
6591
6592     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6593
6594     final_minlen = min < stopmin
6595             ? min : stopmin;
6596
6597     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6598         if (final_minlen > OPTIMIZE_INFTY - delta)
6599             RExC_maxlen = OPTIMIZE_INFTY;
6600         else if (RExC_maxlen < final_minlen + delta)
6601             RExC_maxlen = final_minlen + delta;
6602     }
6603     return final_minlen;
6604 }
6605
6606 STATIC U32
6607 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6608 {
6609     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6610
6611     PERL_ARGS_ASSERT_ADD_DATA;
6612
6613     Renewc(RExC_rxi->data,
6614            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6615            char, struct reg_data);
6616     if(count)
6617         Renew(RExC_rxi->data->what, count + n, U8);
6618     else
6619         Newx(RExC_rxi->data->what, n, U8);
6620     RExC_rxi->data->count = count + n;
6621     Copy(s, RExC_rxi->data->what + count, n, U8);
6622     return count;
6623 }
6624
6625 /*XXX: todo make this not included in a non debugging perl, but appears to be
6626  * used anyway there, in 'use re' */
6627 #ifndef PERL_IN_XSUB_RE
6628 void
6629 Perl_reginitcolors(pTHX)
6630 {
6631     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6632     if (s) {
6633         char *t = savepv(s);
6634         int i = 0;
6635         PL_colors[0] = t;
6636         while (++i < 6) {
6637             t = strchr(t, '\t');
6638             if (t) {
6639                 *t = '\0';
6640                 PL_colors[i] = ++t;
6641             }
6642             else
6643                 PL_colors[i] = t = (char *)"";
6644         }
6645     } else {
6646         int i = 0;
6647         while (i < 6)
6648             PL_colors[i++] = (char *)"";
6649     }
6650     PL_colorset = 1;
6651 }
6652 #endif
6653
6654
6655 #ifdef TRIE_STUDY_OPT
6656 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6657     STMT_START {                                            \
6658         if (                                                \
6659               (data.flags & SCF_TRIE_RESTUDY)               \
6660               && ! restudied++                              \
6661         ) {                                                 \
6662             dOsomething;                                    \
6663             goto reStudy;                                   \
6664         }                                                   \
6665     } STMT_END
6666 #else
6667 #define CHECK_RESTUDY_GOTO_butfirst
6668 #endif
6669
6670 /*
6671  * pregcomp - compile a regular expression into internal code
6672  *
6673  * Decides which engine's compiler to call based on the hint currently in
6674  * scope
6675  */
6676
6677 #ifndef PERL_IN_XSUB_RE
6678
6679 /* return the currently in-scope regex engine (or the default if none)  */
6680
6681 regexp_engine const *
6682 Perl_current_re_engine(pTHX)
6683 {
6684     if (IN_PERL_COMPILETIME) {
6685         HV * const table = GvHV(PL_hintgv);
6686         SV **ptr;
6687
6688         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6689             return &PL_core_reg_engine;
6690         ptr = hv_fetchs(table, "regcomp", FALSE);
6691         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6692             return &PL_core_reg_engine;
6693         return INT2PTR(regexp_engine*, SvIV(*ptr));
6694     }
6695     else {
6696         SV *ptr;
6697         if (!PL_curcop->cop_hints_hash)
6698             return &PL_core_reg_engine;
6699         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6700         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6701             return &PL_core_reg_engine;
6702         return INT2PTR(regexp_engine*, SvIV(ptr));
6703     }
6704 }
6705
6706
6707 REGEXP *
6708 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6709 {
6710     regexp_engine const *eng = current_re_engine();
6711     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6712
6713     PERL_ARGS_ASSERT_PREGCOMP;
6714
6715     /* Dispatch a request to compile a regexp to correct regexp engine. */
6716     DEBUG_COMPILE_r({
6717         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6718                         PTR2UV(eng));
6719     });
6720     return CALLREGCOMP_ENG(eng, pattern, flags);
6721 }
6722 #endif
6723
6724 /* public(ish) entry point for the perl core's own regex compiling code.
6725  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6726  * pattern rather than a list of OPs, and uses the internal engine rather
6727  * than the current one */
6728
6729 REGEXP *
6730 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6731 {
6732     SV *pat = pattern; /* defeat constness! */
6733
6734     PERL_ARGS_ASSERT_RE_COMPILE;
6735
6736     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6737 #ifdef PERL_IN_XSUB_RE
6738                                 &my_reg_engine,
6739 #else
6740                                 &PL_core_reg_engine,
6741 #endif
6742                                 NULL, NULL, rx_flags, 0);
6743 }
6744
6745 static void
6746 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6747 {
6748     int n;
6749
6750     if (--cbs->refcnt > 0)
6751         return;
6752     for (n = 0; n < cbs->count; n++) {
6753         REGEXP *rx = cbs->cb[n].src_regex;
6754         if (rx) {
6755             cbs->cb[n].src_regex = NULL;
6756             SvREFCNT_dec_NN(rx);
6757         }
6758     }
6759     Safefree(cbs->cb);
6760     Safefree(cbs);
6761 }
6762
6763
6764 static struct reg_code_blocks *
6765 S_alloc_code_blocks(pTHX_  int ncode)
6766 {
6767      struct reg_code_blocks *cbs;
6768     Newx(cbs, 1, struct reg_code_blocks);
6769     cbs->count = ncode;
6770     cbs->refcnt = 1;
6771     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6772     if (ncode)
6773         Newx(cbs->cb, ncode, struct reg_code_block);
6774     else
6775         cbs->cb = NULL;
6776     return cbs;
6777 }
6778
6779
6780 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6781  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6782  * point to the realloced string and length.
6783  *
6784  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6785  * stuff added */
6786
6787 static void
6788 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6789                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6790 {
6791     U8 *const src = (U8*)*pat_p;
6792     U8 *dst, *d;
6793     int n=0;
6794     STRLEN s = 0;
6795     bool do_end = 0;
6796     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6797
6798     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6799         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6800
6801     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6802     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6803     d = dst;
6804
6805     while (s < *plen_p) {
6806         append_utf8_from_native_byte(src[s], &d);
6807
6808         if (n < num_code_blocks) {
6809             assert(pRExC_state->code_blocks);
6810             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6811                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6812                 assert(*(d - 1) == '(');
6813                 do_end = 1;
6814             }
6815             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6816                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6817                 assert(*(d - 1) == ')');
6818                 do_end = 0;
6819                 n++;
6820             }
6821         }
6822         s++;
6823     }
6824     *d = '\0';
6825     *plen_p = d - dst;
6826     *pat_p = (char*) dst;
6827     SAVEFREEPV(*pat_p);
6828     RExC_orig_utf8 = RExC_utf8 = 1;
6829 }
6830
6831
6832
6833 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6834  * while recording any code block indices, and handling overloading,
6835  * nested qr// objects etc.  If pat is null, it will allocate a new
6836  * string, or just return the first arg, if there's only one.
6837  *
6838  * Returns the malloced/updated pat.
6839  * patternp and pat_count is the array of SVs to be concatted;
6840  * oplist is the optional list of ops that generated the SVs;
6841  * recompile_p is a pointer to a boolean that will be set if
6842  *   the regex will need to be recompiled.
6843  * delim, if non-null is an SV that will be inserted between each element
6844  */
6845
6846 static SV*
6847 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6848                 SV *pat, SV ** const patternp, int pat_count,
6849                 OP *oplist, bool *recompile_p, SV *delim)
6850 {
6851     SV **svp;
6852     int n = 0;
6853     bool use_delim = FALSE;
6854     bool alloced = FALSE;
6855
6856     /* if we know we have at least two args, create an empty string,
6857      * then concatenate args to that. For no args, return an empty string */
6858     if (!pat && pat_count != 1) {
6859         pat = newSVpvs("");
6860         SAVEFREESV(pat);
6861         alloced = TRUE;
6862     }
6863
6864     for (svp = patternp; svp < patternp + pat_count; svp++) {
6865         SV *sv;
6866         SV *rx  = NULL;
6867         STRLEN orig_patlen = 0;
6868         bool code = 0;
6869         SV *msv = use_delim ? delim : *svp;
6870         if (!msv) msv = &PL_sv_undef;
6871
6872         /* if we've got a delimiter, we go round the loop twice for each
6873          * svp slot (except the last), using the delimiter the second
6874          * time round */
6875         if (use_delim) {
6876             svp--;
6877             use_delim = FALSE;
6878         }
6879         else if (delim)
6880             use_delim = TRUE;
6881
6882         if (SvTYPE(msv) == SVt_PVAV) {
6883             /* we've encountered an interpolated array within
6884              * the pattern, e.g. /...@a..../. Expand the list of elements,
6885              * then recursively append elements.
6886              * The code in this block is based on S_pushav() */
6887
6888             AV *const av = (AV*)msv;
6889             const SSize_t maxarg = AvFILL(av) + 1;
6890             SV **array;
6891
6892             if (oplist) {
6893                 assert(oplist->op_type == OP_PADAV
6894                     || oplist->op_type == OP_RV2AV);
6895                 oplist = OpSIBLING(oplist);
6896             }
6897
6898             if (SvRMAGICAL(av)) {
6899                 SSize_t i;
6900
6901                 Newx(array, maxarg, SV*);
6902                 SAVEFREEPV(array);
6903                 for (i=0; i < maxarg; i++) {
6904                     SV ** const svp = av_fetch(av, i, FALSE);
6905                     array[i] = svp ? *svp : &PL_sv_undef;
6906                 }
6907             }
6908             else
6909                 array = AvARRAY(av);
6910
6911             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6912                                 array, maxarg, NULL, recompile_p,
6913                                 /* $" */
6914                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6915
6916             continue;
6917         }
6918
6919
6920         /* we make the assumption here that each op in the list of
6921          * op_siblings maps to one SV pushed onto the stack,
6922          * except for code blocks, with have both an OP_NULL and
6923          * an OP_CONST.
6924          * This allows us to match up the list of SVs against the
6925          * list of OPs to find the next code block.
6926          *
6927          * Note that       PUSHMARK PADSV PADSV ..
6928          * is optimised to
6929          *                 PADRANGE PADSV  PADSV  ..
6930          * so the alignment still works. */
6931
6932         if (oplist) {
6933             if (oplist->op_type == OP_NULL
6934                 && (oplist->op_flags & OPf_SPECIAL))
6935             {
6936                 assert(n < pRExC_state->code_blocks->count);
6937                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6938                 pRExC_state->code_blocks->cb[n].block = oplist;
6939                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6940                 n++;
6941                 code = 1;
6942                 oplist = OpSIBLING(oplist); /* skip CONST */
6943                 assert(oplist);
6944             }
6945             oplist = OpSIBLING(oplist);;
6946         }
6947
6948         /* apply magic and QR overloading to arg */
6949
6950         SvGETMAGIC(msv);
6951         if (SvROK(msv) && SvAMAGIC(msv)) {
6952             SV *sv = AMG_CALLunary(msv, regexp_amg);
6953             if (sv) {
6954                 if (SvROK(sv))
6955                     sv = SvRV(sv);
6956                 if (SvTYPE(sv) != SVt_REGEXP)
6957                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6958                 msv = sv;
6959             }
6960         }
6961
6962         /* try concatenation overload ... */
6963         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6964                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6965         {
6966             sv_setsv(pat, sv);
6967             /* overloading involved: all bets are off over literal
6968              * code. Pretend we haven't seen it */
6969             if (n)
6970                 pRExC_state->code_blocks->count -= n;
6971             n = 0;
6972         }
6973         else {
6974             /* ... or failing that, try "" overload */
6975             while (SvAMAGIC(msv)
6976                     && (sv = AMG_CALLunary(msv, string_amg))
6977                     && sv != msv
6978                     &&  !(   SvROK(msv)
6979                           && SvROK(sv)
6980                           && SvRV(msv) == SvRV(sv))
6981             ) {
6982                 msv = sv;
6983                 SvGETMAGIC(msv);
6984             }
6985             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6986                 msv = SvRV(msv);
6987
6988             if (pat) {
6989                 /* this is a partially unrolled
6990                  *     sv_catsv_nomg(pat, msv);
6991                  * that allows us to adjust code block indices if
6992                  * needed */
6993                 STRLEN dlen;
6994                 char *dst = SvPV_force_nomg(pat, dlen);
6995                 orig_patlen = dlen;
6996                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6997                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6998                     sv_setpvn(pat, dst, dlen);
6999                     SvUTF8_on(pat);
7000                 }
7001                 sv_catsv_nomg(pat, msv);
7002                 rx = msv;
7003             }
7004             else {
7005                 /* We have only one SV to process, but we need to verify
7006                  * it is properly null terminated or we will fail asserts
7007                  * later. In theory we probably shouldn't get such SV's,
7008                  * but if we do we should handle it gracefully. */
7009                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7010                     /* not a string, or a string with a trailing null */
7011                     pat = msv;
7012                 } else {
7013                     /* a string with no trailing null, we need to copy it
7014                      * so it has a trailing null */
7015                     pat = sv_2mortal(newSVsv(msv));
7016                 }
7017             }
7018
7019             if (code)
7020                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7021         }
7022
7023         /* extract any code blocks within any embedded qr//'s */
7024         if (rx && SvTYPE(rx) == SVt_REGEXP
7025             && RX_ENGINE((REGEXP*)rx)->op_comp)
7026         {
7027
7028             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7029             if (ri->code_blocks && ri->code_blocks->count) {
7030                 int i;
7031                 /* the presence of an embedded qr// with code means
7032                  * we should always recompile: the text of the
7033                  * qr// may not have changed, but it may be a
7034                  * different closure than last time */
7035                 *recompile_p = 1;
7036                 if (pRExC_state->code_blocks) {
7037                     int new_count = pRExC_state->code_blocks->count
7038                             + ri->code_blocks->count;
7039                     Renew(pRExC_state->code_blocks->cb,
7040                             new_count, struct reg_code_block);
7041                     pRExC_state->code_blocks->count = new_count;
7042                 }
7043                 else
7044                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7045                                                     ri->code_blocks->count);
7046
7047                 for (i=0; i < ri->code_blocks->count; i++) {
7048                     struct reg_code_block *src, *dst;
7049                     STRLEN offset =  orig_patlen
7050                         + ReANY((REGEXP *)rx)->pre_prefix;
7051                     assert(n < pRExC_state->code_blocks->count);
7052                     src = &ri->code_blocks->cb[i];
7053                     dst = &pRExC_state->code_blocks->cb[n];
7054                     dst->start      = src->start + offset;
7055                     dst->end        = src->end   + offset;
7056                     dst->block      = src->block;
7057                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
7058                                             src->src_regex
7059                                                 ? src->src_regex
7060                                                 : (REGEXP*)rx);
7061                     n++;
7062                 }
7063             }
7064         }
7065     }
7066     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7067     if (alloced)
7068         SvSETMAGIC(pat);
7069
7070     return pat;
7071 }
7072
7073
7074
7075 /* see if there are any run-time code blocks in the pattern.
7076  * False positives are allowed */
7077
7078 static bool
7079 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7080                     char *pat, STRLEN plen)
7081 {
7082     int n = 0;
7083     STRLEN s;
7084
7085     PERL_UNUSED_CONTEXT;
7086
7087     for (s = 0; s < plen; s++) {
7088         if (   pRExC_state->code_blocks
7089             && n < pRExC_state->code_blocks->count
7090             && s == pRExC_state->code_blocks->cb[n].start)
7091         {
7092             s = pRExC_state->code_blocks->cb[n].end;
7093             n++;
7094             continue;
7095         }
7096         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7097          * positives here */
7098         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7099             (pat[s+2] == '{'
7100                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7101         )
7102             return 1;
7103     }
7104     return 0;
7105 }
7106
7107 /* Handle run-time code blocks. We will already have compiled any direct
7108  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7109  * copy of it, but with any literal code blocks blanked out and
7110  * appropriate chars escaped; then feed it into
7111  *
7112  *    eval "qr'modified_pattern'"
7113  *
7114  * For example,
7115  *
7116  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7117  *
7118  * becomes
7119  *
7120  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7121  *
7122  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7123  * and merge them with any code blocks of the original regexp.
7124  *
7125  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7126  * instead, just save the qr and return FALSE; this tells our caller that
7127  * the original pattern needs upgrading to utf8.
7128  */
7129
7130 static bool
7131 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7132     char *pat, STRLEN plen)
7133 {
7134     SV *qr;
7135
7136     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7137
7138     if (pRExC_state->runtime_code_qr) {
7139         /* this is the second time we've been called; this should
7140          * only happen if the main pattern got upgraded to utf8
7141          * during compilation; re-use the qr we compiled first time
7142          * round (which should be utf8 too)
7143          */
7144         qr = pRExC_state->runtime_code_qr;
7145         pRExC_state->runtime_code_qr = NULL;
7146         assert(RExC_utf8 && SvUTF8(qr));
7147     }
7148     else {
7149         int n = 0;
7150         STRLEN s;
7151         char *p, *newpat;
7152         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7153         SV *sv, *qr_ref;
7154         dSP;
7155
7156         /* determine how many extra chars we need for ' and \ escaping */
7157         for (s = 0; s < plen; s++) {
7158             if (pat[s] == '\'' || pat[s] == '\\')
7159                 newlen++;
7160         }
7161
7162         Newx(newpat, newlen, char);
7163         p = newpat;
7164         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7165
7166         for (s = 0; s < plen; s++) {
7167             if (   pRExC_state->code_blocks
7168                 && n < pRExC_state->code_blocks->count
7169                 && s == pRExC_state->code_blocks->cb[n].start)
7170             {
7171                 /* blank out literal code block so that they aren't
7172                  * recompiled: eg change from/to:
7173                  *     /(?{xyz})/
7174                  *     /(?=====)/
7175                  * and
7176                  *     /(??{xyz})/
7177                  *     /(?======)/
7178                  * and
7179                  *     /(?(?{xyz}))/
7180                  *     /(?(?=====))/
7181                 */
7182                 assert(pat[s]   == '(');
7183                 assert(pat[s+1] == '?');
7184                 *p++ = '(';
7185                 *p++ = '?';
7186                 s += 2;
7187                 while (s < pRExC_state->code_blocks->cb[n].end) {
7188                     *p++ = '=';
7189                     s++;
7190                 }
7191                 *p++ = ')';
7192                 n++;
7193                 continue;
7194             }
7195             if (pat[s] == '\'' || pat[s] == '\\')
7196                 *p++ = '\\';
7197             *p++ = pat[s];
7198         }
7199         *p++ = '\'';
7200         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7201             *p++ = 'x';
7202             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7203                 *p++ = 'x';
7204             }
7205         }
7206         *p++ = '\0';
7207         DEBUG_COMPILE_r({
7208             Perl_re_printf( aTHX_
7209                 "%sre-parsing pattern for runtime code:%s %s\n",
7210                 PL_colors[4], PL_colors[5], newpat);
7211         });
7212
7213         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7214         Safefree(newpat);
7215
7216         ENTER;
7217         SAVETMPS;
7218         save_re_context();
7219         PUSHSTACKi(PERLSI_REQUIRE);
7220         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7221          * parsing qr''; normally only q'' does this. It also alters
7222          * hints handling */
7223         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7224         SvREFCNT_dec_NN(sv);
7225         SPAGAIN;
7226         qr_ref = POPs;
7227         PUTBACK;
7228         {
7229             SV * const errsv = ERRSV;
7230             if (SvTRUE_NN(errsv))
7231                 /* use croak_sv ? */
7232                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7233         }
7234         assert(SvROK(qr_ref));
7235         qr = SvRV(qr_ref);
7236         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7237         /* the leaving below frees the tmp qr_ref.
7238          * Give qr a life of its own */
7239         SvREFCNT_inc(qr);
7240         POPSTACK;
7241         FREETMPS;
7242         LEAVE;
7243
7244     }
7245
7246     if (!RExC_utf8 && SvUTF8(qr)) {
7247         /* first time through; the pattern got upgraded; save the
7248          * qr for the next time through */
7249         assert(!pRExC_state->runtime_code_qr);
7250         pRExC_state->runtime_code_qr = qr;
7251         return 0;
7252     }
7253
7254
7255     /* extract any code blocks within the returned qr//  */
7256
7257
7258     /* merge the main (r1) and run-time (r2) code blocks into one */
7259     {
7260         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7261         struct reg_code_block *new_block, *dst;
7262         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7263         int i1 = 0, i2 = 0;
7264         int r1c, r2c;
7265
7266         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7267         {
7268             SvREFCNT_dec_NN(qr);
7269             return 1;
7270         }
7271
7272         if (!r1->code_blocks)
7273             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7274
7275         r1c = r1->code_blocks->count;
7276         r2c = r2->code_blocks->count;
7277
7278         Newx(new_block, r1c + r2c, struct reg_code_block);
7279
7280         dst = new_block;
7281
7282         while (i1 < r1c || i2 < r2c) {
7283             struct reg_code_block *src;
7284             bool is_qr = 0;
7285
7286             if (i1 == r1c) {
7287                 src = &r2->code_blocks->cb[i2++];
7288                 is_qr = 1;
7289             }
7290             else if (i2 == r2c)
7291                 src = &r1->code_blocks->cb[i1++];
7292             else if (  r1->code_blocks->cb[i1].start
7293                      < r2->code_blocks->cb[i2].start)
7294             {
7295                 src = &r1->code_blocks->cb[i1++];
7296                 assert(src->end < r2->code_blocks->cb[i2].start);
7297             }
7298             else {
7299                 assert(  r1->code_blocks->cb[i1].start
7300                        > r2->code_blocks->cb[i2].start);
7301                 src = &r2->code_blocks->cb[i2++];
7302                 is_qr = 1;
7303                 assert(src->end < r1->code_blocks->cb[i1].start);
7304             }
7305
7306             assert(pat[src->start] == '(');
7307             assert(pat[src->end]   == ')');
7308             dst->start      = src->start;
7309             dst->end        = src->end;
7310             dst->block      = src->block;
7311             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7312                                     : src->src_regex;
7313             dst++;
7314         }
7315         r1->code_blocks->count += r2c;
7316         Safefree(r1->code_blocks->cb);
7317         r1->code_blocks->cb = new_block;
7318     }
7319
7320     SvREFCNT_dec_NN(qr);
7321     return 1;
7322 }
7323
7324
7325 STATIC bool
7326 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7327                       struct reg_substr_datum  *rsd,
7328                       struct scan_data_substrs *sub,
7329                       STRLEN longest_length)
7330 {
7331     /* This is the common code for setting up the floating and fixed length
7332      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7333      * as to whether succeeded or not */
7334
7335     I32 t;
7336     SSize_t ml;
7337     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7338     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7339
7340     if (! (longest_length
7341            || (eol /* Can't have SEOL and MULTI */
7342                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7343           )
7344             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7345         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7346     {
7347         return FALSE;
7348     }
7349
7350     /* copy the information about the longest from the reg_scan_data
7351         over to the program. */
7352     if (SvUTF8(sub->str)) {
7353         rsd->substr      = NULL;
7354         rsd->utf8_substr = sub->str;
7355     } else {
7356         rsd->substr      = sub->str;
7357         rsd->utf8_substr = NULL;
7358     }
7359     /* end_shift is how many chars that must be matched that
7360         follow this item. We calculate it ahead of time as once the
7361         lookbehind offset is added in we lose the ability to correctly
7362         calculate it.*/
7363     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7364     rsd->end_shift = ml - sub->min_offset
7365         - longest_length
7366             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7367              * intead? - DAPM
7368             + (SvTAIL(sub->str) != 0)
7369             */
7370         + sub->lookbehind;
7371
7372     t = (eol/* Can't have SEOL and MULTI */
7373          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7374     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7375
7376     return TRUE;
7377 }
7378
7379 STATIC void
7380 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7381 {
7382     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7383      * properly wrapped with the right modifiers */
7384
7385     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7386     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7387                                                 != REGEX_DEPENDS_CHARSET);
7388
7389     /* The caret is output if there are any defaults: if not all the STD
7390         * flags are set, or if no character set specifier is needed */
7391     bool has_default =
7392                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7393                 || ! has_charset);
7394     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7395                                                 == REG_RUN_ON_COMMENT_SEEN);
7396     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7397                         >> RXf_PMf_STD_PMMOD_SHIFT);
7398     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7399     char *p;
7400     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7401
7402     /* We output all the necessary flags; we never output a minus, as all
7403         * those are defaults, so are
7404         * covered by the caret */
7405     const STRLEN wraplen = pat_len + has_p + has_runon
7406         + has_default       /* If needs a caret */
7407         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7408
7409             /* If needs a character set specifier */
7410         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7411         + (sizeof("(?:)") - 1);
7412
7413     PERL_ARGS_ASSERT_SET_REGEX_PV;
7414
7415     /* make sure PL_bitcount bounds not exceeded */
7416     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7417
7418     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7419     SvPOK_on(Rx);
7420     if (RExC_utf8)
7421         SvFLAGS(Rx) |= SVf_UTF8;
7422     *p++='('; *p++='?';
7423
7424     /* If a default, cover it using the caret */
7425     if (has_default) {
7426         *p++= DEFAULT_PAT_MOD;
7427     }
7428     if (has_charset) {
7429         STRLEN len;
7430         const char* name;
7431
7432         name = get_regex_charset_name(RExC_rx->extflags, &len);
7433         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7434             assert(RExC_utf8);
7435             name = UNICODE_PAT_MODS;
7436             len = sizeof(UNICODE_PAT_MODS) - 1;
7437         }
7438         Copy(name, p, len, char);
7439         p += len;
7440     }
7441     if (has_p)
7442         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7443     {
7444         char ch;
7445         while((ch = *fptr++)) {
7446             if(reganch & 1)
7447                 *p++ = ch;
7448             reganch >>= 1;
7449         }
7450     }
7451
7452     *p++ = ':';
7453     Copy(RExC_precomp, p, pat_len, char);
7454     assert ((RX_WRAPPED(Rx) - p) < 16);
7455     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7456     p += pat_len;
7457
7458     /* Adding a trailing \n causes this to compile properly:
7459             my $R = qr / A B C # D E/x; /($R)/
7460         Otherwise the parens are considered part of the comment */
7461     if (has_runon)
7462         *p++ = '\n';
7463     *p++ = ')';
7464     *p = 0;
7465     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7466 }
7467
7468 /*
7469  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7470  * regular expression into internal code.
7471  * The pattern may be passed either as:
7472  *    a list of SVs (patternp plus pat_count)
7473  *    a list of OPs (expr)
7474  * If both are passed, the SV list is used, but the OP list indicates
7475  * which SVs are actually pre-compiled code blocks
7476  *
7477  * The SVs in the list have magic and qr overloading applied to them (and
7478  * the list may be modified in-place with replacement SVs in the latter
7479  * case).
7480  *
7481  * If the pattern hasn't changed from old_re, then old_re will be
7482  * returned.
7483  *
7484  * eng is the current engine. If that engine has an op_comp method, then
7485  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7486  * do the initial concatenation of arguments and pass on to the external
7487  * engine.
7488  *
7489  * If is_bare_re is not null, set it to a boolean indicating whether the
7490  * arg list reduced (after overloading) to a single bare regex which has
7491  * been returned (i.e. /$qr/).
7492  *
7493  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7494  *
7495  * pm_flags contains the PMf_* flags, typically based on those from the
7496  * pm_flags field of the related PMOP. Currently we're only interested in
7497  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7498  *
7499  * For many years this code had an initial sizing pass that calculated
7500  * (sometimes incorrectly, leading to security holes) the size needed for the
7501  * compiled pattern.  That was changed by commit
7502  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7503  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7504  * references to this sizing pass.
7505  *
7506  * Now, an initial crude guess as to the size needed is made, based on the
7507  * length of the pattern.  Patches welcome to improve that guess.  That amount
7508  * of space is malloc'd and then immediately freed, and then clawed back node
7509  * by node.  This design is to minimze, to the extent possible, memory churn
7510  * when doing the reallocs.
7511  *
7512  * A separate parentheses counting pass may be needed in some cases.
7513  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7514  * of these cases.
7515  *
7516  * The existence of a sizing pass necessitated design decisions that are no
7517  * longer needed.  There are potential areas of simplification.
7518  *
7519  * Beware that the optimization-preparation code in here knows about some
7520  * of the structure of the compiled regexp.  [I'll say.]
7521  */
7522
7523 REGEXP *
7524 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7525                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7526                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7527 {
7528     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7529     STRLEN plen;
7530     char *exp;
7531     regnode *scan;
7532     I32 flags;
7533     SSize_t minlen = 0;
7534     U32 rx_flags;
7535     SV *pat;
7536     SV** new_patternp = patternp;
7537
7538     /* these are all flags - maybe they should be turned
7539      * into a single int with different bit masks */
7540     I32 sawlookahead = 0;
7541     I32 sawplus = 0;
7542     I32 sawopen = 0;
7543     I32 sawminmod = 0;
7544
7545     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7546     bool recompile = 0;
7547     bool runtime_code = 0;
7548     scan_data_t data;
7549     RExC_state_t RExC_state;
7550     RExC_state_t * const pRExC_state = &RExC_state;
7551 #ifdef TRIE_STUDY_OPT
7552     int restudied = 0;
7553     RExC_state_t copyRExC_state;
7554 #endif
7555     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7556
7557     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7558
7559     DEBUG_r(if (!PL_colorset) reginitcolors());
7560
7561
7562     pRExC_state->warn_text = NULL;
7563     pRExC_state->unlexed_names = NULL;
7564     pRExC_state->code_blocks = NULL;
7565
7566     if (is_bare_re)
7567         *is_bare_re = FALSE;
7568
7569     if (expr && (expr->op_type == OP_LIST ||
7570                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7571         /* allocate code_blocks if needed */
7572         OP *o;
7573         int ncode = 0;
7574
7575         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7576             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7577                 ncode++; /* count of DO blocks */
7578
7579         if (ncode)
7580             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7581     }
7582
7583     if (!pat_count) {
7584         /* compile-time pattern with just OP_CONSTs and DO blocks */
7585
7586         int n;
7587         OP *o;
7588
7589         /* find how many CONSTs there are */
7590         assert(expr);
7591         n = 0;
7592         if (expr->op_type == OP_CONST)
7593             n = 1;
7594         else
7595             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7596                 if (o->op_type == OP_CONST)
7597                     n++;
7598             }
7599
7600         /* fake up an SV array */
7601
7602         assert(!new_patternp);
7603         Newx(new_patternp, n, SV*);
7604         SAVEFREEPV(new_patternp);
7605         pat_count = n;
7606
7607         n = 0;
7608         if (expr->op_type == OP_CONST)
7609             new_patternp[n] = cSVOPx_sv(expr);
7610         else
7611             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7612                 if (o->op_type == OP_CONST)
7613                     new_patternp[n++] = cSVOPo_sv;
7614             }
7615
7616     }
7617
7618     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7619         "Assembling pattern from %d elements%s\n", pat_count,
7620             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7621
7622     /* set expr to the first arg op */
7623
7624     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7625          && expr->op_type != OP_CONST)
7626     {
7627             expr = cLISTOPx(expr)->op_first;
7628             assert(   expr->op_type == OP_PUSHMARK
7629                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7630                    || expr->op_type == OP_PADRANGE);
7631             expr = OpSIBLING(expr);
7632     }
7633
7634     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7635                         expr, &recompile, NULL);
7636
7637     /* handle bare (possibly after overloading) regex: foo =~ $re */
7638     {
7639         SV *re = pat;
7640         if (SvROK(re))
7641             re = SvRV(re);
7642         if (SvTYPE(re) == SVt_REGEXP) {
7643             if (is_bare_re)
7644                 *is_bare_re = TRUE;
7645             SvREFCNT_inc(re);
7646             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7647                 "Precompiled pattern%s\n",
7648                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7649
7650             return (REGEXP*)re;
7651         }
7652     }
7653
7654     exp = SvPV_nomg(pat, plen);
7655
7656     if (!eng->op_comp) {
7657         if ((SvUTF8(pat) && IN_BYTES)
7658                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7659         {
7660             /* make a temporary copy; either to convert to bytes,
7661              * or to avoid repeating get-magic / overloaded stringify */
7662             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7663                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7664         }
7665         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7666     }
7667
7668     /* ignore the utf8ness if the pattern is 0 length */
7669     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7670     RExC_uni_semantics = 0;
7671     RExC_contains_locale = 0;
7672     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7673     RExC_in_script_run = 0;
7674     RExC_study_started = 0;
7675     pRExC_state->runtime_code_qr = NULL;
7676     RExC_frame_head= NULL;
7677     RExC_frame_last= NULL;
7678     RExC_frame_count= 0;
7679     RExC_latest_warn_offset = 0;
7680     RExC_use_BRANCHJ = 0;
7681     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7682     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7683     RExC_total_parens = 0;
7684     RExC_open_parens = NULL;
7685     RExC_close_parens = NULL;
7686     RExC_paren_names = NULL;
7687     RExC_size = 0;
7688     RExC_seen_d_op = FALSE;
7689 #ifdef DEBUGGING
7690     RExC_paren_name_list = NULL;
7691 #endif
7692
7693     DEBUG_r({
7694         RExC_mysv1= sv_newmortal();
7695         RExC_mysv2= sv_newmortal();
7696     });
7697
7698     DEBUG_COMPILE_r({
7699             SV *dsv= sv_newmortal();
7700             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7701             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7702                           PL_colors[4], PL_colors[5], s);
7703         });
7704
7705     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7706      * to utf8 */
7707
7708     if ((pm_flags & PMf_USE_RE_EVAL)
7709                 /* this second condition covers the non-regex literal case,
7710                  * i.e.  $foo =~ '(?{})'. */
7711                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7712     )
7713         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7714
7715   redo_parse:
7716     /* return old regex if pattern hasn't changed */
7717     /* XXX: note in the below we have to check the flags as well as the
7718      * pattern.
7719      *
7720      * Things get a touch tricky as we have to compare the utf8 flag
7721      * independently from the compile flags.  */
7722
7723     if (   old_re
7724         && !recompile
7725         && !!RX_UTF8(old_re) == !!RExC_utf8
7726         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7727         && RX_PRECOMP(old_re)
7728         && RX_PRELEN(old_re) == plen
7729         && memEQ(RX_PRECOMP(old_re), exp, plen)
7730         && !runtime_code /* with runtime code, always recompile */ )
7731     {
7732         DEBUG_COMPILE_r({
7733             SV *dsv= sv_newmortal();
7734             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7735             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7736                           PL_colors[4], PL_colors[5], s);
7737         });
7738         return old_re;
7739     }
7740
7741     /* Allocate the pattern's SV */
7742     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7743     RExC_rx = ReANY(Rx);
7744     if ( RExC_rx == NULL )
7745         FAIL("Regexp out of space");
7746
7747     rx_flags = orig_rx_flags;
7748
7749     if (   (UTF || RExC_uni_semantics)
7750         && initial_charset == REGEX_DEPENDS_CHARSET)
7751     {
7752
7753         /* Set to use unicode semantics if the pattern is in utf8 and has the
7754          * 'depends' charset specified, as it means unicode when utf8  */
7755         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7756         RExC_uni_semantics = 1;
7757     }
7758
7759     RExC_pm_flags = pm_flags;
7760
7761     if (runtime_code) {
7762         assert(TAINTING_get || !TAINT_get);
7763         if (TAINT_get)
7764             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7765
7766         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7767             /* whoops, we have a non-utf8 pattern, whilst run-time code
7768              * got compiled as utf8. Try again with a utf8 pattern */
7769             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7770                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7771             goto redo_parse;
7772         }
7773     }
7774     assert(!pRExC_state->runtime_code_qr);
7775
7776     RExC_sawback = 0;
7777
7778     RExC_seen = 0;
7779     RExC_maxlen = 0;
7780     RExC_in_lookbehind = 0;
7781     RExC_in_lookahead = 0;
7782     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7783     RExC_recode_x_to_native = 0;
7784     RExC_in_multi_char_class = 0;
7785
7786     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7787     RExC_precomp_end = RExC_end = exp + plen;
7788     RExC_nestroot = 0;
7789     RExC_whilem_seen = 0;
7790     RExC_end_op = NULL;
7791     RExC_recurse = NULL;
7792     RExC_study_chunk_recursed = NULL;
7793     RExC_study_chunk_recursed_bytes= 0;
7794     RExC_recurse_count = 0;
7795     RExC_sets_depth = 0;
7796     pRExC_state->code_index = 0;
7797
7798     /* Initialize the string in the compiled pattern.  This is so that there is
7799      * something to output if necessary */
7800     set_regex_pv(pRExC_state, Rx);
7801
7802     DEBUG_PARSE_r({
7803         Perl_re_printf( aTHX_
7804             "Starting parse and generation\n");
7805         RExC_lastnum=0;
7806         RExC_lastparse=NULL;
7807     });
7808
7809     /* Allocate space and zero-initialize. Note, the two step process
7810        of zeroing when in debug mode, thus anything assigned has to
7811        happen after that */
7812     if (!  RExC_size) {
7813
7814         /* On the first pass of the parse, we guess how big this will be.  Then
7815          * we grow in one operation to that amount and then give it back.  As
7816          * we go along, we re-allocate what we need.
7817          *
7818          * XXX Currently the guess is essentially that the pattern will be an
7819          * EXACT node with one byte input, one byte output.  This is crude, and
7820          * better heuristics are welcome.
7821          *
7822          * On any subsequent passes, we guess what we actually computed in the
7823          * latest earlier pass.  Such a pass probably didn't complete so is
7824          * missing stuff.  We could improve those guesses by knowing where the
7825          * parse stopped, and use the length so far plus apply the above
7826          * assumption to what's left. */
7827         RExC_size = STR_SZ(RExC_end - RExC_start);
7828     }
7829
7830     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7831     if ( RExC_rxi == NULL )
7832         FAIL("Regexp out of space");
7833
7834     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7835     RXi_SET( RExC_rx, RExC_rxi );
7836
7837     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7838      * node parsed will give back any excess memory we have allocated so far).
7839      * */
7840     RExC_size = 0;
7841
7842     /* non-zero initialization begins here */
7843     RExC_rx->engine= eng;
7844     RExC_rx->extflags = rx_flags;
7845     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7846
7847     if (pm_flags & PMf_IS_QR) {
7848         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7849         if (RExC_rxi->code_blocks) {
7850             RExC_rxi->code_blocks->refcnt++;
7851         }
7852     }
7853
7854     RExC_rx->intflags = 0;
7855
7856     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7857     RExC_parse = exp;
7858
7859     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7860      * code makes sure the final byte is an uncounted NUL.  But should this
7861      * ever not be the case, lots of things could read beyond the end of the
7862      * buffer: loops like
7863      *      while(isFOO(*RExC_parse)) RExC_parse++;
7864      *      strchr(RExC_parse, "foo");
7865      * etc.  So it is worth noting. */
7866     assert(*RExC_end == '\0');
7867
7868     RExC_naughty = 0;
7869     RExC_npar = 1;
7870     RExC_parens_buf_size = 0;
7871     RExC_emit_start = RExC_rxi->program;
7872     pRExC_state->code_index = 0;
7873
7874     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7875     RExC_emit = 1;
7876
7877     /* Do the parse */
7878     if (reg(pRExC_state, 0, &flags, 1)) {
7879
7880         /* Success!, But we may need to redo the parse knowing how many parens
7881          * there actually are */
7882         if (IN_PARENS_PASS) {
7883             flags |= RESTART_PARSE;
7884         }
7885
7886         /* We have that number in RExC_npar */
7887         RExC_total_parens = RExC_npar;
7888     }
7889     else if (! MUST_RESTART(flags)) {
7890         ReREFCNT_dec(Rx);
7891         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7892     }
7893
7894     /* Here, we either have success, or we have to redo the parse for some reason */
7895     if (MUST_RESTART(flags)) {
7896
7897         /* It's possible to write a regexp in ascii that represents Unicode
7898         codepoints outside of the byte range, such as via \x{100}. If we
7899         detect such a sequence we have to convert the entire pattern to utf8
7900         and then recompile, as our sizing calculation will have been based
7901         on 1 byte == 1 character, but we will need to use utf8 to encode
7902         at least some part of the pattern, and therefore must convert the whole
7903         thing.
7904         -- dmq */
7905         if (flags & NEED_UTF8) {
7906
7907             /* We have stored the offset of the final warning output so far.
7908              * That must be adjusted.  Any variant characters between the start
7909              * of the pattern and this warning count for 2 bytes in the final,
7910              * so just add them again */
7911             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7912                 RExC_latest_warn_offset +=
7913                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7914                                                 + RExC_latest_warn_offset);
7915             }
7916             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7917             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7918             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7919         }
7920         else {
7921             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7922         }
7923
7924         if (ALL_PARENS_COUNTED) {
7925             /* Make enough room for all the known parens, and zero it */
7926             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7927             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7928             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7929
7930             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7931             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7932         }
7933         else { /* Parse did not complete.  Reinitialize the parentheses
7934                   structures */
7935             RExC_total_parens = 0;
7936             if (RExC_open_parens) {
7937                 Safefree(RExC_open_parens);
7938                 RExC_open_parens = NULL;
7939             }
7940             if (RExC_close_parens) {
7941                 Safefree(RExC_close_parens);
7942                 RExC_close_parens = NULL;
7943             }
7944         }
7945
7946         /* Clean up what we did in this parse */
7947         SvREFCNT_dec_NN(RExC_rx_sv);
7948
7949         goto redo_parse;
7950     }
7951
7952     /* Here, we have successfully parsed and generated the pattern's program
7953      * for the regex engine.  We are ready to finish things up and look for
7954      * optimizations. */
7955
7956     /* Update the string to compile, with correct modifiers, etc */
7957     set_regex_pv(pRExC_state, Rx);
7958
7959     RExC_rx->nparens = RExC_total_parens - 1;
7960
7961     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7962     if (RExC_whilem_seen > 15)
7963         RExC_whilem_seen = 15;
7964
7965     DEBUG_PARSE_r({
7966         Perl_re_printf( aTHX_
7967             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7968         RExC_lastnum=0;
7969         RExC_lastparse=NULL;
7970     });
7971
7972 #ifdef RE_TRACK_PATTERN_OFFSETS
7973     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7974                           "%s %" UVuf " bytes for offset annotations.\n",
7975                           RExC_offsets ? "Got" : "Couldn't get",
7976                           (UV)((RExC_offsets[0] * 2 + 1))));
7977     DEBUG_OFFSETS_r(if (RExC_offsets) {
7978         const STRLEN len = RExC_offsets[0];
7979         STRLEN i;
7980         DECLARE_AND_GET_RE_DEBUG_FLAGS;
7981         Perl_re_printf( aTHX_
7982                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7983         for (i = 1; i <= len; i++) {
7984             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7985                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7986                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7987         }
7988         Perl_re_printf( aTHX_  "\n");
7989     });
7990
7991 #else
7992     SetProgLen(RExC_rxi,RExC_size);
7993 #endif
7994
7995     DEBUG_DUMP_PRE_OPTIMIZE_r({
7996         SV * const sv = sv_newmortal();
7997         RXi_GET_DECL(RExC_rx, ri);
7998         DEBUG_RExC_seen();
7999         Perl_re_printf( aTHX_ "Program before optimization:\n");
8000
8001         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8002                         sv, 0, 0);
8003     });
8004
8005     DEBUG_OPTIMISE_r(
8006         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
8007     );
8008
8009     /* XXXX To minimize changes to RE engine we always allocate
8010        3-units-long substrs field. */
8011     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8012     if (RExC_recurse_count) {
8013         Newx(RExC_recurse, RExC_recurse_count, regnode *);
8014         SAVEFREEPV(RExC_recurse);
8015     }
8016
8017     if (RExC_seen & REG_RECURSE_SEEN) {
8018         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8019          * So its 1 if there are no parens. */
8020         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8021                                          ((RExC_total_parens & 0x07) != 0);
8022         Newx(RExC_study_chunk_recursed,
8023              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8024         SAVEFREEPV(RExC_study_chunk_recursed);
8025     }
8026
8027   reStudy:
8028     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8029     DEBUG_r(
8030         RExC_study_chunk_recursed_count= 0;
8031     );
8032     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8033     if (RExC_study_chunk_recursed) {
8034         Zero(RExC_study_chunk_recursed,
8035              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8036     }
8037
8038
8039 #ifdef TRIE_STUDY_OPT
8040     if (!restudied) {
8041         StructCopy(&zero_scan_data, &data, scan_data_t);
8042         copyRExC_state = RExC_state;
8043     } else {
8044         U32 seen=RExC_seen;
8045         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8046
8047         RExC_state = copyRExC_state;
8048         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8049             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8050         else
8051             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8052         StructCopy(&zero_scan_data, &data, scan_data_t);
8053     }
8054 #else
8055     StructCopy(&zero_scan_data, &data, scan_data_t);
8056 #endif
8057
8058     /* Dig out information for optimizations. */
8059     RExC_rx->extflags = RExC_flags; /* was pm_op */
8060     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8061
8062     if (UTF)
8063         SvUTF8_on(Rx);  /* Unicode in it? */
8064     RExC_rxi->regstclass = NULL;
8065     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
8066         RExC_rx->intflags |= PREGf_NAUGHTY;
8067     scan = RExC_rxi->program + 1;               /* First BRANCH. */
8068
8069     /* testing for BRANCH here tells us whether there is "must appear"
8070        data in the pattern. If there is then we can use it for optimisations */
8071     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
8072                                                   */
8073         SSize_t fake;
8074         STRLEN longest_length[2];
8075         regnode_ssc ch_class; /* pointed to by data */
8076         int stclass_flag;
8077         SSize_t last_close = 0; /* pointed to by data */
8078         regnode *first= scan;
8079         regnode *first_next= regnext(first);
8080         int i;
8081
8082         /*
8083          * Skip introductions and multiplicators >= 1
8084          * so that we can extract the 'meat' of the pattern that must
8085          * match in the large if() sequence following.
8086          * NOTE that EXACT is NOT covered here, as it is normally
8087          * picked up by the optimiser separately.
8088          *
8089          * This is unfortunate as the optimiser isnt handling lookahead
8090          * properly currently.
8091          *
8092          */
8093         while ((OP(first) == OPEN && (sawopen = 1)) ||
8094                /* An OR of *one* alternative - should not happen now. */
8095             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8096             /* for now we can't handle lookbehind IFMATCH*/
8097             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8098             (OP(first) == PLUS) ||
8099             (OP(first) == MINMOD) ||
8100                /* An {n,m} with n>0 */
8101             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8102             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8103         {
8104                 /*
8105                  * the only op that could be a regnode is PLUS, all the rest
8106                  * will be regnode_1 or regnode_2.
8107                  *
8108                  * (yves doesn't think this is true)
8109                  */
8110                 if (OP(first) == PLUS)
8111                     sawplus = 1;
8112                 else {
8113                     if (OP(first) == MINMOD)
8114                         sawminmod = 1;
8115                     first += regarglen[OP(first)];
8116                 }
8117                 first = NEXTOPER(first);
8118                 first_next= regnext(first);
8119         }
8120
8121         /* Starting-point info. */
8122       again:
8123         DEBUG_PEEP("first:", first, 0, 0);
8124         /* Ignore EXACT as we deal with it later. */
8125         if (PL_regkind[OP(first)] == EXACT) {
8126             if (   OP(first) == EXACT
8127                 || OP(first) == LEXACT
8128                 || OP(first) == EXACT_REQ8
8129                 || OP(first) == LEXACT_REQ8
8130                 || OP(first) == EXACTL)
8131             {
8132                 NOOP;   /* Empty, get anchored substr later. */
8133             }
8134             else
8135                 RExC_rxi->regstclass = first;
8136         }
8137 #ifdef TRIE_STCLASS
8138         else if (PL_regkind[OP(first)] == TRIE &&
8139                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8140         {
8141             /* this can happen only on restudy */
8142             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8143         }
8144 #endif
8145         else if (REGNODE_SIMPLE(OP(first)))
8146             RExC_rxi->regstclass = first;
8147         else if (PL_regkind[OP(first)] == BOUND ||
8148                  PL_regkind[OP(first)] == NBOUND)
8149             RExC_rxi->regstclass = first;
8150         else if (PL_regkind[OP(first)] == BOL) {
8151             RExC_rx->intflags |= (OP(first) == MBOL
8152                            ? PREGf_ANCH_MBOL
8153                            : PREGf_ANCH_SBOL);
8154             first = NEXTOPER(first);
8155             goto again;
8156         }
8157         else if (OP(first) == GPOS) {
8158             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8159             first = NEXTOPER(first);
8160             goto again;
8161         }
8162         else if ((!sawopen || !RExC_sawback) &&
8163             !sawlookahead &&
8164             (OP(first) == STAR &&
8165             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8166             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8167         {
8168             /* turn .* into ^.* with an implied $*=1 */
8169             const int type =
8170                 (OP(NEXTOPER(first)) == REG_ANY)
8171                     ? PREGf_ANCH_MBOL
8172                     : PREGf_ANCH_SBOL;
8173             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8174             first = NEXTOPER(first);
8175             goto again;
8176         }
8177         if (sawplus && !sawminmod && !sawlookahead
8178             && (!sawopen || !RExC_sawback)
8179             && !pRExC_state->code_blocks) /* May examine pos and $& */
8180             /* x+ must match at the 1st pos of run of x's */
8181             RExC_rx->intflags |= PREGf_SKIP;
8182
8183         /* Scan is after the zeroth branch, first is atomic matcher. */
8184 #ifdef TRIE_STUDY_OPT
8185         DEBUG_PARSE_r(
8186             if (!restudied)
8187                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8188                               (IV)(first - scan + 1))
8189         );
8190 #else
8191         DEBUG_PARSE_r(
8192             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8193                 (IV)(first - scan + 1))
8194         );
8195 #endif
8196
8197
8198         /*
8199         * If there's something expensive in the r.e., find the
8200         * longest literal string that must appear and make it the
8201         * regmust.  Resolve ties in favor of later strings, since
8202         * the regstart check works with the beginning of the r.e.
8203         * and avoiding duplication strengthens checking.  Not a
8204         * strong reason, but sufficient in the absence of others.
8205         * [Now we resolve ties in favor of the earlier string if
8206         * it happens that c_offset_min has been invalidated, since the
8207         * earlier string may buy us something the later one won't.]
8208         */
8209
8210         data.substrs[0].str = newSVpvs("");
8211         data.substrs[1].str = newSVpvs("");
8212         data.last_found = newSVpvs("");
8213         data.cur_is_floating = 0; /* initially any found substring is fixed */
8214         ENTER_with_name("study_chunk");
8215         SAVEFREESV(data.substrs[0].str);
8216         SAVEFREESV(data.substrs[1].str);
8217         SAVEFREESV(data.last_found);
8218         first = scan;
8219         if (!RExC_rxi->regstclass) {
8220             ssc_init(pRExC_state, &ch_class);
8221             data.start_class = &ch_class;
8222             stclass_flag = SCF_DO_STCLASS_AND;
8223         } else                          /* XXXX Check for BOUND? */
8224             stclass_flag = 0;
8225         data.last_closep = &last_close;
8226
8227         DEBUG_RExC_seen();
8228         /*
8229          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8230          * (NO top level branches)
8231          */
8232         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8233                              scan + RExC_size, /* Up to end */
8234             &data, -1, 0, NULL,
8235             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8236                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8237             0, TRUE);
8238
8239
8240         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8241
8242
8243         if ( RExC_total_parens == 1 && !data.cur_is_floating
8244              && data.last_start_min == 0 && data.last_end > 0
8245              && !RExC_seen_zerolen
8246              && !(RExC_seen & REG_VERBARG_SEEN)
8247              && !(RExC_seen & REG_GPOS_SEEN)
8248         ){
8249             RExC_rx->extflags |= RXf_CHECK_ALL;
8250         }
8251         scan_commit(pRExC_state, &data,&minlen, 0);
8252
8253
8254         /* XXX this is done in reverse order because that's the way the
8255          * code was before it was parameterised. Don't know whether it
8256          * actually needs doing in reverse order. DAPM */
8257         for (i = 1; i >= 0; i--) {
8258             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8259
8260             if (   !(   i
8261                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8262                      &&    data.substrs[0].min_offset
8263                         == data.substrs[1].min_offset
8264                      &&    SvCUR(data.substrs[0].str)
8265                         == SvCUR(data.substrs[1].str)
8266                     )
8267                 && S_setup_longest (aTHX_ pRExC_state,
8268                                         &(RExC_rx->substrs->data[i]),
8269                                         &(data.substrs[i]),
8270                                         longest_length[i]))
8271             {
8272                 RExC_rx->substrs->data[i].min_offset =
8273                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8274
8275                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8276                 /* Don't offset infinity */
8277                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8278                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8279                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8280             }
8281             else {
8282                 RExC_rx->substrs->data[i].substr      = NULL;
8283                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8284                 longest_length[i] = 0;
8285             }
8286         }
8287
8288         LEAVE_with_name("study_chunk");
8289
8290         if (RExC_rxi->regstclass
8291             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8292             RExC_rxi->regstclass = NULL;
8293
8294         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8295               || RExC_rx->substrs->data[0].min_offset)
8296             && stclass_flag
8297             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8298             && is_ssc_worth_it(pRExC_state, data.start_class))
8299         {
8300             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8301
8302             ssc_finalize(pRExC_state, data.start_class);
8303
8304             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8305             StructCopy(data.start_class,
8306                        (regnode_ssc*)RExC_rxi->data->data[n],
8307                        regnode_ssc);
8308             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8309             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8310             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8311                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8312                       Perl_re_printf( aTHX_
8313                                     "synthetic stclass \"%s\".\n",
8314                                     SvPVX_const(sv));});
8315             data.start_class = NULL;
8316         }
8317
8318         /* A temporary algorithm prefers floated substr to fixed one of
8319          * same length to dig more info. */
8320         i = (longest_length[0] <= longest_length[1]);
8321         RExC_rx->substrs->check_ix = i;
8322         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8323         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8324         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8325         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8326         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8327         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8328             RExC_rx->intflags |= PREGf_NOSCAN;
8329
8330         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8331             RExC_rx->extflags |= RXf_USE_INTUIT;
8332             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8333                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8334         }
8335
8336         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8337         if ( (STRLEN)minlen < longest_length[1] )
8338             minlen= longest_length[1];
8339         if ( (STRLEN)minlen < longest_length[0] )
8340             minlen= longest_length[0];
8341         */
8342     }
8343     else {
8344         /* Several toplevels. Best we can is to set minlen. */
8345         SSize_t fake;
8346         regnode_ssc ch_class;
8347         SSize_t last_close = 0;
8348
8349         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8350
8351         scan = RExC_rxi->program + 1;
8352         ssc_init(pRExC_state, &ch_class);
8353         data.start_class = &ch_class;
8354         data.last_closep = &last_close;
8355
8356         DEBUG_RExC_seen();
8357         /*
8358          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8359          * (patterns WITH top level branches)
8360          */
8361         minlen = study_chunk(pRExC_state,
8362             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8363             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8364                                                       ? SCF_TRIE_DOING_RESTUDY
8365                                                       : 0),
8366             0, TRUE);
8367
8368         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8369
8370         RExC_rx->check_substr = NULL;
8371         RExC_rx->check_utf8 = NULL;
8372         RExC_rx->substrs->data[0].substr      = NULL;
8373         RExC_rx->substrs->data[0].utf8_substr = NULL;
8374         RExC_rx->substrs->data[1].substr      = NULL;
8375         RExC_rx->substrs->data[1].utf8_substr = NULL;
8376
8377         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8378             && is_ssc_worth_it(pRExC_state, data.start_class))
8379         {
8380             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8381
8382             ssc_finalize(pRExC_state, data.start_class);
8383
8384             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8385             StructCopy(data.start_class,
8386                        (regnode_ssc*)RExC_rxi->data->data[n],
8387                        regnode_ssc);
8388             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8389             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8390             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8391                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8392                       Perl_re_printf( aTHX_
8393                                     "synthetic stclass \"%s\".\n",
8394                                     SvPVX_const(sv));});
8395             data.start_class = NULL;
8396         }
8397     }
8398
8399     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8400         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8401         RExC_rx->maxlen = REG_INFTY;
8402     }
8403     else {
8404         RExC_rx->maxlen = RExC_maxlen;
8405     }
8406
8407     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8408        the "real" pattern. */
8409     DEBUG_OPTIMISE_r({
8410         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8411                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8412     });
8413     RExC_rx->minlenret = minlen;
8414     if (RExC_rx->minlen < minlen)
8415         RExC_rx->minlen = minlen;
8416
8417     if (RExC_seen & REG_RECURSE_SEEN ) {
8418         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8419         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8420     }
8421     if (RExC_seen & REG_GPOS_SEEN)
8422         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8423     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8424         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8425                                                 lookbehind */
8426     if (pRExC_state->code_blocks)
8427         RExC_rx->extflags |= RXf_EVAL_SEEN;
8428     if (RExC_seen & REG_VERBARG_SEEN)
8429     {
8430         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8431         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8432     }
8433     if (RExC_seen & REG_CUTGROUP_SEEN)
8434         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8435     if (pm_flags & PMf_USE_RE_EVAL)
8436         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8437     if (RExC_paren_names)
8438         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8439     else
8440         RXp_PAREN_NAMES(RExC_rx) = NULL;
8441
8442     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8443      * so it can be used in pp.c */
8444     if (RExC_rx->intflags & PREGf_ANCH)
8445         RExC_rx->extflags |= RXf_IS_ANCHORED;
8446
8447
8448     {
8449         /* this is used to identify "special" patterns that might result
8450          * in Perl NOT calling the regex engine and instead doing the match "itself",
8451          * particularly special cases in split//. By having the regex compiler
8452          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8453          * we avoid weird issues with equivalent patterns resulting in different behavior,
8454          * AND we allow non Perl engines to get the same optimizations by the setting the
8455          * flags appropriately - Yves */
8456         regnode *first = RExC_rxi->program + 1;
8457         U8 fop = OP(first);
8458         regnode *next = regnext(first);
8459         U8 nop = OP(next);
8460
8461         if (PL_regkind[fop] == NOTHING && nop == END)
8462             RExC_rx->extflags |= RXf_NULL;
8463         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8464             /* when fop is SBOL first->flags will be true only when it was
8465              * produced by parsing /\A/, and not when parsing /^/. This is
8466              * very important for the split code as there we want to
8467              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8468              * See rt #122761 for more details. -- Yves */
8469             RExC_rx->extflags |= RXf_START_ONLY;
8470         else if (fop == PLUS
8471                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8472                  && nop == END)
8473             RExC_rx->extflags |= RXf_WHITE;
8474         else if ( RExC_rx->extflags & RXf_SPLIT
8475                   && (   fop == EXACT || fop == LEXACT
8476                       || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8477                       || fop == EXACTL)
8478                   && STR_LEN(first) == 1
8479                   && *(STRING(first)) == ' '
8480                   && nop == END )
8481             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8482
8483     }
8484
8485     if (RExC_contains_locale) {
8486         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8487     }
8488
8489 #ifdef DEBUGGING
8490     if (RExC_paren_names) {
8491         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8492         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8493                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8494     } else
8495 #endif
8496     RExC_rxi->name_list_idx = 0;
8497
8498     while ( RExC_recurse_count > 0 ) {
8499         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8500         /*
8501          * This data structure is set up in study_chunk() and is used
8502          * to calculate the distance between a GOSUB regopcode and
8503          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8504          * it refers to.
8505          *
8506          * If for some reason someone writes code that optimises
8507          * away a GOSUB opcode then the assert should be changed to
8508          * an if(scan) to guard the ARG2L_SET() - Yves
8509          *
8510          */
8511         assert(scan && OP(scan) == GOSUB);
8512         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8513     }
8514
8515     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8516     /* assume we don't need to swap parens around before we match */
8517     DEBUG_TEST_r({
8518         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8519             (unsigned long)RExC_study_chunk_recursed_count);
8520     });
8521     DEBUG_DUMP_r({
8522         DEBUG_RExC_seen();
8523         Perl_re_printf( aTHX_ "Final program:\n");
8524         regdump(RExC_rx);
8525     });
8526
8527     if (RExC_open_parens) {
8528         Safefree(RExC_open_parens);
8529         RExC_open_parens = NULL;
8530     }
8531     if (RExC_close_parens) {
8532         Safefree(RExC_close_parens);
8533         RExC_close_parens = NULL;
8534     }
8535
8536 #ifdef USE_ITHREADS
8537     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8538      * by setting the regexp SV to readonly-only instead. If the
8539      * pattern's been recompiled, the USEDness should remain. */
8540     if (old_re && SvREADONLY(old_re))
8541         SvREADONLY_on(Rx);
8542 #endif
8543     return Rx;
8544 }
8545
8546
8547 SV*
8548 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8549                     const U32 flags)
8550 {
8551     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8552
8553     PERL_UNUSED_ARG(value);
8554
8555     if (flags & RXapif_FETCH) {
8556         return reg_named_buff_fetch(rx, key, flags);
8557     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8558         Perl_croak_no_modify();
8559         return NULL;
8560     } else if (flags & RXapif_EXISTS) {
8561         return reg_named_buff_exists(rx, key, flags)
8562             ? &PL_sv_yes
8563             : &PL_sv_no;
8564     } else if (flags & RXapif_REGNAMES) {
8565         return reg_named_buff_all(rx, flags);
8566     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8567         return reg_named_buff_scalar(rx, flags);
8568     } else {
8569         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8570         return NULL;
8571     }
8572 }
8573
8574 SV*
8575 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8576                          const U32 flags)
8577 {
8578     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8579     PERL_UNUSED_ARG(lastkey);
8580
8581     if (flags & RXapif_FIRSTKEY)
8582         return reg_named_buff_firstkey(rx, flags);
8583     else if (flags & RXapif_NEXTKEY)
8584         return reg_named_buff_nextkey(rx, flags);
8585     else {
8586         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8587                                             (int)flags);
8588         return NULL;
8589     }
8590 }
8591
8592 SV*
8593 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8594                           const U32 flags)
8595 {
8596     SV *ret;
8597     struct regexp *const rx = ReANY(r);
8598
8599     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8600
8601     if (rx && RXp_PAREN_NAMES(rx)) {
8602         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8603         if (he_str) {
8604             IV i;
8605             SV* sv_dat=HeVAL(he_str);
8606             I32 *nums=(I32*)SvPVX(sv_dat);
8607             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8608             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8609                 if ((I32)(rx->nparens) >= nums[i]
8610                     && rx->offs[nums[i]].start != -1
8611                     && rx->offs[nums[i]].end != -1)
8612                 {
8613                     ret = newSVpvs("");
8614                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8615                     if (!retarray)
8616                         return ret;
8617                 } else {
8618                     if (retarray)
8619                         ret = newSVsv(&PL_sv_undef);
8620                 }
8621                 if (retarray)
8622                     av_push(retarray, ret);
8623             }
8624             if (retarray)
8625                 return newRV_noinc(MUTABLE_SV(retarray));
8626         }
8627     }
8628     return NULL;
8629 }
8630
8631 bool
8632 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8633                            const U32 flags)
8634 {
8635     struct regexp *const rx = ReANY(r);
8636
8637     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8638
8639     if (rx && RXp_PAREN_NAMES(rx)) {
8640         if (flags & RXapif_ALL) {
8641             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8642         } else {
8643             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8644             if (sv) {
8645                 SvREFCNT_dec_NN(sv);
8646                 return TRUE;
8647             } else {
8648                 return FALSE;
8649             }
8650         }
8651     } else {
8652         return FALSE;
8653     }
8654 }
8655
8656 SV*
8657 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8658 {
8659     struct regexp *const rx = ReANY(r);
8660
8661     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8662
8663     if ( rx && RXp_PAREN_NAMES(rx) ) {
8664         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8665
8666         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8667     } else {
8668         return FALSE;
8669     }
8670 }
8671
8672 SV*
8673 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8674 {
8675     struct regexp *const rx = ReANY(r);
8676     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8677
8678     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8679
8680     if (rx && RXp_PAREN_NAMES(rx)) {
8681         HV *hv = RXp_PAREN_NAMES(rx);
8682         HE *temphe;
8683         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8684             IV i;
8685             IV parno = 0;
8686             SV* sv_dat = HeVAL(temphe);
8687             I32 *nums = (I32*)SvPVX(sv_dat);
8688             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8689                 if ((I32)(rx->lastparen) >= nums[i] &&
8690                     rx->offs[nums[i]].start != -1 &&
8691                     rx->offs[nums[i]].end != -1)
8692                 {
8693                     parno = nums[i];
8694                     break;
8695                 }
8696             }
8697             if (parno || flags & RXapif_ALL) {
8698                 return newSVhek(HeKEY_hek(temphe));
8699             }
8700         }
8701     }
8702     return NULL;
8703 }
8704
8705 SV*
8706 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8707 {
8708     SV *ret;
8709     AV *av;
8710     SSize_t length;
8711     struct regexp *const rx = ReANY(r);
8712
8713     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8714
8715     if (rx && RXp_PAREN_NAMES(rx)) {
8716         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8717             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8718         } else if (flags & RXapif_ONE) {
8719             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8720             av = MUTABLE_AV(SvRV(ret));
8721             length = av_count(av);
8722             SvREFCNT_dec_NN(ret);
8723             return newSViv(length);
8724         } else {
8725             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8726                                                 (int)flags);
8727             return NULL;
8728         }
8729     }
8730     return &PL_sv_undef;
8731 }
8732
8733 SV*
8734 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8735 {
8736     struct regexp *const rx = ReANY(r);
8737     AV *av = newAV();
8738
8739     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8740
8741     if (rx && RXp_PAREN_NAMES(rx)) {
8742         HV *hv= RXp_PAREN_NAMES(rx);
8743         HE *temphe;
8744         (void)hv_iterinit(hv);
8745         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8746             IV i;
8747             IV parno = 0;
8748             SV* sv_dat = HeVAL(temphe);
8749             I32 *nums = (I32*)SvPVX(sv_dat);
8750             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8751                 if ((I32)(rx->lastparen) >= nums[i] &&
8752                     rx->offs[nums[i]].start != -1 &&
8753                     rx->offs[nums[i]].end != -1)
8754                 {
8755                     parno = nums[i];
8756                     break;
8757                 }
8758             }
8759             if (parno || flags & RXapif_ALL) {
8760                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8761             }
8762         }
8763     }
8764
8765     return newRV_noinc(MUTABLE_SV(av));
8766 }
8767
8768 void
8769 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8770                              SV * const sv)
8771 {
8772     struct regexp *const rx = ReANY(r);
8773     char *s = NULL;
8774     SSize_t i = 0;
8775     SSize_t s1, t1;
8776     I32 n = paren;
8777
8778     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8779
8780     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8781            || n == RX_BUFF_IDX_CARET_FULLMATCH
8782            || n == RX_BUFF_IDX_CARET_POSTMATCH
8783        )
8784     {
8785         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8786         if (!keepcopy) {
8787             /* on something like
8788              *    $r = qr/.../;
8789              *    /$qr/p;
8790              * the KEEPCOPY is set on the PMOP rather than the regex */
8791             if (PL_curpm && r == PM_GETRE(PL_curpm))
8792                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8793         }
8794         if (!keepcopy)
8795             goto ret_undef;
8796     }
8797
8798     if (!rx->subbeg)
8799         goto ret_undef;
8800
8801     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8802         /* no need to distinguish between them any more */
8803         n = RX_BUFF_IDX_FULLMATCH;
8804
8805     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8806         && rx->offs[0].start != -1)
8807     {
8808         /* $`, ${^PREMATCH} */
8809         i = rx->offs[0].start;
8810         s = rx->subbeg;
8811     }
8812     else
8813     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8814         && rx->offs[0].end != -1)
8815     {
8816         /* $', ${^POSTMATCH} */
8817         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8818         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8819     }
8820     else
8821     if (inRANGE(n, 0, (I32)rx->nparens) &&
8822         (s1 = rx->offs[n].start) != -1  &&
8823         (t1 = rx->offs[n].end) != -1)
8824     {
8825         /* $&, ${^MATCH},  $1 ... */
8826         i = t1 - s1;
8827         s = rx->subbeg + s1 - rx->suboffset;
8828     } else {
8829         goto ret_undef;
8830     }
8831
8832     assert(s >= rx->subbeg);
8833     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8834     if (i >= 0) {
8835 #ifdef NO_TAINT_SUPPORT
8836         sv_setpvn(sv, s, i);
8837 #else
8838         const int oldtainted = TAINT_get;
8839         TAINT_NOT;
8840         sv_setpvn(sv, s, i);
8841         TAINT_set(oldtainted);
8842 #endif
8843         if (RXp_MATCH_UTF8(rx))
8844             SvUTF8_on(sv);
8845         else
8846             SvUTF8_off(sv);
8847         if (TAINTING_get) {
8848             if (RXp_MATCH_TAINTED(rx)) {
8849                 if (SvTYPE(sv) >= SVt_PVMG) {
8850                     MAGIC* const mg = SvMAGIC(sv);
8851                     MAGIC* mgt;
8852                     TAINT;
8853                     SvMAGIC_set(sv, mg->mg_moremagic);
8854                     SvTAINT(sv);
8855                     if ((mgt = SvMAGIC(sv))) {
8856                         mg->mg_moremagic = mgt;
8857                         SvMAGIC_set(sv, mg);
8858                     }
8859                 } else {
8860                     TAINT;
8861                     SvTAINT(sv);
8862                 }
8863             } else
8864                 SvTAINTED_off(sv);
8865         }
8866     } else {
8867       ret_undef:
8868         sv_set_undef(sv);
8869         return;
8870     }
8871 }
8872
8873 void
8874 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8875                                                          SV const * const value)
8876 {
8877     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8878
8879     PERL_UNUSED_ARG(rx);
8880     PERL_UNUSED_ARG(paren);
8881     PERL_UNUSED_ARG(value);
8882
8883     if (!PL_localizing)
8884         Perl_croak_no_modify();
8885 }
8886
8887 I32
8888 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8889                               const I32 paren)
8890 {
8891     struct regexp *const rx = ReANY(r);
8892     I32 i;
8893     I32 s1, t1;
8894
8895     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8896
8897     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8898         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8899         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8900     )
8901     {
8902         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8903         if (!keepcopy) {
8904             /* on something like
8905              *    $r = qr/.../;
8906              *    /$qr/p;
8907              * the KEEPCOPY is set on the PMOP rather than the regex */
8908             if (PL_curpm && r == PM_GETRE(PL_curpm))
8909                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8910         }
8911         if (!keepcopy)
8912             goto warn_undef;
8913     }
8914
8915     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8916     switch (paren) {
8917       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8918       case RX_BUFF_IDX_PREMATCH:       /* $` */
8919         if (rx->offs[0].start != -1) {
8920                         i = rx->offs[0].start;
8921                         if (i > 0) {
8922                                 s1 = 0;
8923                                 t1 = i;
8924                                 goto getlen;
8925                         }
8926             }
8927         return 0;
8928
8929       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8930       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8931             if (rx->offs[0].end != -1) {
8932                         i = rx->sublen - rx->offs[0].end;
8933                         if (i > 0) {
8934                                 s1 = rx->offs[0].end;
8935                                 t1 = rx->sublen;
8936                                 goto getlen;
8937                         }
8938             }
8939         return 0;
8940
8941       default: /* $& / ${^MATCH}, $1, $2, ... */
8942             if (paren <= (I32)rx->nparens &&
8943             (s1 = rx->offs[paren].start) != -1 &&
8944             (t1 = rx->offs[paren].end) != -1)
8945             {
8946             i = t1 - s1;
8947             goto getlen;
8948         } else {
8949           warn_undef:
8950             if (ckWARN(WARN_UNINITIALIZED))
8951                 report_uninit((const SV *)sv);
8952             return 0;
8953         }
8954     }
8955   getlen:
8956     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8957         const char * const s = rx->subbeg - rx->suboffset + s1;
8958         const U8 *ep;
8959         STRLEN el;
8960
8961         i = t1 - s1;
8962         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8963             i = el;
8964     }
8965     return i;
8966 }
8967
8968 SV*
8969 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8970 {
8971     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8972         PERL_UNUSED_ARG(rx);
8973         if (0)
8974             return NULL;
8975         else
8976             return newSVpvs("Regexp");
8977 }
8978
8979 /* Scans the name of a named buffer from the pattern.
8980  * If flags is REG_RSN_RETURN_NULL returns null.
8981  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8982  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8983  * to the parsed name as looked up in the RExC_paren_names hash.
8984  * If there is an error throws a vFAIL().. type exception.
8985  */
8986
8987 #define REG_RSN_RETURN_NULL    0
8988 #define REG_RSN_RETURN_NAME    1
8989 #define REG_RSN_RETURN_DATA    2
8990
8991 STATIC SV*
8992 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8993 {
8994     char *name_start = RExC_parse;
8995     SV* sv_name;
8996
8997     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8998
8999     assert (RExC_parse <= RExC_end);
9000     if (RExC_parse == RExC_end) NOOP;
9001     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
9002          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
9003           * using do...while */
9004         if (UTF)
9005             do {
9006                 RExC_parse += UTF8SKIP(RExC_parse);
9007             } while (   RExC_parse < RExC_end
9008                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9009         else
9010             do {
9011                 RExC_parse++;
9012             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9013     } else {
9014         RExC_parse++; /* so the <- from the vFAIL is after the offending
9015                          character */
9016         vFAIL("Group name must start with a non-digit word character");
9017     }
9018     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9019                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9020     if ( flags == REG_RSN_RETURN_NAME)
9021         return sv_name;
9022     else if (flags==REG_RSN_RETURN_DATA) {
9023         HE *he_str = NULL;
9024         SV *sv_dat = NULL;
9025         if ( ! sv_name )      /* should not happen*/
9026             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9027         if (RExC_paren_names)
9028             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9029         if ( he_str )
9030             sv_dat = HeVAL(he_str);
9031         if ( ! sv_dat ) {   /* Didn't find group */
9032
9033             /* It might be a forward reference; we can't fail until we
9034                 * know, by completing the parse to get all the groups, and
9035                 * then reparsing */
9036             if (ALL_PARENS_COUNTED)  {
9037                 vFAIL("Reference to nonexistent named group");
9038             }
9039             else {
9040                 REQUIRE_PARENS_PASS;
9041             }
9042         }
9043         return sv_dat;
9044     }
9045
9046     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9047                      (unsigned long) flags);
9048 }
9049
9050 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
9051     if (RExC_lastparse!=RExC_parse) {                           \
9052         Perl_re_printf( aTHX_  "%s",                            \
9053             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
9054                 RExC_end - RExC_parse, 16,                      \
9055                 "", "",                                         \
9056                 PERL_PV_ESCAPE_UNI_DETECT |                     \
9057                 PERL_PV_PRETTY_ELLIPSES   |                     \
9058                 PERL_PV_PRETTY_LTGT       |                     \
9059                 PERL_PV_ESCAPE_RE         |                     \
9060                 PERL_PV_PRETTY_EXACTSIZE                        \
9061             )                                                   \
9062         );                                                      \
9063     } else                                                      \
9064         Perl_re_printf( aTHX_ "%16s","");                       \
9065                                                                 \
9066     if (RExC_lastnum!=RExC_emit)                                \
9067        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
9068     else                                                        \
9069        Perl_re_printf( aTHX_ "|%4s","");                        \
9070     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
9071         (int)((depth*2)), "",                                   \
9072         (funcname)                                              \
9073     );                                                          \
9074     RExC_lastnum=RExC_emit;                                     \
9075     RExC_lastparse=RExC_parse;                                  \
9076 })
9077
9078
9079
9080 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
9081     DEBUG_PARSE_MSG((funcname));                            \
9082     Perl_re_printf( aTHX_ "%4s","\n");                                  \
9083 })
9084 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
9085     DEBUG_PARSE_MSG((funcname));                            \
9086     Perl_re_printf( aTHX_ fmt "\n",args);                               \
9087 })
9088
9089 /* This section of code defines the inversion list object and its methods.  The
9090  * interfaces are highly subject to change, so as much as possible is static to
9091  * this file.  An inversion list is here implemented as a malloc'd C UV array
9092  * as an SVt_INVLIST scalar.
9093  *
9094  * An inversion list for Unicode is an array of code points, sorted by ordinal
9095  * number.  Each element gives the code point that begins a range that extends
9096  * up-to but not including the code point given by the next element.  The final
9097  * element gives the first code point of a range that extends to the platform's
9098  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
9099  * ...) give ranges whose code points are all in the inversion list.  We say
9100  * that those ranges are in the set.  The odd-numbered elements give ranges
9101  * whose code points are not in the inversion list, and hence not in the set.
9102  * Thus, element [0] is the first code point in the list.  Element [1]
9103  * is the first code point beyond that not in the list; and element [2] is the
9104  * first code point beyond that that is in the list.  In other words, the first
9105  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9106  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
9107  * all code points in that range are not in the inversion list.  The third
9108  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9109  * list, and so forth.  Thus every element whose index is divisible by two
9110  * gives the beginning of a range that is in the list, and every element whose
9111  * index is not divisible by two gives the beginning of a range not in the
9112  * list.  If the final element's index is divisible by two, the inversion list
9113  * extends to the platform's infinity; otherwise the highest code point in the
9114  * inversion list is the contents of that element minus 1.
9115  *
9116  * A range that contains just a single code point N will look like
9117  *  invlist[i]   == N
9118  *  invlist[i+1] == N+1
9119  *
9120  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9121  * impossible to represent, so element [i+1] is omitted.  The single element
9122  * inversion list
9123  *  invlist[0] == UV_MAX
9124  * contains just UV_MAX, but is interpreted as matching to infinity.
9125  *
9126  * Taking the complement (inverting) an inversion list is quite simple, if the
9127  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9128  * This implementation reserves an element at the beginning of each inversion
9129  * list to always contain 0; there is an additional flag in the header which
9130  * indicates if the list begins at the 0, or is offset to begin at the next
9131  * element.  This means that the inversion list can be inverted without any
9132  * copying; just flip the flag.
9133  *
9134  * More about inversion lists can be found in "Unicode Demystified"
9135  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9136  *
9137  * The inversion list data structure is currently implemented as an SV pointing
9138  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9139  * array of UV whose memory management is automatically handled by the existing
9140  * facilities for SV's.
9141  *
9142  * Some of the methods should always be private to the implementation, and some
9143  * should eventually be made public */
9144
9145 /* The header definitions are in F<invlist_inline.h> */
9146
9147 #ifndef PERL_IN_XSUB_RE
9148
9149 PERL_STATIC_INLINE UV*
9150 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9151 {
9152     /* Returns a pointer to the first element in the inversion list's array.
9153      * This is called upon initialization of an inversion list.  Where the
9154      * array begins depends on whether the list has the code point U+0000 in it
9155      * or not.  The other parameter tells it whether the code that follows this
9156      * call is about to put a 0 in the inversion list or not.  The first
9157      * element is either the element reserved for 0, if TRUE, or the element
9158      * after it, if FALSE */
9159
9160     bool* offset = get_invlist_offset_addr(invlist);
9161     UV* zero_addr = (UV *) SvPVX(invlist);
9162
9163     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9164
9165     /* Must be empty */
9166     assert(! _invlist_len(invlist));
9167
9168     *zero_addr = 0;
9169
9170     /* 1^1 = 0; 1^0 = 1 */
9171     *offset = 1 ^ will_have_0;
9172     return zero_addr + *offset;
9173 }
9174
9175 STATIC void
9176 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9177 {
9178     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9179      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9180      * is similar to what SvSetMagicSV() would do, if it were implemented on
9181      * inversion lists, though this routine avoids a copy */
9182
9183     const UV src_len          = _invlist_len(src);
9184     const bool src_offset     = *get_invlist_offset_addr(src);
9185     const STRLEN src_byte_len = SvLEN(src);
9186     char * array              = SvPVX(src);
9187
9188     const int oldtainted = TAINT_get;
9189
9190     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9191
9192     assert(is_invlist(src));
9193     assert(is_invlist(dest));
9194     assert(! invlist_is_iterating(src));
9195     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9196
9197     /* Make sure it ends in the right place with a NUL, as our inversion list
9198      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9199      * asserts it */
9200     array[src_byte_len - 1] = '\0';
9201
9202     TAINT_NOT;      /* Otherwise it breaks */
9203     sv_usepvn_flags(dest,
9204                     (char *) array,
9205                     src_byte_len - 1,
9206
9207                     /* This flag is documented to cause a copy to be avoided */
9208                     SV_HAS_TRAILING_NUL);
9209     TAINT_set(oldtainted);
9210     SvPV_set(src, 0);
9211     SvLEN_set(src, 0);
9212     SvCUR_set(src, 0);
9213
9214     /* Finish up copying over the other fields in an inversion list */
9215     *get_invlist_offset_addr(dest) = src_offset;
9216     invlist_set_len(dest, src_len, src_offset);
9217     *get_invlist_previous_index_addr(dest) = 0;
9218     invlist_iterfinish(dest);
9219 }
9220
9221 PERL_STATIC_INLINE IV*
9222 S_get_invlist_previous_index_addr(SV* invlist)
9223 {
9224     /* Return the address of the IV that is reserved to hold the cached index
9225      * */
9226     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9227
9228     assert(is_invlist(invlist));
9229
9230     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9231 }
9232
9233 PERL_STATIC_INLINE IV
9234 S_invlist_previous_index(SV* const invlist)
9235 {
9236     /* Returns cached index of previous search */
9237
9238     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9239
9240     return *get_invlist_previous_index_addr(invlist);
9241 }
9242
9243 PERL_STATIC_INLINE void
9244 S_invlist_set_previous_index(SV* const invlist, const IV index)
9245 {
9246     /* Caches <index> for later retrieval */
9247
9248     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9249
9250     assert(index == 0 || index < (int) _invlist_len(invlist));
9251
9252     *get_invlist_previous_index_addr(invlist) = index;
9253 }
9254
9255 PERL_STATIC_INLINE void
9256 S_invlist_trim(SV* invlist)
9257 {
9258     /* Free the not currently-being-used space in an inversion list */
9259
9260     /* But don't free up the space needed for the 0 UV that is always at the
9261      * beginning of the list, nor the trailing NUL */
9262     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9263
9264     PERL_ARGS_ASSERT_INVLIST_TRIM;
9265
9266     assert(is_invlist(invlist));
9267
9268     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9269 }
9270
9271 PERL_STATIC_INLINE void
9272 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9273 {
9274     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9275
9276     assert(is_invlist(invlist));
9277
9278     invlist_set_len(invlist, 0, 0);
9279     invlist_trim(invlist);
9280 }
9281
9282 #endif /* ifndef PERL_IN_XSUB_RE */
9283
9284 PERL_STATIC_INLINE bool
9285 S_invlist_is_iterating(SV* const invlist)
9286 {
9287     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9288
9289     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9290 }
9291
9292 #ifndef PERL_IN_XSUB_RE
9293
9294 PERL_STATIC_INLINE UV
9295 S_invlist_max(SV* const invlist)
9296 {
9297     /* Returns the maximum number of elements storable in the inversion list's
9298      * array, without having to realloc() */
9299
9300     PERL_ARGS_ASSERT_INVLIST_MAX;
9301
9302     assert(is_invlist(invlist));
9303
9304     /* Assumes worst case, in which the 0 element is not counted in the
9305      * inversion list, so subtracts 1 for that */
9306     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9307            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9308            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9309 }
9310
9311 STATIC void
9312 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9313 {
9314     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9315
9316     /* First 1 is in case the zero element isn't in the list; second 1 is for
9317      * trailing NUL */
9318     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9319     invlist_set_len(invlist, 0, 0);
9320
9321     /* Force iterinit() to be used to get iteration to work */
9322     invlist_iterfinish(invlist);
9323
9324     *get_invlist_previous_index_addr(invlist) = 0;
9325     SvPOK_on(invlist);  /* This allows B to extract the PV */
9326 }
9327
9328 SV*
9329 Perl__new_invlist(pTHX_ IV initial_size)
9330 {
9331
9332     /* Return a pointer to a newly constructed inversion list, with enough
9333      * space to store 'initial_size' elements.  If that number is negative, a
9334      * system default is used instead */
9335
9336     SV* new_list;
9337
9338     if (initial_size < 0) {
9339         initial_size = 10;
9340     }
9341
9342     new_list = newSV_type(SVt_INVLIST);
9343     initialize_invlist_guts(new_list, initial_size);
9344
9345     return new_list;
9346 }
9347
9348 SV*
9349 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9350 {
9351     /* Return a pointer to a newly constructed inversion list, initialized to
9352      * point to <list>, which has to be in the exact correct inversion list
9353      * form, including internal fields.  Thus this is a dangerous routine that
9354      * should not be used in the wrong hands.  The passed in 'list' contains
9355      * several header fields at the beginning that are not part of the
9356      * inversion list body proper */
9357
9358     const STRLEN length = (STRLEN) list[0];
9359     const UV version_id =          list[1];
9360     const bool offset   =    cBOOL(list[2]);
9361 #define HEADER_LENGTH 3
9362     /* If any of the above changes in any way, you must change HEADER_LENGTH
9363      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9364      *      perl -E 'say int(rand 2**31-1)'
9365      */
9366 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9367                                         data structure type, so that one being
9368                                         passed in can be validated to be an
9369                                         inversion list of the correct vintage.
9370                                        */
9371
9372     SV* invlist = newSV_type(SVt_INVLIST);
9373
9374     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9375
9376     if (version_id != INVLIST_VERSION_ID) {
9377         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9378     }
9379
9380     /* The generated array passed in includes header elements that aren't part
9381      * of the list proper, so start it just after them */
9382     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9383
9384     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9385                                shouldn't touch it */
9386
9387     *(get_invlist_offset_addr(invlist)) = offset;
9388
9389     /* The 'length' passed to us is the physical number of elements in the
9390      * inversion list.  But if there is an offset the logical number is one
9391      * less than that */
9392     invlist_set_len(invlist, length  - offset, offset);
9393
9394     invlist_set_previous_index(invlist, 0);
9395
9396     /* Initialize the iteration pointer. */
9397     invlist_iterfinish(invlist);
9398
9399     SvREADONLY_on(invlist);
9400     SvPOK_on(invlist);
9401
9402     return invlist;
9403 }
9404
9405 STATIC void
9406 S__append_range_to_invlist(pTHX_ SV* const invlist,
9407                                  const UV start, const UV end)
9408 {
9409    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9410     * the end of the inversion list.  The range must be above any existing
9411     * ones. */
9412
9413     UV* array;
9414     UV max = invlist_max(invlist);
9415     UV len = _invlist_len(invlist);
9416     bool offset;
9417
9418     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9419
9420     if (len == 0) { /* Empty lists must be initialized */
9421         offset = start != 0;
9422         array = _invlist_array_init(invlist, ! offset);
9423     }
9424     else {
9425         /* Here, the existing list is non-empty. The current max entry in the
9426          * list is generally the first value not in the set, except when the
9427          * set extends to the end of permissible values, in which case it is
9428          * the first entry in that final set, and so this call is an attempt to
9429          * append out-of-order */
9430
9431         UV final_element = len - 1;
9432         array = invlist_array(invlist);
9433         if (   array[final_element] > start
9434             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9435         {
9436             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9437                      array[final_element], start,
9438                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9439         }
9440
9441         /* Here, it is a legal append.  If the new range begins 1 above the end
9442          * of the range below it, it is extending the range below it, so the
9443          * new first value not in the set is one greater than the newly
9444          * extended range.  */
9445         offset = *get_invlist_offset_addr(invlist);
9446         if (array[final_element] == start) {
9447             if (end != UV_MAX) {
9448                 array[final_element] = end + 1;
9449             }
9450             else {
9451                 /* But if the end is the maximum representable on the machine,
9452                  * assume that infinity was actually what was meant.  Just let
9453                  * the range that this would extend to have no end */
9454                 invlist_set_len(invlist, len - 1, offset);
9455             }
9456             return;
9457         }
9458     }
9459
9460     /* Here the new range doesn't extend any existing set.  Add it */
9461
9462     len += 2;   /* Includes an element each for the start and end of range */
9463
9464     /* If wll overflow the existing space, extend, which may cause the array to
9465      * be moved */
9466     if (max < len) {
9467         invlist_extend(invlist, len);
9468
9469         /* Have to set len here to avoid assert failure in invlist_array() */
9470         invlist_set_len(invlist, len, offset);
9471
9472         array = invlist_array(invlist);
9473     }
9474     else {
9475         invlist_set_len(invlist, len, offset);
9476     }
9477
9478     /* The next item on the list starts the range, the one after that is
9479      * one past the new range.  */
9480     array[len - 2] = start;
9481     if (end != UV_MAX) {
9482         array[len - 1] = end + 1;
9483     }
9484     else {
9485         /* But if the end is the maximum representable on the machine, just let
9486          * the range have no end */
9487         invlist_set_len(invlist, len - 1, offset);
9488     }
9489 }
9490
9491 SSize_t
9492 Perl__invlist_search(SV* const invlist, const UV cp)
9493 {
9494     /* Searches the inversion list for the entry that contains the input code
9495      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9496      * return value is the index into the list's array of the range that
9497      * contains <cp>, that is, 'i' such that
9498      *  array[i] <= cp < array[i+1]
9499      */
9500
9501     IV low = 0;
9502     IV mid;
9503     IV high = _invlist_len(invlist);
9504     const IV highest_element = high - 1;
9505     const UV* array;
9506
9507     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9508
9509     /* If list is empty, return failure. */
9510     if (high == 0) {
9511         return -1;
9512     }
9513
9514     /* (We can't get the array unless we know the list is non-empty) */
9515     array = invlist_array(invlist);
9516
9517     mid = invlist_previous_index(invlist);
9518     assert(mid >=0);
9519     if (mid > highest_element) {
9520         mid = highest_element;
9521     }
9522
9523     /* <mid> contains the cache of the result of the previous call to this
9524      * function (0 the first time).  See if this call is for the same result,
9525      * or if it is for mid-1.  This is under the theory that calls to this
9526      * function will often be for related code points that are near each other.
9527      * And benchmarks show that caching gives better results.  We also test
9528      * here if the code point is within the bounds of the list.  These tests
9529      * replace others that would have had to be made anyway to make sure that
9530      * the array bounds were not exceeded, and these give us extra information
9531      * at the same time */
9532     if (cp >= array[mid]) {
9533         if (cp >= array[highest_element]) {
9534             return highest_element;
9535         }
9536
9537         /* Here, array[mid] <= cp < array[highest_element].  This means that
9538          * the final element is not the answer, so can exclude it; it also
9539          * means that <mid> is not the final element, so can refer to 'mid + 1'
9540          * safely */
9541         if (cp < array[mid + 1]) {
9542             return mid;
9543         }
9544         high--;
9545         low = mid + 1;
9546     }
9547     else { /* cp < aray[mid] */
9548         if (cp < array[0]) { /* Fail if outside the array */
9549             return -1;
9550         }
9551         high = mid;
9552         if (cp >= array[mid - 1]) {
9553             goto found_entry;
9554         }
9555     }
9556
9557     /* Binary search.  What we are looking for is <i> such that
9558      *  array[i] <= cp < array[i+1]
9559      * The loop below converges on the i+1.  Note that there may not be an
9560      * (i+1)th element in the array, and things work nonetheless */
9561     while (low < high) {
9562         mid = (low + high) / 2;
9563         assert(mid <= highest_element);
9564         if (array[mid] <= cp) { /* cp >= array[mid] */
9565             low = mid + 1;
9566
9567             /* We could do this extra test to exit the loop early.
9568             if (cp < array[low]) {
9569                 return mid;
9570             }
9571             */
9572         }
9573         else { /* cp < array[mid] */
9574             high = mid;
9575         }
9576     }
9577
9578   found_entry:
9579     high--;
9580     invlist_set_previous_index(invlist, high);
9581     return high;
9582 }
9583
9584 void
9585 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9586                                          const bool complement_b, SV** output)
9587 {
9588     /* Take the union of two inversion lists and point '*output' to it.  On
9589      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9590      * even 'a' or 'b').  If to an inversion list, the contents of the original
9591      * list will be replaced by the union.  The first list, 'a', may be
9592      * NULL, in which case a copy of the second list is placed in '*output'.
9593      * If 'complement_b' is TRUE, the union is taken of the complement
9594      * (inversion) of 'b' instead of b itself.
9595      *
9596      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9597      * Richard Gillam, published by Addison-Wesley, and explained at some
9598      * length there.  The preface says to incorporate its examples into your
9599      * code at your own risk.
9600      *
9601      * The algorithm is like a merge sort. */
9602
9603     const UV* array_a;    /* a's array */
9604     const UV* array_b;
9605     UV len_a;       /* length of a's array */
9606     UV len_b;
9607
9608     SV* u;                      /* the resulting union */
9609     UV* array_u;
9610     UV len_u = 0;
9611
9612     UV i_a = 0;             /* current index into a's array */
9613     UV i_b = 0;
9614     UV i_u = 0;
9615
9616     /* running count, as explained in the algorithm source book; items are
9617      * stopped accumulating and are output when the count changes to/from 0.
9618      * The count is incremented when we start a range that's in an input's set,
9619      * and decremented when we start a range that's not in a set.  So this
9620      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9621      * and hence nothing goes into the union; 1, just one of the inputs is in
9622      * its set (and its current range gets added to the union); and 2 when both
9623      * inputs are in their sets.  */
9624     UV count = 0;
9625
9626     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9627     assert(a != b);
9628     assert(*output == NULL || is_invlist(*output));
9629
9630     len_b = _invlist_len(b);
9631     if (len_b == 0) {
9632
9633         /* Here, 'b' is empty, hence it's complement is all possible code
9634          * points.  So if the union includes the complement of 'b', it includes
9635          * everything, and we need not even look at 'a'.  It's easiest to
9636          * create a new inversion list that matches everything.  */
9637         if (complement_b) {
9638             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9639
9640             if (*output == NULL) { /* If the output didn't exist, just point it
9641                                       at the new list */
9642                 *output = everything;
9643             }
9644             else { /* Otherwise, replace its contents with the new list */
9645                 invlist_replace_list_destroys_src(*output, everything);
9646                 SvREFCNT_dec_NN(everything);
9647             }
9648
9649             return;
9650         }
9651
9652         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9653          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9654          * output will be empty */
9655
9656         if (a == NULL || _invlist_len(a) == 0) {
9657             if (*output == NULL) {
9658                 *output = _new_invlist(0);
9659             }
9660             else {
9661                 invlist_clear(*output);
9662             }
9663             return;
9664         }
9665
9666         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9667          * union.  We can just return a copy of 'a' if '*output' doesn't point
9668          * to an existing list */
9669         if (*output == NULL) {
9670             *output = invlist_clone(a, NULL);
9671             return;
9672         }
9673
9674         /* If the output is to overwrite 'a', we have a no-op, as it's
9675          * already in 'a' */
9676         if (*output == a) {
9677             return;
9678         }
9679
9680         /* Here, '*output' is to be overwritten by 'a' */
9681         u = invlist_clone(a, NULL);
9682         invlist_replace_list_destroys_src(*output, u);
9683         SvREFCNT_dec_NN(u);
9684
9685         return;
9686     }
9687
9688     /* Here 'b' is not empty.  See about 'a' */
9689
9690     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9691
9692         /* Here, 'a' is empty (and b is not).  That means the union will come
9693          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9694          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9695          * the clone */
9696
9697         SV ** dest = (*output == NULL) ? output : &u;
9698         *dest = invlist_clone(b, NULL);
9699         if (complement_b) {
9700             _invlist_invert(*dest);
9701         }
9702
9703         if (dest == &u) {
9704             invlist_replace_list_destroys_src(*output, u);
9705             SvREFCNT_dec_NN(u);
9706         }
9707
9708         return;
9709     }
9710
9711     /* Here both lists exist and are non-empty */
9712     array_a = invlist_array(a);
9713     array_b = invlist_array(b);
9714
9715     /* If are to take the union of 'a' with the complement of b, set it
9716      * up so are looking at b's complement. */
9717     if (complement_b) {
9718
9719         /* To complement, we invert: if the first element is 0, remove it.  To
9720          * do this, we just pretend the array starts one later */
9721         if (array_b[0] == 0) {
9722             array_b++;
9723             len_b--;
9724         }
9725         else {
9726
9727             /* But if the first element is not zero, we pretend the list starts
9728              * at the 0 that is always stored immediately before the array. */
9729             array_b--;
9730             len_b++;
9731         }
9732     }
9733
9734     /* Size the union for the worst case: that the sets are completely
9735      * disjoint */
9736     u = _new_invlist(len_a + len_b);
9737
9738     /* Will contain U+0000 if either component does */
9739     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9740                                       || (len_b > 0 && array_b[0] == 0));
9741
9742     /* Go through each input list item by item, stopping when have exhausted
9743      * one of them */
9744     while (i_a < len_a && i_b < len_b) {
9745         UV cp;      /* The element to potentially add to the union's array */
9746         bool cp_in_set;   /* is it in the input list's set or not */
9747
9748         /* We need to take one or the other of the two inputs for the union.
9749          * Since we are merging two sorted lists, we take the smaller of the
9750          * next items.  In case of a tie, we take first the one that is in its
9751          * set.  If we first took the one not in its set, it would decrement
9752          * the count, possibly to 0 which would cause it to be output as ending
9753          * the range, and the next time through we would take the same number,
9754          * and output it again as beginning the next range.  By doing it the
9755          * opposite way, there is no possibility that the count will be
9756          * momentarily decremented to 0, and thus the two adjoining ranges will
9757          * be seamlessly merged.  (In a tie and both are in the set or both not
9758          * in the set, it doesn't matter which we take first.) */
9759         if (       array_a[i_a] < array_b[i_b]
9760             || (   array_a[i_a] == array_b[i_b]
9761                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9762         {
9763             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9764             cp = array_a[i_a++];
9765         }
9766         else {
9767             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9768             cp = array_b[i_b++];
9769         }
9770
9771         /* Here, have chosen which of the two inputs to look at.  Only output
9772          * if the running count changes to/from 0, which marks the
9773          * beginning/end of a range that's in the set */
9774         if (cp_in_set) {
9775             if (count == 0) {
9776                 array_u[i_u++] = cp;
9777             }
9778             count++;
9779         }
9780         else {
9781             count--;
9782             if (count == 0) {
9783                 array_u[i_u++] = cp;
9784             }
9785         }
9786     }
9787
9788
9789     /* The loop above increments the index into exactly one of the input lists
9790      * each iteration, and ends when either index gets to its list end.  That
9791      * means the other index is lower than its end, and so something is
9792      * remaining in that one.  We decrement 'count', as explained below, if
9793      * that list is in its set.  (i_a and i_b each currently index the element
9794      * beyond the one we care about.) */
9795     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9796         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9797     {
9798         count--;
9799     }
9800
9801     /* Above we decremented 'count' if the list that had unexamined elements in
9802      * it was in its set.  This has made it so that 'count' being non-zero
9803      * means there isn't anything left to output; and 'count' equal to 0 means
9804      * that what is left to output is precisely that which is left in the
9805      * non-exhausted input list.
9806      *
9807      * To see why, note first that the exhausted input obviously has nothing
9808      * left to add to the union.  If it was in its set at its end, that means
9809      * the set extends from here to the platform's infinity, and hence so does
9810      * the union and the non-exhausted set is irrelevant.  The exhausted set
9811      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9812      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9813      * 'count' remains at 1.  This is consistent with the decremented 'count'
9814      * != 0 meaning there's nothing left to add to the union.
9815      *
9816      * But if the exhausted input wasn't in its set, it contributed 0 to
9817      * 'count', and the rest of the union will be whatever the other input is.
9818      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9819      * otherwise it gets decremented to 0.  This is consistent with 'count'
9820      * == 0 meaning the remainder of the union is whatever is left in the
9821      * non-exhausted list. */
9822     if (count != 0) {
9823         len_u = i_u;
9824     }
9825     else {
9826         IV copy_count = len_a - i_a;
9827         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9828             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9829         }
9830         else { /* The non-exhausted input is b */
9831             copy_count = len_b - i_b;
9832             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9833         }
9834         len_u = i_u + copy_count;
9835     }
9836
9837     /* Set the result to the final length, which can change the pointer to
9838      * array_u, so re-find it.  (Note that it is unlikely that this will
9839      * change, as we are shrinking the space, not enlarging it) */
9840     if (len_u != _invlist_len(u)) {
9841         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9842         invlist_trim(u);
9843         array_u = invlist_array(u);
9844     }
9845
9846     if (*output == NULL) {  /* Simply return the new inversion list */
9847         *output = u;
9848     }
9849     else {
9850         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9851          * could instead free '*output', and then set it to 'u', but experience
9852          * has shown [perl #127392] that if the input is a mortal, we can get a
9853          * huge build-up of these during regex compilation before they get
9854          * freed. */
9855         invlist_replace_list_destroys_src(*output, u);
9856         SvREFCNT_dec_NN(u);
9857     }
9858
9859     return;
9860 }
9861
9862 void
9863 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9864                                                const bool complement_b, SV** i)
9865 {
9866     /* Take the intersection of two inversion lists and point '*i' to it.  On
9867      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9868      * even 'a' or 'b').  If to an inversion list, the contents of the original
9869      * list will be replaced by the intersection.  The first list, 'a', may be
9870      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9871      * TRUE, the result will be the intersection of 'a' and the complement (or
9872      * inversion) of 'b' instead of 'b' directly.
9873      *
9874      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9875      * Richard Gillam, published by Addison-Wesley, and explained at some
9876      * length there.  The preface says to incorporate its examples into your
9877      * code at your own risk.  In fact, it had bugs
9878      *
9879      * The algorithm is like a merge sort, and is essentially the same as the
9880      * union above
9881      */
9882
9883     const UV* array_a;          /* a's array */
9884     const UV* array_b;
9885     UV len_a;   /* length of a's array */
9886     UV len_b;
9887
9888     SV* r;                   /* the resulting intersection */
9889     UV* array_r;
9890     UV len_r = 0;
9891
9892     UV i_a = 0;             /* current index into a's array */
9893     UV i_b = 0;
9894     UV i_r = 0;
9895
9896     /* running count of how many of the two inputs are postitioned at ranges
9897      * that are in their sets.  As explained in the algorithm source book,
9898      * items are stopped accumulating and are output when the count changes
9899      * to/from 2.  The count is incremented when we start a range that's in an
9900      * input's set, and decremented when we start a range that's not in a set.
9901      * Only when it is 2 are we in the intersection. */
9902     UV count = 0;
9903
9904     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9905     assert(a != b);
9906     assert(*i == NULL || is_invlist(*i));
9907
9908     /* Special case if either one is empty */
9909     len_a = (a == NULL) ? 0 : _invlist_len(a);
9910     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9911         if (len_a != 0 && complement_b) {
9912
9913             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9914              * must be empty.  Here, also we are using 'b's complement, which
9915              * hence must be every possible code point.  Thus the intersection
9916              * is simply 'a'. */
9917
9918             if (*i == a) {  /* No-op */
9919                 return;
9920             }
9921
9922             if (*i == NULL) {
9923                 *i = invlist_clone(a, NULL);
9924                 return;
9925             }
9926
9927             r = invlist_clone(a, NULL);
9928             invlist_replace_list_destroys_src(*i, r);
9929             SvREFCNT_dec_NN(r);
9930             return;
9931         }
9932
9933         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9934          * intersection must be empty */
9935         if (*i == NULL) {
9936             *i = _new_invlist(0);
9937             return;
9938         }
9939
9940         invlist_clear(*i);
9941         return;
9942     }
9943
9944     /* Here both lists exist and are non-empty */
9945     array_a = invlist_array(a);
9946     array_b = invlist_array(b);
9947
9948     /* If are to take the intersection of 'a' with the complement of b, set it
9949      * up so are looking at b's complement. */
9950     if (complement_b) {
9951
9952         /* To complement, we invert: if the first element is 0, remove it.  To
9953          * do this, we just pretend the array starts one later */
9954         if (array_b[0] == 0) {
9955             array_b++;
9956             len_b--;
9957         }
9958         else {
9959
9960             /* But if the first element is not zero, we pretend the list starts
9961              * at the 0 that is always stored immediately before the array. */
9962             array_b--;
9963             len_b++;
9964         }
9965     }
9966
9967     /* Size the intersection for the worst case: that the intersection ends up
9968      * fragmenting everything to be completely disjoint */
9969     r= _new_invlist(len_a + len_b);
9970
9971     /* Will contain U+0000 iff both components do */
9972     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9973                                      && len_b > 0 && array_b[0] == 0);
9974
9975     /* Go through each list item by item, stopping when have exhausted one of
9976      * them */
9977     while (i_a < len_a && i_b < len_b) {
9978         UV cp;      /* The element to potentially add to the intersection's
9979                        array */
9980         bool cp_in_set; /* Is it in the input list's set or not */
9981
9982         /* We need to take one or the other of the two inputs for the
9983          * intersection.  Since we are merging two sorted lists, we take the
9984          * smaller of the next items.  In case of a tie, we take first the one
9985          * that is not in its set (a difference from the union algorithm).  If
9986          * we first took the one in its set, it would increment the count,
9987          * possibly to 2 which would cause it to be output as starting a range
9988          * in the intersection, and the next time through we would take that
9989          * same number, and output it again as ending the set.  By doing the
9990          * opposite of this, there is no possibility that the count will be
9991          * momentarily incremented to 2.  (In a tie and both are in the set or
9992          * both not in the set, it doesn't matter which we take first.) */
9993         if (       array_a[i_a] < array_b[i_b]
9994             || (   array_a[i_a] == array_b[i_b]
9995                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9996         {
9997             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9998             cp = array_a[i_a++];
9999         }
10000         else {
10001             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10002             cp= array_b[i_b++];
10003         }
10004
10005         /* Here, have chosen which of the two inputs to look at.  Only output
10006          * if the running count changes to/from 2, which marks the
10007          * beginning/end of a range that's in the intersection */
10008         if (cp_in_set) {
10009             count++;
10010             if (count == 2) {
10011                 array_r[i_r++] = cp;
10012             }
10013         }
10014         else {
10015             if (count == 2) {
10016                 array_r[i_r++] = cp;
10017             }
10018             count--;
10019         }
10020
10021     }
10022
10023     /* The loop above increments the index into exactly one of the input lists
10024      * each iteration, and ends when either index gets to its list end.  That
10025      * means the other index is lower than its end, and so something is
10026      * remaining in that one.  We increment 'count', as explained below, if the
10027      * exhausted list was in its set.  (i_a and i_b each currently index the
10028      * element beyond the one we care about.) */
10029     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10030         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10031     {
10032         count++;
10033     }
10034
10035     /* Above we incremented 'count' if the exhausted list was in its set.  This
10036      * has made it so that 'count' being below 2 means there is nothing left to
10037      * output; otheriwse what's left to add to the intersection is precisely
10038      * that which is left in the non-exhausted input list.
10039      *
10040      * To see why, note first that the exhausted input obviously has nothing
10041      * left to affect the intersection.  If it was in its set at its end, that
10042      * means the set extends from here to the platform's infinity, and hence
10043      * anything in the non-exhausted's list will be in the intersection, and
10044      * anything not in it won't be.  Hence, the rest of the intersection is
10045      * precisely what's in the non-exhausted list  The exhausted set also
10046      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
10047      * it means 'count' is now at least 2.  This is consistent with the
10048      * incremented 'count' being >= 2 means to add the non-exhausted list to
10049      * the intersection.
10050      *
10051      * But if the exhausted input wasn't in its set, it contributed 0 to
10052      * 'count', and the intersection can't include anything further; the
10053      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
10054      * incremented.  This is consistent with 'count' being < 2 meaning nothing
10055      * further to add to the intersection. */
10056     if (count < 2) { /* Nothing left to put in the intersection. */
10057         len_r = i_r;
10058     }
10059     else { /* copy the non-exhausted list, unchanged. */
10060         IV copy_count = len_a - i_a;
10061         if (copy_count > 0) {   /* a is the one with stuff left */
10062             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10063         }
10064         else {  /* b is the one with stuff left */
10065             copy_count = len_b - i_b;
10066             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10067         }
10068         len_r = i_r + copy_count;
10069     }
10070
10071     /* Set the result to the final length, which can change the pointer to
10072      * array_r, so re-find it.  (Note that it is unlikely that this will
10073      * change, as we are shrinking the space, not enlarging it) */
10074     if (len_r != _invlist_len(r)) {
10075         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10076         invlist_trim(r);
10077         array_r = invlist_array(r);
10078     }
10079
10080     if (*i == NULL) { /* Simply return the calculated intersection */
10081         *i = r;
10082     }
10083     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
10084               instead free '*i', and then set it to 'r', but experience has
10085               shown [perl #127392] that if the input is a mortal, we can get a
10086               huge build-up of these during regex compilation before they get
10087               freed. */
10088         if (len_r) {
10089             invlist_replace_list_destroys_src(*i, r);
10090         }
10091         else {
10092             invlist_clear(*i);
10093         }
10094         SvREFCNT_dec_NN(r);
10095     }
10096
10097     return;
10098 }
10099
10100 SV*
10101 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10102 {
10103     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10104      * set.  A pointer to the inversion list is returned.  This may actually be
10105      * a new list, in which case the passed in one has been destroyed.  The
10106      * passed-in inversion list can be NULL, in which case a new one is created
10107      * with just the one range in it.  The new list is not necessarily
10108      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10109      * result of this function.  The gain would not be large, and in many
10110      * cases, this is called multiple times on a single inversion list, so
10111      * anything freed may almost immediately be needed again.
10112      *
10113      * This used to mostly call the 'union' routine, but that is much more
10114      * heavyweight than really needed for a single range addition */
10115
10116     UV* array;              /* The array implementing the inversion list */
10117     UV len;                 /* How many elements in 'array' */
10118     SSize_t i_s;            /* index into the invlist array where 'start'
10119                                should go */
10120     SSize_t i_e = 0;        /* And the index where 'end' should go */
10121     UV cur_highest;         /* The highest code point in the inversion list
10122                                upon entry to this function */
10123
10124     /* This range becomes the whole inversion list if none already existed */
10125     if (invlist == NULL) {
10126         invlist = _new_invlist(2);
10127         _append_range_to_invlist(invlist, start, end);
10128         return invlist;
10129     }
10130
10131     /* Likewise, if the inversion list is currently empty */
10132     len = _invlist_len(invlist);
10133     if (len == 0) {
10134         _append_range_to_invlist(invlist, start, end);
10135         return invlist;
10136     }
10137
10138     /* Starting here, we have to know the internals of the list */
10139     array = invlist_array(invlist);
10140
10141     /* If the new range ends higher than the current highest ... */
10142     cur_highest = invlist_highest(invlist);
10143     if (end > cur_highest) {
10144
10145         /* If the whole range is higher, we can just append it */
10146         if (start > cur_highest) {
10147             _append_range_to_invlist(invlist, start, end);
10148             return invlist;
10149         }
10150
10151         /* Otherwise, add the portion that is higher ... */
10152         _append_range_to_invlist(invlist, cur_highest + 1, end);
10153
10154         /* ... and continue on below to handle the rest.  As a result of the
10155          * above append, we know that the index of the end of the range is the
10156          * final even numbered one of the array.  Recall that the final element
10157          * always starts a range that extends to infinity.  If that range is in
10158          * the set (meaning the set goes from here to infinity), it will be an
10159          * even index, but if it isn't in the set, it's odd, and the final
10160          * range in the set is one less, which is even. */
10161         if (end == UV_MAX) {
10162             i_e = len;
10163         }
10164         else {
10165             i_e = len - 2;
10166         }
10167     }
10168
10169     /* We have dealt with appending, now see about prepending.  If the new
10170      * range starts lower than the current lowest ... */
10171     if (start < array[0]) {
10172
10173         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10174          * Let the union code handle it, rather than having to know the
10175          * trickiness in two code places.  */
10176         if (UNLIKELY(start == 0)) {
10177             SV* range_invlist;
10178
10179             range_invlist = _new_invlist(2);
10180             _append_range_to_invlist(range_invlist, start, end);
10181
10182             _invlist_union(invlist, range_invlist, &invlist);
10183
10184             SvREFCNT_dec_NN(range_invlist);
10185
10186             return invlist;
10187         }
10188
10189         /* If the whole new range comes before the first entry, and doesn't
10190          * extend it, we have to insert it as an additional range */
10191         if (end < array[0] - 1) {
10192             i_s = i_e = -1;
10193             goto splice_in_new_range;
10194         }
10195
10196         /* Here the new range adjoins the existing first range, extending it
10197          * downwards. */
10198         array[0] = start;
10199
10200         /* And continue on below to handle the rest.  We know that the index of
10201          * the beginning of the range is the first one of the array */
10202         i_s = 0;
10203     }
10204     else { /* Not prepending any part of the new range to the existing list.
10205             * Find where in the list it should go.  This finds i_s, such that:
10206             *     invlist[i_s] <= start < array[i_s+1]
10207             */
10208         i_s = _invlist_search(invlist, start);
10209     }
10210
10211     /* At this point, any extending before the beginning of the inversion list
10212      * and/or after the end has been done.  This has made it so that, in the
10213      * code below, each endpoint of the new range is either in a range that is
10214      * in the set, or is in a gap between two ranges that are.  This means we
10215      * don't have to worry about exceeding the array bounds.
10216      *
10217      * Find where in the list the new range ends (but we can skip this if we
10218      * have already determined what it is, or if it will be the same as i_s,
10219      * which we already have computed) */
10220     if (i_e == 0) {
10221         i_e = (start == end)
10222               ? i_s
10223               : _invlist_search(invlist, end);
10224     }
10225
10226     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10227      * is a range that goes to infinity there is no element at invlist[i_e+1],
10228      * so only the first relation holds. */
10229
10230     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10231
10232         /* Here, the ranges on either side of the beginning of the new range
10233          * are in the set, and this range starts in the gap between them.
10234          *
10235          * The new range extends the range above it downwards if the new range
10236          * ends at or above that range's start */
10237         const bool extends_the_range_above = (   end == UV_MAX
10238                                               || end + 1 >= array[i_s+1]);
10239
10240         /* The new range extends the range below it upwards if it begins just
10241          * after where that range ends */
10242         if (start == array[i_s]) {
10243
10244             /* If the new range fills the entire gap between the other ranges,
10245              * they will get merged together.  Other ranges may also get
10246              * merged, depending on how many of them the new range spans.  In
10247              * the general case, we do the merge later, just once, after we
10248              * figure out how many to merge.  But in the case where the new
10249              * range exactly spans just this one gap (possibly extending into
10250              * the one above), we do the merge here, and an early exit.  This
10251              * is done here to avoid having to special case later. */
10252             if (i_e - i_s <= 1) {
10253
10254                 /* If i_e - i_s == 1, it means that the new range terminates
10255                  * within the range above, and hence 'extends_the_range_above'
10256                  * must be true.  (If the range above it extends to infinity,
10257                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10258                  * will be 0, so no harm done.) */
10259                 if (extends_the_range_above) {
10260                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10261                     invlist_set_len(invlist,
10262                                     len - 2,
10263                                     *(get_invlist_offset_addr(invlist)));
10264                     return invlist;
10265                 }
10266
10267                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10268                  * to the same range, and below we are about to decrement i_s
10269                  * */
10270                 i_e--;
10271             }
10272
10273             /* Here, the new range is adjacent to the one below.  (It may also
10274              * span beyond the range above, but that will get resolved later.)
10275              * Extend the range below to include this one. */
10276             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10277             i_s--;
10278             start = array[i_s];
10279         }
10280         else if (extends_the_range_above) {
10281
10282             /* Here the new range only extends the range above it, but not the
10283              * one below.  It merges with the one above.  Again, we keep i_e
10284              * and i_s in sync if they point to the same range */
10285             if (i_e == i_s) {
10286                 i_e++;
10287             }
10288             i_s++;
10289             array[i_s] = start;
10290         }
10291     }
10292
10293     /* Here, we've dealt with the new range start extending any adjoining
10294      * existing ranges.
10295      *
10296      * If the new range extends to infinity, it is now the final one,
10297      * regardless of what was there before */
10298     if (UNLIKELY(end == UV_MAX)) {
10299         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10300         return invlist;
10301     }
10302
10303     /* If i_e started as == i_s, it has also been dealt with,
10304      * and been updated to the new i_s, which will fail the following if */
10305     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10306
10307         /* Here, the ranges on either side of the end of the new range are in
10308          * the set, and this range ends in the gap between them.
10309          *
10310          * If this range is adjacent to (hence extends) the range above it, it
10311          * becomes part of that range; likewise if it extends the range below,
10312          * it becomes part of that range */
10313         if (end + 1 == array[i_e+1]) {
10314             i_e++;
10315             array[i_e] = start;
10316         }
10317         else if (start <= array[i_e]) {
10318             array[i_e] = end + 1;
10319             i_e--;
10320         }
10321     }
10322
10323     if (i_s == i_e) {
10324
10325         /* If the range fits entirely in an existing range (as possibly already
10326          * extended above), it doesn't add anything new */
10327         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10328             return invlist;
10329         }
10330
10331         /* Here, no part of the range is in the list.  Must add it.  It will
10332          * occupy 2 more slots */
10333       splice_in_new_range:
10334
10335         invlist_extend(invlist, len + 2);
10336         array = invlist_array(invlist);
10337         /* Move the rest of the array down two slots. Don't include any
10338          * trailing NUL */
10339         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10340
10341         /* Do the actual splice */
10342         array[i_e+1] = start;
10343         array[i_e+2] = end + 1;
10344         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10345         return invlist;
10346     }
10347
10348     /* Here the new range crossed the boundaries of a pre-existing range.  The
10349      * code above has adjusted things so that both ends are in ranges that are
10350      * in the set.  This means everything in between must also be in the set.
10351      * Just squash things together */
10352     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10353     invlist_set_len(invlist,
10354                     len - i_e + i_s,
10355                     *(get_invlist_offset_addr(invlist)));
10356
10357     return invlist;
10358 }
10359
10360 SV*
10361 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10362                                  UV** other_elements_ptr)
10363 {
10364     /* Create and return an inversion list whose contents are to be populated
10365      * by the caller.  The caller gives the number of elements (in 'size') and
10366      * the very first element ('element0').  This function will set
10367      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10368      * are to be placed.
10369      *
10370      * Obviously there is some trust involved that the caller will properly
10371      * fill in the other elements of the array.
10372      *
10373      * (The first element needs to be passed in, as the underlying code does
10374      * things differently depending on whether it is zero or non-zero) */
10375
10376     SV* invlist = _new_invlist(size);
10377     bool offset;
10378
10379     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10380
10381     invlist = add_cp_to_invlist(invlist, element0);
10382     offset = *get_invlist_offset_addr(invlist);
10383
10384     invlist_set_len(invlist, size, offset);
10385     *other_elements_ptr = invlist_array(invlist) + 1;
10386     return invlist;
10387 }
10388
10389 #endif
10390
10391 #ifndef PERL_IN_XSUB_RE
10392 void
10393 Perl__invlist_invert(pTHX_ SV* const invlist)
10394 {
10395     /* Complement the input inversion list.  This adds a 0 if the list didn't
10396      * have a zero; removes it otherwise.  As described above, the data
10397      * structure is set up so that this is very efficient */
10398
10399     PERL_ARGS_ASSERT__INVLIST_INVERT;
10400
10401     assert(! invlist_is_iterating(invlist));
10402
10403     /* The inverse of matching nothing is matching everything */
10404     if (_invlist_len(invlist) == 0) {
10405         _append_range_to_invlist(invlist, 0, UV_MAX);
10406         return;
10407     }
10408
10409     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10410 }
10411
10412 SV*
10413 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10414 {
10415     /* Return a new inversion list that is a copy of the input one, which is
10416      * unchanged.  The new list will not be mortal even if the old one was. */
10417
10418     const STRLEN nominal_length = _invlist_len(invlist);
10419     const STRLEN physical_length = SvCUR(invlist);
10420     const bool offset = *(get_invlist_offset_addr(invlist));
10421
10422     PERL_ARGS_ASSERT_INVLIST_CLONE;
10423
10424     if (new_invlist == NULL) {
10425         new_invlist = _new_invlist(nominal_length);
10426     }
10427     else {
10428         sv_upgrade(new_invlist, SVt_INVLIST);
10429         initialize_invlist_guts(new_invlist, nominal_length);
10430     }
10431
10432     *(get_invlist_offset_addr(new_invlist)) = offset;
10433     invlist_set_len(new_invlist, nominal_length, offset);
10434     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10435
10436     return new_invlist;
10437 }
10438
10439 #endif
10440
10441 PERL_STATIC_INLINE UV
10442 S_invlist_lowest(SV* const invlist)
10443 {
10444     /* Returns the lowest code point that matches an inversion list.  This API
10445      * has an ambiguity, as it returns 0 under either the lowest is actually
10446      * 0, or if the list is empty.  If this distinction matters to you, check
10447      * for emptiness before calling this function */
10448
10449     UV len = _invlist_len(invlist);
10450     UV *array;
10451
10452     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10453
10454     if (len == 0) {
10455         return 0;
10456     }
10457
10458     array = invlist_array(invlist);
10459
10460     return array[0];
10461 }
10462
10463 STATIC SV *
10464 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10465 {
10466     /* Get the contents of an inversion list into a string SV so that they can
10467      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10468      * traditionally done for debug tracing; otherwise it uses a format
10469      * suitable for just copying to the output, with blanks between ranges and
10470      * a dash between range components */
10471
10472     UV start, end;
10473     SV* output;
10474     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10475     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10476
10477     if (traditional_style) {
10478         output = newSVpvs("\n");
10479     }
10480     else {
10481         output = newSVpvs("");
10482     }
10483
10484     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10485
10486     assert(! invlist_is_iterating(invlist));
10487
10488     invlist_iterinit(invlist);
10489     while (invlist_iternext(invlist, &start, &end)) {
10490         if (end == UV_MAX) {
10491             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10492                                           start, intra_range_delimiter,
10493                                                  inter_range_delimiter);
10494         }
10495         else if (end != start) {
10496             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10497                                           start,
10498                                                    intra_range_delimiter,
10499                                                   end, inter_range_delimiter);
10500         }
10501         else {
10502             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10503                                           start, inter_range_delimiter);
10504         }
10505     }
10506
10507     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10508         SvCUR_set(output, SvCUR(output) - 1);
10509     }
10510
10511     return output;
10512 }
10513
10514 #ifndef PERL_IN_XSUB_RE
10515 void
10516 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10517                          const char * const indent, SV* const invlist)
10518 {
10519     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10520      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10521      * the string 'indent'.  The output looks like this:
10522          [0] 0x000A .. 0x000D
10523          [2] 0x0085
10524          [4] 0x2028 .. 0x2029
10525          [6] 0x3104 .. INFTY
10526      * This means that the first range of code points matched by the list are
10527      * 0xA through 0xD; the second range contains only the single code point
10528      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10529      * are used to define each range (except if the final range extends to
10530      * infinity, only a single element is needed).  The array index of the
10531      * first element for the corresponding range is given in brackets. */
10532
10533     UV start, end;
10534     STRLEN count = 0;
10535
10536     PERL_ARGS_ASSERT__INVLIST_DUMP;
10537
10538     if (invlist_is_iterating(invlist)) {
10539         Perl_dump_indent(aTHX_ level, file,
10540              "%sCan't dump inversion list because is in middle of iterating\n",
10541              indent);
10542         return;
10543     }
10544
10545     invlist_iterinit(invlist);
10546     while (invlist_iternext(invlist, &start, &end)) {
10547         if (end == UV_MAX) {
10548             Perl_dump_indent(aTHX_ level, file,
10549                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10550                                    indent, (UV)count, start);
10551         }
10552         else if (end != start) {
10553             Perl_dump_indent(aTHX_ level, file,
10554                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10555                                 indent, (UV)count, start,         end);
10556         }
10557         else {
10558             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10559                                             indent, (UV)count, start);
10560         }
10561         count += 2;
10562     }
10563 }
10564
10565 #endif
10566
10567 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10568 bool
10569 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10570 {
10571     /* Return a boolean as to if the two passed in inversion lists are
10572      * identical.  The final argument, if TRUE, says to take the complement of
10573      * the second inversion list before doing the comparison */
10574
10575     const UV len_a = _invlist_len(a);
10576     UV len_b = _invlist_len(b);
10577
10578     const UV* array_a = NULL;
10579     const UV* array_b = NULL;
10580
10581     PERL_ARGS_ASSERT__INVLISTEQ;
10582
10583     /* This code avoids accessing the arrays unless it knows the length is
10584      * non-zero */
10585
10586     if (len_a == 0) {
10587         if (len_b == 0) {
10588             return ! complement_b;
10589         }
10590     }
10591     else {
10592         array_a = invlist_array(a);
10593     }
10594
10595     if (len_b != 0) {
10596         array_b = invlist_array(b);
10597     }
10598
10599     /* If are to compare 'a' with the complement of b, set it
10600      * up so are looking at b's complement. */
10601     if (complement_b) {
10602
10603         /* The complement of nothing is everything, so <a> would have to have
10604          * just one element, starting at zero (ending at infinity) */
10605         if (len_b == 0) {
10606             return (len_a == 1 && array_a[0] == 0);
10607         }
10608         if (array_b[0] == 0) {
10609
10610             /* Otherwise, to complement, we invert.  Here, the first element is
10611              * 0, just remove it.  To do this, we just pretend the array starts
10612              * one later */
10613
10614             array_b++;
10615             len_b--;
10616         }
10617         else {
10618
10619             /* But if the first element is not zero, we pretend the list starts
10620              * at the 0 that is always stored immediately before the array. */
10621             array_b--;
10622             len_b++;
10623         }
10624     }
10625
10626     return    len_a == len_b
10627            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10628
10629 }
10630 #endif
10631
10632 /*
10633  * As best we can, determine the characters that can match the start of
10634  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10635  * can be false positive matches
10636  *
10637  * Returns the invlist as a new SV*; it is the caller's responsibility to
10638  * call SvREFCNT_dec() when done with it.
10639  */
10640 STATIC SV*
10641 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10642 {
10643     const U8 * s = (U8*)STRING(node);
10644     SSize_t bytelen = STR_LEN(node);
10645     UV uc;
10646     /* Start out big enough for 2 separate code points */
10647     SV* invlist = _new_invlist(4);
10648
10649     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10650
10651     if (! UTF) {
10652         uc = *s;
10653
10654         /* We punt and assume can match anything if the node begins
10655          * with a multi-character fold.  Things are complicated.  For
10656          * example, /ffi/i could match any of:
10657          *  "\N{LATIN SMALL LIGATURE FFI}"
10658          *  "\N{LATIN SMALL LIGATURE FF}I"
10659          *  "F\N{LATIN SMALL LIGATURE FI}"
10660          *  plus several other things; and making sure we have all the
10661          *  possibilities is hard. */
10662         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10663             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10664         }
10665         else {
10666             /* Any Latin1 range character can potentially match any
10667              * other depending on the locale, and in Turkic locales, U+130 and
10668              * U+131 */
10669             if (OP(node) == EXACTFL) {
10670                 _invlist_union(invlist, PL_Latin1, &invlist);
10671                 invlist = add_cp_to_invlist(invlist,
10672                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10673                 invlist = add_cp_to_invlist(invlist,
10674                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10675             }
10676             else {
10677                 /* But otherwise, it matches at least itself.  We can
10678                  * quickly tell if it has a distinct fold, and if so,
10679                  * it matches that as well */
10680                 invlist = add_cp_to_invlist(invlist, uc);
10681                 if (IS_IN_SOME_FOLD_L1(uc))
10682                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10683             }
10684
10685             /* Some characters match above-Latin1 ones under /i.  This
10686              * is true of EXACTFL ones when the locale is UTF-8 */
10687             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10688                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10689                                     && OP(node) != EXACTFAA_NO_TRIE)))
10690             {
10691                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10692             }
10693         }
10694     }
10695     else {  /* Pattern is UTF-8 */
10696         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10697         const U8* e = s + bytelen;
10698         IV fc;
10699
10700         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10701
10702         /* The only code points that aren't folded in a UTF EXACTFish
10703          * node are the problematic ones in EXACTFL nodes */
10704         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10705             /* We need to check for the possibility that this EXACTFL
10706              * node begins with a multi-char fold.  Therefore we fold
10707              * the first few characters of it so that we can make that
10708              * check */
10709             U8 *d = folded;
10710             int i;
10711
10712             fc = -1;
10713             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10714                 if (isASCII(*s)) {
10715                     *(d++) = (U8) toFOLD(*s);
10716                     if (fc < 0) {       /* Save the first fold */
10717                         fc = *(d-1);
10718                     }
10719                     s++;
10720                 }
10721                 else {
10722                     STRLEN len;
10723                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10724                     if (fc < 0) {       /* Save the first fold */
10725                         fc = fold;
10726                     }
10727                     d += len;
10728                     s += UTF8SKIP(s);
10729                 }
10730             }
10731
10732             /* And set up so the code below that looks in this folded
10733              * buffer instead of the node's string */
10734             e = d;
10735             s = folded;
10736         }
10737
10738         /* When we reach here 's' points to the fold of the first
10739          * character(s) of the node; and 'e' points to far enough along
10740          * the folded string to be just past any possible multi-char
10741          * fold.
10742          *
10743          * Unlike the non-UTF-8 case, the macro for determining if a
10744          * string is a multi-char fold requires all the characters to
10745          * already be folded.  This is because of all the complications
10746          * if not.  Note that they are folded anyway, except in EXACTFL
10747          * nodes.  Like the non-UTF case above, we punt if the node
10748          * begins with a multi-char fold  */
10749
10750         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10751             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10752         }
10753         else {  /* Single char fold */
10754             unsigned int k;
10755             U32 first_fold;
10756             const U32 * remaining_folds;
10757             Size_t folds_count;
10758
10759             /* It matches itself */
10760             invlist = add_cp_to_invlist(invlist, fc);
10761
10762             /* ... plus all the things that fold to it, which are found in
10763              * PL_utf8_foldclosures */
10764             folds_count = _inverse_folds(fc, &first_fold,
10765                                                 &remaining_folds);
10766             for (k = 0; k < folds_count; k++) {
10767                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10768
10769                 /* /aa doesn't allow folds between ASCII and non- */
10770                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10771                     && isASCII(c) != isASCII(fc))
10772                 {
10773                     continue;
10774                 }
10775
10776                 invlist = add_cp_to_invlist(invlist, c);
10777             }
10778
10779             if (OP(node) == EXACTFL) {
10780
10781                 /* If either [iI] are present in an EXACTFL node the above code
10782                  * should have added its normal case pair, but under a Turkish
10783                  * locale they could match instead the case pairs from it.  Add
10784                  * those as potential matches as well */
10785                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10786                     invlist = add_cp_to_invlist(invlist,
10787                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10788                     invlist = add_cp_to_invlist(invlist,
10789                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10790                 }
10791                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10792                     invlist = add_cp_to_invlist(invlist, 'I');
10793                 }
10794                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10795                     invlist = add_cp_to_invlist(invlist, 'i');
10796                 }
10797             }
10798         }
10799     }
10800
10801     return invlist;
10802 }
10803
10804 #undef HEADER_LENGTH
10805 #undef TO_INTERNAL_SIZE
10806 #undef FROM_INTERNAL_SIZE
10807 #undef INVLIST_VERSION_ID
10808
10809 /* End of inversion list object */
10810
10811 STATIC void
10812 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10813 {
10814     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10815      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10816      * should point to the first flag; it is updated on output to point to the
10817      * final ')' or ':'.  There needs to be at least one flag, or this will
10818      * abort */
10819
10820     /* for (?g), (?gc), and (?o) warnings; warning
10821        about (?c) will warn about (?g) -- japhy    */
10822
10823 #define WASTED_O  0x01
10824 #define WASTED_G  0x02
10825 #define WASTED_C  0x04
10826 #define WASTED_GC (WASTED_G|WASTED_C)
10827     I32 wastedflags = 0x00;
10828     U32 posflags = 0, negflags = 0;
10829     U32 *flagsp = &posflags;
10830     char has_charset_modifier = '\0';
10831     regex_charset cs;
10832     bool has_use_defaults = FALSE;
10833     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10834     int x_mod_count = 0;
10835
10836     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10837
10838     /* '^' as an initial flag sets certain defaults */
10839     if (UCHARAT(RExC_parse) == '^') {
10840         RExC_parse++;
10841         has_use_defaults = TRUE;
10842         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10843         cs = (RExC_uni_semantics)
10844              ? REGEX_UNICODE_CHARSET
10845              : REGEX_DEPENDS_CHARSET;
10846         set_regex_charset(&RExC_flags, cs);
10847     }
10848     else {
10849         cs = get_regex_charset(RExC_flags);
10850         if (   cs == REGEX_DEPENDS_CHARSET
10851             && RExC_uni_semantics)
10852         {
10853             cs = REGEX_UNICODE_CHARSET;
10854         }
10855     }
10856
10857     while (RExC_parse < RExC_end) {
10858         /* && memCHRs("iogcmsx", *RExC_parse) */
10859         /* (?g), (?gc) and (?o) are useless here
10860            and must be globally applied -- japhy */
10861         if ((RExC_pm_flags & PMf_WILDCARD)) {
10862             if (flagsp == & negflags) {
10863                 if (*RExC_parse == 'm') {
10864                     RExC_parse++;
10865                     /* diag_listed_as: Use of %s is not allowed in Unicode
10866                        property wildcard subpatterns in regex; marked by <--
10867                        HERE in m/%s/ */
10868                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
10869                           " property wildcard subpatterns");
10870                 }
10871             }
10872             else {
10873                 if (*RExC_parse == 's') {
10874                     goto modifier_illegal_in_wildcard;
10875                 }
10876             }
10877         }
10878
10879         switch (*RExC_parse) {
10880
10881             /* Code for the imsxn flags */
10882             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10883
10884             case LOCALE_PAT_MOD:
10885                 if (has_charset_modifier) {
10886                     goto excess_modifier;
10887                 }
10888                 else if (flagsp == &negflags) {
10889                     goto neg_modifier;
10890                 }
10891                 cs = REGEX_LOCALE_CHARSET;
10892                 has_charset_modifier = LOCALE_PAT_MOD;
10893                 break;
10894             case UNICODE_PAT_MOD:
10895                 if (has_charset_modifier) {
10896                     goto excess_modifier;
10897                 }
10898                 else if (flagsp == &negflags) {
10899                     goto neg_modifier;
10900                 }
10901                 cs = REGEX_UNICODE_CHARSET;
10902                 has_charset_modifier = UNICODE_PAT_MOD;
10903                 break;
10904             case ASCII_RESTRICT_PAT_MOD:
10905                 if (flagsp == &negflags) {
10906                     goto neg_modifier;
10907                 }
10908                 if (has_charset_modifier) {
10909                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10910                         goto excess_modifier;
10911                     }
10912                     /* Doubled modifier implies more restricted */
10913                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10914                 }
10915                 else {
10916                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10917                 }
10918                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10919                 break;
10920             case DEPENDS_PAT_MOD:
10921                 if (has_use_defaults) {
10922                     goto fail_modifiers;
10923                 }
10924                 else if (flagsp == &negflags) {
10925                     goto neg_modifier;
10926                 }
10927                 else if (has_charset_modifier) {
10928                     goto excess_modifier;
10929                 }
10930
10931                 /* The dual charset means unicode semantics if the
10932                  * pattern (or target, not known until runtime) are
10933                  * utf8, or something in the pattern indicates unicode
10934                  * semantics */
10935                 cs = (RExC_uni_semantics)
10936                      ? REGEX_UNICODE_CHARSET
10937                      : REGEX_DEPENDS_CHARSET;
10938                 has_charset_modifier = DEPENDS_PAT_MOD;
10939                 break;
10940               excess_modifier:
10941                 RExC_parse++;
10942                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10943                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10944                 }
10945                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10946                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10947                                         *(RExC_parse - 1));
10948                 }
10949                 else {
10950                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10951                 }
10952                 NOT_REACHED; /*NOTREACHED*/
10953               neg_modifier:
10954                 RExC_parse++;
10955                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10956                                     *(RExC_parse - 1));
10957                 NOT_REACHED; /*NOTREACHED*/
10958             case GLOBAL_PAT_MOD: /* 'g' */
10959                 if (RExC_pm_flags & PMf_WILDCARD) {
10960                     goto modifier_illegal_in_wildcard;
10961                 }
10962                 /*FALLTHROUGH*/
10963             case ONCE_PAT_MOD: /* 'o' */
10964                 if (ckWARN(WARN_REGEXP)) {
10965                     const I32 wflagbit = *RExC_parse == 'o'
10966                                          ? WASTED_O
10967                                          : WASTED_G;
10968                     if (! (wastedflags & wflagbit) ) {
10969                         wastedflags |= wflagbit;
10970                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10971                         vWARN5(
10972                             RExC_parse + 1,
10973                             "Useless (%s%c) - %suse /%c modifier",
10974                             flagsp == &negflags ? "?-" : "?",
10975                             *RExC_parse,
10976                             flagsp == &negflags ? "don't " : "",
10977                             *RExC_parse
10978                         );
10979                     }
10980                 }
10981                 break;
10982
10983             case CONTINUE_PAT_MOD: /* 'c' */
10984                 if (RExC_pm_flags & PMf_WILDCARD) {
10985                     goto modifier_illegal_in_wildcard;
10986                 }
10987                 if (ckWARN(WARN_REGEXP)) {
10988                     if (! (wastedflags & WASTED_C) ) {
10989                         wastedflags |= WASTED_GC;
10990                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10991                         vWARN3(
10992                             RExC_parse + 1,
10993                             "Useless (%sc) - %suse /gc modifier",
10994                             flagsp == &negflags ? "?-" : "?",
10995                             flagsp == &negflags ? "don't " : ""
10996                         );
10997                     }
10998                 }
10999                 break;
11000             case KEEPCOPY_PAT_MOD: /* 'p' */
11001                 if (RExC_pm_flags & PMf_WILDCARD) {
11002                     goto modifier_illegal_in_wildcard;
11003                 }
11004                 if (flagsp == &negflags) {
11005                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11006                 } else {
11007                     *flagsp |= RXf_PMf_KEEPCOPY;
11008                 }
11009                 break;
11010             case '-':
11011                 /* A flag is a default iff it is following a minus, so
11012                  * if there is a minus, it means will be trying to
11013                  * re-specify a default which is an error */
11014                 if (has_use_defaults || flagsp == &negflags) {
11015                     goto fail_modifiers;
11016                 }
11017                 flagsp = &negflags;
11018                 wastedflags = 0;  /* reset so (?g-c) warns twice */
11019                 x_mod_count = 0;
11020                 break;
11021             case ':':
11022             case ')':
11023
11024                 if (  (RExC_pm_flags & PMf_WILDCARD)
11025                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11026                 {
11027                     RExC_parse++;
11028                     /* diag_listed_as: Use of %s is not allowed in Unicode
11029                        property wildcard subpatterns in regex; marked by <--
11030                        HERE in m/%s/ */
11031                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11032                            " property wildcard subpatterns",
11033                            has_charset_modifier);
11034                 }
11035
11036                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11037                     negflags |= RXf_PMf_EXTENDED_MORE;
11038                 }
11039                 RExC_flags |= posflags;
11040
11041                 if (negflags & RXf_PMf_EXTENDED) {
11042                     negflags |= RXf_PMf_EXTENDED_MORE;
11043                 }
11044                 RExC_flags &= ~negflags;
11045                 set_regex_charset(&RExC_flags, cs);
11046
11047                 return;
11048             default:
11049               fail_modifiers:
11050                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11051                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11052                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11053                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11054                 NOT_REACHED; /*NOTREACHED*/
11055         }
11056
11057         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11058     }
11059
11060     vFAIL("Sequence (?... not terminated");
11061
11062   modifier_illegal_in_wildcard:
11063     RExC_parse++;
11064     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11065        subpatterns in regex; marked by <-- HERE in m/%s/ */
11066     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11067            " subpatterns", *(RExC_parse - 1));
11068 }
11069
11070 /*
11071  - reg - regular expression, i.e. main body or parenthesized thing
11072  *
11073  * Caller must absorb opening parenthesis.
11074  *
11075  * Combining parenthesis handling with the base level of regular expression
11076  * is a trifle forced, but the need to tie the tails of the branches to what
11077  * follows makes it hard to avoid.
11078  */
11079 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11080 #ifdef DEBUGGING
11081 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11082 #else
11083 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11084 #endif
11085
11086 STATIC regnode_offset
11087 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11088                              I32 *flagp,
11089                              char * parse_start,
11090                              char ch
11091                       )
11092 {
11093     regnode_offset ret;
11094     char* name_start = RExC_parse;
11095     U32 num = 0;
11096     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11097     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11098
11099     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11100
11101     if (RExC_parse == name_start || *RExC_parse != ch) {
11102         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11103         vFAIL2("Sequence %.3s... not terminated", parse_start);
11104     }
11105
11106     if (sv_dat) {
11107         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11108         RExC_rxi->data->data[num]=(void*)sv_dat;
11109         SvREFCNT_inc_simple_void_NN(sv_dat);
11110     }
11111     RExC_sawback = 1;
11112     ret = reganode(pRExC_state,
11113                    ((! FOLD)
11114                      ? REFN
11115                      : (ASCII_FOLD_RESTRICTED)
11116                        ? REFFAN
11117                        : (AT_LEAST_UNI_SEMANTICS)
11118                          ? REFFUN
11119                          : (LOC)
11120                            ? REFFLN
11121                            : REFFN),
11122                     num);
11123     *flagp |= HASWIDTH;
11124
11125     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11126     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11127
11128     nextchar(pRExC_state);
11129     return ret;
11130 }
11131
11132 /* On success, returns the offset at which any next node should be placed into
11133  * the regex engine program being compiled.
11134  *
11135  * Returns 0 otherwise, with *flagp set to indicate why:
11136  *  TRYAGAIN        at the end of (?) that only sets flags.
11137  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11138  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11139  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11140  *  happen.  */
11141 STATIC regnode_offset
11142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11143     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11144      * 2 is like 1, but indicates that nextchar() has been called to advance
11145      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11146      * this flag alerts us to the need to check for that */
11147 {
11148     regnode_offset ret = 0;    /* Will be the head of the group. */
11149     regnode_offset br;
11150     regnode_offset lastbr;
11151     regnode_offset ender = 0;
11152     I32 parno = 0;
11153     I32 flags;
11154     U32 oregflags = RExC_flags;
11155     bool have_branch = 0;
11156     bool is_open = 0;
11157     I32 freeze_paren = 0;
11158     I32 after_freeze = 0;
11159     I32 num; /* numeric backreferences */
11160     SV * max_open;  /* Max number of unclosed parens */
11161
11162     char * parse_start = RExC_parse; /* MJD */
11163     char * const oregcomp_parse = RExC_parse;
11164
11165     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11166
11167     PERL_ARGS_ASSERT_REG;
11168     DEBUG_PARSE("reg ");
11169
11170     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11171     assert(max_open);
11172     if (!SvIOK(max_open)) {
11173         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11174     }
11175     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11176                                               open paren */
11177         vFAIL("Too many nested open parens");
11178     }
11179
11180     *flagp = 0;                         /* Initialize. */
11181
11182     if (RExC_in_lookbehind) {
11183         RExC_in_lookbehind++;
11184     }
11185     if (RExC_in_lookahead) {
11186         RExC_in_lookahead++;
11187     }
11188
11189     /* Having this true makes it feasible to have a lot fewer tests for the
11190      * parse pointer being in scope.  For example, we can write
11191      *      while(isFOO(*RExC_parse)) RExC_parse++;
11192      * instead of
11193      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11194      */
11195     assert(*RExC_end == '\0');
11196
11197     /* Make an OPEN node, if parenthesized. */
11198     if (paren) {
11199
11200         /* Under /x, space and comments can be gobbled up between the '(' and
11201          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11202          * intervening space, as the sequence is a token, and a token should be
11203          * indivisible */
11204         bool has_intervening_patws = (paren == 2)
11205                                   && *(RExC_parse - 1) != '(';
11206
11207         if (RExC_parse >= RExC_end) {
11208             vFAIL("Unmatched (");
11209         }
11210
11211         if (paren == 'r') {     /* Atomic script run */
11212             paren = '>';
11213             goto parse_rest;
11214         }
11215         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11216             char *start_verb = RExC_parse + 1;
11217             STRLEN verb_len;
11218             char *start_arg = NULL;
11219             unsigned char op = 0;
11220             int arg_required = 0;
11221             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11222             bool has_upper = FALSE;
11223
11224             if (has_intervening_patws) {
11225                 RExC_parse++;   /* past the '*' */
11226
11227                 /* For strict backwards compatibility, don't change the message
11228                  * now that we also have lowercase operands */
11229                 if (isUPPER(*RExC_parse)) {
11230                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11231                 }
11232                 else {
11233                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11234                 }
11235             }
11236             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11237                 if ( *RExC_parse == ':' ) {
11238                     start_arg = RExC_parse + 1;
11239                     break;
11240                 }
11241                 else if (! UTF) {
11242                     if (isUPPER(*RExC_parse)) {
11243                         has_upper = TRUE;
11244                     }
11245                     RExC_parse++;
11246                 }
11247                 else {
11248                     RExC_parse += UTF8SKIP(RExC_parse);
11249                 }
11250             }
11251             verb_len = RExC_parse - start_verb;
11252             if ( start_arg ) {
11253                 if (RExC_parse >= RExC_end) {
11254                     goto unterminated_verb_pattern;
11255                 }
11256
11257                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11258                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11259                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11260                 }
11261                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11262                   unterminated_verb_pattern:
11263                     if (has_upper) {
11264                         vFAIL("Unterminated verb pattern argument");
11265                     }
11266                     else {
11267                         vFAIL("Unterminated '(*...' argument");
11268                     }
11269                 }
11270             } else {
11271                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11272                     if (has_upper) {
11273                         vFAIL("Unterminated verb pattern");
11274                     }
11275                     else {
11276                         vFAIL("Unterminated '(*...' construct");
11277                     }
11278                 }
11279             }
11280
11281             /* Here, we know that RExC_parse < RExC_end */
11282
11283             switch ( *start_verb ) {
11284             case 'A':  /* (*ACCEPT) */
11285                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11286                     op = ACCEPT;
11287                     internal_argval = RExC_nestroot;
11288                 }
11289                 break;
11290             case 'C':  /* (*COMMIT) */
11291                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11292                     op = COMMIT;
11293                 break;
11294             case 'F':  /* (*FAIL) */
11295                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11296                     op = OPFAIL;
11297                 }
11298                 break;
11299             case ':':  /* (*:NAME) */
11300             case 'M':  /* (*MARK:NAME) */
11301                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11302                     op = MARKPOINT;
11303                     arg_required = 1;
11304                 }
11305                 break;
11306             case 'P':  /* (*PRUNE) */
11307                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11308                     op = PRUNE;
11309                 break;
11310             case 'S':   /* (*SKIP) */
11311                 if ( memEQs(start_verb, verb_len,"SKIP") )
11312                     op = SKIP;
11313                 break;
11314             case 'T':  /* (*THEN) */
11315                 /* [19:06] <TimToady> :: is then */
11316                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11317                     op = CUTGROUP;
11318                     RExC_seen |= REG_CUTGROUP_SEEN;
11319                 }
11320                 break;
11321             case 'a':
11322                 if (   memEQs(start_verb, verb_len, "asr")
11323                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11324                 {
11325                     paren = 'r';        /* Mnemonic: recursed run */
11326                     goto script_run;
11327                 }
11328                 else if (memEQs(start_verb, verb_len, "atomic")) {
11329                     paren = 't';    /* AtOMIC */
11330                     goto alpha_assertions;
11331                 }
11332                 break;
11333             case 'p':
11334                 if (   memEQs(start_verb, verb_len, "plb")
11335                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11336                 {
11337                     paren = 'b';
11338                     goto lookbehind_alpha_assertions;
11339                 }
11340                 else if (   memEQs(start_verb, verb_len, "pla")
11341                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11342                 {
11343                     paren = 'a';
11344                     goto alpha_assertions;
11345                 }
11346                 break;
11347             case 'n':
11348                 if (   memEQs(start_verb, verb_len, "nlb")
11349                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11350                 {
11351                     paren = 'B';
11352                     goto lookbehind_alpha_assertions;
11353                 }
11354                 else if (   memEQs(start_verb, verb_len, "nla")
11355                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11356                 {
11357                     paren = 'A';
11358                     goto alpha_assertions;
11359                 }
11360                 break;
11361             case 's':
11362                 if (   memEQs(start_verb, verb_len, "sr")
11363                     || memEQs(start_verb, verb_len, "script_run"))
11364                 {
11365                     regnode_offset atomic;
11366
11367                     paren = 's';
11368
11369                    script_run:
11370
11371                     /* This indicates Unicode rules. */
11372                     REQUIRE_UNI_RULES(flagp, 0);
11373
11374                     if (! start_arg) {
11375                         goto no_colon;
11376                     }
11377
11378                     RExC_parse = start_arg;
11379
11380                     if (RExC_in_script_run) {
11381
11382                         /*  Nested script runs are treated as no-ops, because
11383                          *  if the nested one fails, the outer one must as
11384                          *  well.  It could fail sooner, and avoid (??{} with
11385                          *  side effects, but that is explicitly documented as
11386                          *  undefined behavior. */
11387
11388                         ret = 0;
11389
11390                         if (paren == 's') {
11391                             paren = ':';
11392                             goto parse_rest;
11393                         }
11394
11395                         /* But, the atomic part of a nested atomic script run
11396                          * isn't a no-op, but can be treated just like a '(?>'
11397                          * */
11398                         paren = '>';
11399                         goto parse_rest;
11400                     }
11401
11402                     if (paren == 's') {
11403                         /* Here, we're starting a new regular script run */
11404                         ret = reg_node(pRExC_state, SROPEN);
11405                         RExC_in_script_run = 1;
11406                         is_open = 1;
11407                         goto parse_rest;
11408                     }
11409
11410                     /* Here, we are starting an atomic script run.  This is
11411                      * handled by recursing to deal with the atomic portion
11412                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11413
11414                     ret = reg_node(pRExC_state, SROPEN);
11415
11416                     RExC_in_script_run = 1;
11417
11418                     atomic = reg(pRExC_state, 'r', &flags, depth);
11419                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11420                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11421                         return 0;
11422                     }
11423
11424                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11425                         REQUIRE_BRANCHJ(flagp, 0);
11426                     }
11427
11428                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11429                                                                 SRCLOSE)))
11430                     {
11431                         REQUIRE_BRANCHJ(flagp, 0);
11432                     }
11433
11434                     RExC_in_script_run = 0;
11435                     return ret;
11436                 }
11437
11438                 break;
11439
11440             lookbehind_alpha_assertions:
11441                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11442                 RExC_in_lookbehind++;
11443                 /*FALLTHROUGH*/
11444
11445             alpha_assertions:
11446
11447                 RExC_seen_zerolen++;
11448
11449                 if (! start_arg) {
11450                     goto no_colon;
11451                 }
11452
11453                 /* An empty negative lookahead assertion simply is failure */
11454                 if (paren == 'A' && RExC_parse == start_arg) {
11455                     ret=reganode(pRExC_state, OPFAIL, 0);
11456                     nextchar(pRExC_state);
11457                     return ret;
11458                 }
11459
11460                 RExC_parse = start_arg;
11461                 goto parse_rest;
11462
11463               no_colon:
11464                 vFAIL2utf8f(
11465                 "'(*%" UTF8f "' requires a terminating ':'",
11466                 UTF8fARG(UTF, verb_len, start_verb));
11467                 NOT_REACHED; /*NOTREACHED*/
11468
11469             } /* End of switch */
11470             if ( ! op ) {
11471                 RExC_parse += UTF
11472                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11473                               : 1;
11474                 if (has_upper || verb_len == 0) {
11475                     vFAIL2utf8f(
11476                     "Unknown verb pattern '%" UTF8f "'",
11477                     UTF8fARG(UTF, verb_len, start_verb));
11478                 }
11479                 else {
11480                     vFAIL2utf8f(
11481                     "Unknown '(*...)' construct '%" UTF8f "'",
11482                     UTF8fARG(UTF, verb_len, start_verb));
11483                 }
11484             }
11485             if ( RExC_parse == start_arg ) {
11486                 start_arg = NULL;
11487             }
11488             if ( arg_required && !start_arg ) {
11489                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11490                     (int) verb_len, start_verb);
11491             }
11492             if (internal_argval == -1) {
11493                 ret = reganode(pRExC_state, op, 0);
11494             } else {
11495                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11496             }
11497             RExC_seen |= REG_VERBARG_SEEN;
11498             if (start_arg) {
11499                 SV *sv = newSVpvn( start_arg,
11500                                     RExC_parse - start_arg);
11501                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11502                                         STR_WITH_LEN("S"));
11503                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11504                 FLAGS(REGNODE_p(ret)) = 1;
11505             } else {
11506                 FLAGS(REGNODE_p(ret)) = 0;
11507             }
11508             if ( internal_argval != -1 )
11509                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11510             nextchar(pRExC_state);
11511             return ret;
11512         }
11513         else if (*RExC_parse == '?') { /* (?...) */
11514             bool is_logical = 0;
11515             const char * const seqstart = RExC_parse;
11516             const char * endptr;
11517             const char non_existent_group_msg[]
11518                                             = "Reference to nonexistent group";
11519             const char impossible_group[] = "Invalid reference to group";
11520
11521             if (has_intervening_patws) {
11522                 RExC_parse++;
11523                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11524             }
11525
11526             RExC_parse++;           /* past the '?' */
11527             paren = *RExC_parse;    /* might be a trailing NUL, if not
11528                                        well-formed */
11529             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11530             if (RExC_parse > RExC_end) {
11531                 paren = '\0';
11532             }
11533             ret = 0;                    /* For look-ahead/behind. */
11534             switch (paren) {
11535
11536             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11537                 paren = *RExC_parse;
11538                 if ( paren == '<') {    /* (?P<...>) named capture */
11539                     RExC_parse++;
11540                     if (RExC_parse >= RExC_end) {
11541                         vFAIL("Sequence (?P<... not terminated");
11542                     }
11543                     goto named_capture;
11544                 }
11545                 else if (paren == '>') {   /* (?P>name) named recursion */
11546                     RExC_parse++;
11547                     if (RExC_parse >= RExC_end) {
11548                         vFAIL("Sequence (?P>... not terminated");
11549                     }
11550                     goto named_recursion;
11551                 }
11552                 else if (paren == '=') {   /* (?P=...)  named backref */
11553                     RExC_parse++;
11554                     return handle_named_backref(pRExC_state, flagp,
11555                                                 parse_start, ')');
11556                 }
11557                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11558                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11559                 vFAIL3("Sequence (%.*s...) not recognized",
11560                                 (int) (RExC_parse - seqstart), seqstart);
11561                 NOT_REACHED; /*NOTREACHED*/
11562             case '<':           /* (?<...) */
11563                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11564                 if (*RExC_parse == '!')
11565                     paren = ',';
11566                 else if (*RExC_parse != '=')
11567               named_capture:
11568                 {               /* (?<...>) */
11569                     char *name_start;
11570                     SV *svname;
11571                     paren= '>';
11572                 /* FALLTHROUGH */
11573             case '\'':          /* (?'...') */
11574                     name_start = RExC_parse;
11575                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11576                     if (   RExC_parse == name_start
11577                         || RExC_parse >= RExC_end
11578                         || *RExC_parse != paren)
11579                     {
11580                         vFAIL2("Sequence (?%c... not terminated",
11581                             paren=='>' ? '<' : (char) paren);
11582                     }
11583                     {
11584                         HE *he_str;
11585                         SV *sv_dat = NULL;
11586                         if (!svname) /* shouldn't happen */
11587                             Perl_croak(aTHX_
11588                                 "panic: reg_scan_name returned NULL");
11589                         if (!RExC_paren_names) {
11590                             RExC_paren_names= newHV();
11591                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11592 #ifdef DEBUGGING
11593                             RExC_paren_name_list= newAV();
11594                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11595 #endif
11596                         }
11597                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11598                         if ( he_str )
11599                             sv_dat = HeVAL(he_str);
11600                         if ( ! sv_dat ) {
11601                             /* croak baby croak */
11602                             Perl_croak(aTHX_
11603                                 "panic: paren_name hash element allocation failed");
11604                         } else if ( SvPOK(sv_dat) ) {
11605                             /* (?|...) can mean we have dupes so scan to check
11606                                its already been stored. Maybe a flag indicating
11607                                we are inside such a construct would be useful,
11608                                but the arrays are likely to be quite small, so
11609                                for now we punt -- dmq */
11610                             IV count = SvIV(sv_dat);
11611                             I32 *pv = (I32*)SvPVX(sv_dat);
11612                             IV i;
11613                             for ( i = 0 ; i < count ; i++ ) {
11614                                 if ( pv[i] == RExC_npar ) {
11615                                     count = 0;
11616                                     break;
11617                                 }
11618                             }
11619                             if ( count ) {
11620                                 pv = (I32*)SvGROW(sv_dat,
11621                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11622                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11623                                 pv[count] = RExC_npar;
11624                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11625                             }
11626                         } else {
11627                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11628                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11629                                                                 sizeof(I32));
11630                             SvIOK_on(sv_dat);
11631                             SvIV_set(sv_dat, 1);
11632                         }
11633 #ifdef DEBUGGING
11634                         /* Yes this does cause a memory leak in debugging Perls
11635                          * */
11636                         if (!av_store(RExC_paren_name_list,
11637                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11638                             SvREFCNT_dec_NN(svname);
11639 #endif
11640
11641                         /*sv_dump(sv_dat);*/
11642                     }
11643                     nextchar(pRExC_state);
11644                     paren = 1;
11645                     goto capturing_parens;
11646                 }
11647
11648                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11649                 RExC_in_lookbehind++;
11650                 RExC_parse++;
11651                 if (RExC_parse >= RExC_end) {
11652                     vFAIL("Sequence (?... not terminated");
11653                 }
11654                 RExC_seen_zerolen++;
11655                 break;
11656             case '=':           /* (?=...) */
11657                 RExC_seen_zerolen++;
11658                 RExC_in_lookahead++;
11659                 break;
11660             case '!':           /* (?!...) */
11661                 RExC_seen_zerolen++;
11662                 /* check if we're really just a "FAIL" assertion */
11663                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11664                                         FALSE /* Don't force to /x */ );
11665                 if (*RExC_parse == ')') {
11666                     ret=reganode(pRExC_state, OPFAIL, 0);
11667                     nextchar(pRExC_state);
11668                     return ret;
11669                 }
11670                 break;
11671             case '|':           /* (?|...) */
11672                 /* branch reset, behave like a (?:...) except that
11673                    buffers in alternations share the same numbers */
11674                 paren = ':';
11675                 after_freeze = freeze_paren = RExC_npar;
11676
11677                 /* XXX This construct currently requires an extra pass.
11678                  * Investigation would be required to see if that could be
11679                  * changed */
11680                 REQUIRE_PARENS_PASS;
11681                 break;
11682             case ':':           /* (?:...) */
11683             case '>':           /* (?>...) */
11684                 break;
11685             case '$':           /* (?$...) */
11686             case '@':           /* (?@...) */
11687                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11688                 break;
11689             case '0' :           /* (?0) */
11690             case 'R' :           /* (?R) */
11691                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11692                     FAIL("Sequence (?R) not terminated");
11693                 num = 0;
11694                 RExC_seen |= REG_RECURSE_SEEN;
11695
11696                 /* XXX These constructs currently require an extra pass.
11697                  * It probably could be changed */
11698                 REQUIRE_PARENS_PASS;
11699
11700                 *flagp |= POSTPONED;
11701                 goto gen_recurse_regop;
11702                 /*notreached*/
11703             /* named and numeric backreferences */
11704             case '&':            /* (?&NAME) */
11705                 parse_start = RExC_parse - 1;
11706               named_recursion:
11707                 {
11708                     SV *sv_dat = reg_scan_name(pRExC_state,
11709                                                REG_RSN_RETURN_DATA);
11710                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11711                 }
11712                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11713                     vFAIL("Sequence (?&... not terminated");
11714                 goto gen_recurse_regop;
11715                 /* NOTREACHED */
11716             case '+':
11717                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11718                     RExC_parse++;
11719                     vFAIL("Illegal pattern");
11720                 }
11721                 goto parse_recursion;
11722                 /* NOTREACHED*/
11723             case '-': /* (?-1) */
11724                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11725                     RExC_parse--; /* rewind to let it be handled later */
11726                     goto parse_flags;
11727                 }
11728                 /* FALLTHROUGH */
11729             case '1': case '2': case '3': case '4': /* (?1) */
11730             case '5': case '6': case '7': case '8': case '9':
11731                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11732               parse_recursion:
11733                 {
11734                     bool is_neg = FALSE;
11735                     UV unum;
11736                     parse_start = RExC_parse - 1; /* MJD */
11737                     if (*RExC_parse == '-') {
11738                         RExC_parse++;
11739                         is_neg = TRUE;
11740                     }
11741                     endptr = RExC_end;
11742                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11743                         && unum <= I32_MAX
11744                     ) {
11745                         num = (I32)unum;
11746                         RExC_parse = (char*)endptr;
11747                     }
11748                     else {  /* Overflow, or something like that.  Position
11749                                beyond all digits for the message */
11750                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
11751                             RExC_parse++;
11752                         }
11753                         vFAIL(impossible_group);
11754                     }
11755                     if (is_neg) {
11756                         /* -num is always representable on 1 and 2's complement
11757                          * machines */
11758                         num = -num;
11759                     }
11760                 }
11761                 if (*RExC_parse!=')')
11762                     vFAIL("Expecting close bracket");
11763
11764               gen_recurse_regop:
11765                 if (paren == '-' || paren == '+') {
11766
11767                     /* Don't overflow */
11768                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11769                         RExC_parse++;
11770                         vFAIL(impossible_group);
11771                     }
11772
11773                     /*
11774                     Diagram of capture buffer numbering.
11775                     Top line is the normal capture buffer numbers
11776                     Bottom line is the negative indexing as from
11777                     the X (the (?-2))
11778
11779                         1 2    3 4 5 X   Y      6 7
11780                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11781                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11782                     -   5 4    3 2 1 X   Y      x x
11783
11784                     Resolve to absolute group.  Recall that RExC_npar is +1 of
11785                     the actual parenthesis group number.  For lookahead, we
11786                     have to compensate for that.  Using the above example, when
11787                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
11788                     want 7 for +2, and 4 for -2.
11789                     */
11790                     if ( paren == '+' ) {
11791                         num--;
11792                     }
11793
11794                     num += RExC_npar;
11795
11796                     if (paren == '-' && num < 1) {
11797                         RExC_parse++;
11798                         vFAIL(non_existent_group_msg);
11799                     }
11800                 }
11801
11802                 if (num >= RExC_npar) {
11803
11804                     /* It might be a forward reference; we can't fail until we
11805                      * know, by completing the parse to get all the groups, and
11806                      * then reparsing */
11807                     if (ALL_PARENS_COUNTED)  {
11808                         if (num >= RExC_total_parens) {
11809                             RExC_parse++;
11810                             vFAIL(non_existent_group_msg);
11811                         }
11812                     }
11813                     else {
11814                         REQUIRE_PARENS_PASS;
11815                     }
11816                 }
11817
11818                 /* We keep track how many GOSUB items we have produced.
11819                    To start off the ARG2L() of the GOSUB holds its "id",
11820                    which is used later in conjunction with RExC_recurse
11821                    to calculate the offset we need to jump for the GOSUB,
11822                    which it will store in the final representation.
11823                    We have to defer the actual calculation until much later
11824                    as the regop may move.
11825                  */
11826                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11827                 RExC_recurse_count++;
11828                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11829                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11830                             22, "|    |", (int)(depth * 2 + 1), "",
11831                             (UV)ARG(REGNODE_p(ret)),
11832                             (IV)ARG2L(REGNODE_p(ret))));
11833                 RExC_seen |= REG_RECURSE_SEEN;
11834
11835                 Set_Node_Length(REGNODE_p(ret),
11836                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11837                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11838
11839                 *flagp |= POSTPONED;
11840                 assert(*RExC_parse == ')');
11841                 nextchar(pRExC_state);
11842                 return ret;
11843
11844             /* NOTREACHED */
11845
11846             case '?':           /* (??...) */
11847                 is_logical = 1;
11848                 if (*RExC_parse != '{') {
11849                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11850                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11851                     vFAIL2utf8f(
11852                         "Sequence (%" UTF8f "...) not recognized",
11853                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11854                     NOT_REACHED; /*NOTREACHED*/
11855                 }
11856                 *flagp |= POSTPONED;
11857                 paren = '{';
11858                 RExC_parse++;
11859                 /* FALLTHROUGH */
11860             case '{':           /* (?{...}) */
11861             {
11862                 U32 n = 0;
11863                 struct reg_code_block *cb;
11864                 OP * o;
11865
11866                 RExC_seen_zerolen++;
11867
11868                 if (   !pRExC_state->code_blocks
11869                     || pRExC_state->code_index
11870                                         >= pRExC_state->code_blocks->count
11871                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11872                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11873                             - RExC_start)
11874                 ) {
11875                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11876                         FAIL("panic: Sequence (?{...}): no code block found\n");
11877                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11878                 }
11879                 /* this is a pre-compiled code block (?{...}) */
11880                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11881                 RExC_parse = RExC_start + cb->end;
11882                 o = cb->block;
11883                 if (cb->src_regex) {
11884                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11885                     RExC_rxi->data->data[n] =
11886                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11887                     RExC_rxi->data->data[n+1] = (void*)o;
11888                 }
11889                 else {
11890                     n = add_data(pRExC_state,
11891                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11892                     RExC_rxi->data->data[n] = (void*)o;
11893                 }
11894                 pRExC_state->code_index++;
11895                 nextchar(pRExC_state);
11896
11897                 if (is_logical) {
11898                     regnode_offset eval;
11899                     ret = reg_node(pRExC_state, LOGICAL);
11900
11901                     eval = reg2Lanode(pRExC_state, EVAL,
11902                                        n,
11903
11904                                        /* for later propagation into (??{})
11905                                         * return value */
11906                                        RExC_flags & RXf_PMf_COMPILETIME
11907                                       );
11908                     FLAGS(REGNODE_p(ret)) = 2;
11909                     if (! REGTAIL(pRExC_state, ret, eval)) {
11910                         REQUIRE_BRANCHJ(flagp, 0);
11911                     }
11912                     /* deal with the length of this later - MJD */
11913                     return ret;
11914                 }
11915                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11916                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11917                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11918                 return ret;
11919             }
11920             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11921             {
11922                 int is_define= 0;
11923                 const int DEFINE_len = sizeof("DEFINE") - 1;
11924                 if (    RExC_parse < RExC_end - 1
11925                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11926                             && (   RExC_parse[1] == '='
11927                                 || RExC_parse[1] == '!'
11928                                 || RExC_parse[1] == '<'
11929                                 || RExC_parse[1] == '{'))
11930                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11931                             && (   memBEGINs(RExC_parse + 1,
11932                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11933                                          "pla:")
11934                                 || memBEGINs(RExC_parse + 1,
11935                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11936                                          "plb:")
11937                                 || memBEGINs(RExC_parse + 1,
11938                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11939                                          "nla:")
11940                                 || memBEGINs(RExC_parse + 1,
11941                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11942                                          "nlb:")
11943                                 || memBEGINs(RExC_parse + 1,
11944                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11945                                          "positive_lookahead:")
11946                                 || memBEGINs(RExC_parse + 1,
11947                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11948                                          "positive_lookbehind:")
11949                                 || memBEGINs(RExC_parse + 1,
11950                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11951                                          "negative_lookahead:")
11952                                 || memBEGINs(RExC_parse + 1,
11953                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11954                                          "negative_lookbehind:"))))
11955                 ) { /* Lookahead or eval. */
11956                     I32 flag;
11957                     regnode_offset tail;
11958
11959                     ret = reg_node(pRExC_state, LOGICAL);
11960                     FLAGS(REGNODE_p(ret)) = 1;
11961
11962                     tail = reg(pRExC_state, 1, &flag, depth+1);
11963                     RETURN_FAIL_ON_RESTART(flag, flagp);
11964                     if (! REGTAIL(pRExC_state, ret, tail)) {
11965                         REQUIRE_BRANCHJ(flagp, 0);
11966                     }
11967                     goto insert_if;
11968                 }
11969                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11970                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11971                 {
11972                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11973                     char *name_start= RExC_parse++;
11974                     U32 num = 0;
11975                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11976                     if (   RExC_parse == name_start
11977                         || RExC_parse >= RExC_end
11978                         || *RExC_parse != ch)
11979                     {
11980                         vFAIL2("Sequence (?(%c... not terminated",
11981                             (ch == '>' ? '<' : ch));
11982                     }
11983                     RExC_parse++;
11984                     if (sv_dat) {
11985                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11986                         RExC_rxi->data->data[num]=(void*)sv_dat;
11987                         SvREFCNT_inc_simple_void_NN(sv_dat);
11988                     }
11989                     ret = reganode(pRExC_state, GROUPPN, num);
11990                     goto insert_if_check_paren;
11991                 }
11992                 else if (memBEGINs(RExC_parse,
11993                                    (STRLEN) (RExC_end - RExC_parse),
11994                                    "DEFINE"))
11995                 {
11996                     ret = reganode(pRExC_state, DEFINEP, 0);
11997                     RExC_parse += DEFINE_len;
11998                     is_define = 1;
11999                     goto insert_if_check_paren;
12000                 }
12001                 else if (RExC_parse[0] == 'R') {
12002                     RExC_parse++;
12003                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
12004                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12005                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12006                      */
12007                     parno = 0;
12008                     if (RExC_parse[0] == '0') {
12009                         parno = 1;
12010                         RExC_parse++;
12011                     }
12012                     else if (inRANGE(RExC_parse[0], '1', '9')) {
12013                         UV uv;
12014                         endptr = RExC_end;
12015                         if (grok_atoUV(RExC_parse, &uv, &endptr)
12016                             && uv <= I32_MAX
12017                         ) {
12018                             parno = (I32)uv + 1;
12019                             RExC_parse = (char*)endptr;
12020                         }
12021                         /* else "Switch condition not recognized" below */
12022                     } else if (RExC_parse[0] == '&') {
12023                         SV *sv_dat;
12024                         RExC_parse++;
12025                         sv_dat = reg_scan_name(pRExC_state,
12026                                                REG_RSN_RETURN_DATA);
12027                         if (sv_dat)
12028                             parno = 1 + *((I32 *)SvPVX(sv_dat));
12029                     }
12030                     ret = reganode(pRExC_state, INSUBP, parno);
12031                     goto insert_if_check_paren;
12032                 }
12033                 else if (inRANGE(RExC_parse[0], '1', '9')) {
12034                     /* (?(1)...) */
12035                     char c;
12036                     UV uv;
12037                     endptr = RExC_end;
12038                     if (grok_atoUV(RExC_parse, &uv, &endptr)
12039                         && uv <= I32_MAX
12040                     ) {
12041                         parno = (I32)uv;
12042                         RExC_parse = (char*)endptr;
12043                     }
12044                     else {
12045                         vFAIL("panic: grok_atoUV returned FALSE");
12046                     }
12047                     ret = reganode(pRExC_state, GROUPP, parno);
12048
12049                  insert_if_check_paren:
12050                     if (UCHARAT(RExC_parse) != ')') {
12051                         RExC_parse += UTF
12052                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12053                                       : 1;
12054                         vFAIL("Switch condition not recognized");
12055                     }
12056                     nextchar(pRExC_state);
12057                   insert_if:
12058                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12059                                                              IFTHEN, 0)))
12060                     {
12061                         REQUIRE_BRANCHJ(flagp, 0);
12062                     }
12063                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12064                     if (br == 0) {
12065                         RETURN_FAIL_ON_RESTART(flags,flagp);
12066                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12067                               (UV) flags);
12068                     } else
12069                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12070                                                              LONGJMP, 0)))
12071                     {
12072                         REQUIRE_BRANCHJ(flagp, 0);
12073                     }
12074                     c = UCHARAT(RExC_parse);
12075                     nextchar(pRExC_state);
12076                     if (flags&HASWIDTH)
12077                         *flagp |= HASWIDTH;
12078                     if (c == '|') {
12079                         if (is_define)
12080                             vFAIL("(?(DEFINE)....) does not allow branches");
12081
12082                         /* Fake one for optimizer.  */
12083                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12084
12085                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12086                             RETURN_FAIL_ON_RESTART(flags, flagp);
12087                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12088                                   (UV) flags);
12089                         }
12090                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12091                             REQUIRE_BRANCHJ(flagp, 0);
12092                         }
12093                         if (flags&HASWIDTH)
12094                             *flagp |= HASWIDTH;
12095                         c = UCHARAT(RExC_parse);
12096                         nextchar(pRExC_state);
12097                     }
12098                     else
12099                         lastbr = 0;
12100                     if (c != ')') {
12101                         if (RExC_parse >= RExC_end)
12102                             vFAIL("Switch (?(condition)... not terminated");
12103                         else
12104                             vFAIL("Switch (?(condition)... contains too many branches");
12105                     }
12106                     ender = reg_node(pRExC_state, TAIL);
12107                     if (! REGTAIL(pRExC_state, br, ender)) {
12108                         REQUIRE_BRANCHJ(flagp, 0);
12109                     }
12110                     if (lastbr) {
12111                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12112                             REQUIRE_BRANCHJ(flagp, 0);
12113                         }
12114                         if (! REGTAIL(pRExC_state,
12115                                       REGNODE_OFFSET(
12116                                                  NEXTOPER(
12117                                                  NEXTOPER(REGNODE_p(lastbr)))),
12118                                       ender))
12119                         {
12120                             REQUIRE_BRANCHJ(flagp, 0);
12121                         }
12122                     }
12123                     else
12124                         if (! REGTAIL(pRExC_state, ret, ender)) {
12125                             REQUIRE_BRANCHJ(flagp, 0);
12126                         }
12127 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12128                     RExC_size++; /* XXX WHY do we need this?!!
12129                                     For large programs it seems to be required
12130                                     but I can't figure out why. -- dmq*/
12131 #endif
12132                     return ret;
12133                 }
12134                 RExC_parse += UTF
12135                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12136                               : 1;
12137                 vFAIL("Unknown switch condition (?(...))");
12138             }
12139             case '[':           /* (?[ ... ]) */
12140                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12141                                          oregcomp_parse);
12142             case 0: /* A NUL */
12143                 RExC_parse--; /* for vFAIL to print correctly */
12144                 vFAIL("Sequence (? incomplete");
12145                 break;
12146
12147             case ')':
12148                 if (RExC_strict) {  /* [perl #132851] */
12149                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12150                 }
12151                 /* FALLTHROUGH */
12152             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12153             /* FALLTHROUGH */
12154             default: /* e.g., (?i) */
12155                 RExC_parse = (char *) seqstart + 1;
12156               parse_flags:
12157                 parse_lparen_question_flags(pRExC_state);
12158                 if (UCHARAT(RExC_parse) != ':') {
12159                     if (RExC_parse < RExC_end)
12160                         nextchar(pRExC_state);
12161                     *flagp = TRYAGAIN;
12162                     return 0;
12163                 }
12164                 paren = ':';
12165                 nextchar(pRExC_state);
12166                 ret = 0;
12167                 goto parse_rest;
12168             } /* end switch */
12169         }
12170         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12171           capturing_parens:
12172             parno = RExC_npar;
12173             RExC_npar++;
12174             if (! ALL_PARENS_COUNTED) {
12175                 /* If we are in our first pass through (and maybe only pass),
12176                  * we  need to allocate memory for the capturing parentheses
12177                  * data structures.
12178                  */
12179
12180                 if (!RExC_parens_buf_size) {
12181                     /* first guess at number of parens we might encounter */
12182                     RExC_parens_buf_size = 10;
12183
12184                     /* setup RExC_open_parens, which holds the address of each
12185                      * OPEN tag, and to make things simpler for the 0 index the
12186                      * start of the program - this is used later for offsets */
12187                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12188                             regnode_offset);
12189                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12190
12191                     /* setup RExC_close_parens, which holds the address of each
12192                      * CLOSE tag, and to make things simpler for the 0 index
12193                      * the end of the program - this is used later for offsets
12194                      * */
12195                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12196                             regnode_offset);
12197                     /* we dont know where end op starts yet, so we dont need to
12198                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12199                      * above */
12200                 }
12201                 else if (RExC_npar > RExC_parens_buf_size) {
12202                     I32 old_size = RExC_parens_buf_size;
12203
12204                     RExC_parens_buf_size *= 2;
12205
12206                     Renew(RExC_open_parens, RExC_parens_buf_size,
12207                             regnode_offset);
12208                     Zero(RExC_open_parens + old_size,
12209                             RExC_parens_buf_size - old_size, regnode_offset);
12210
12211                     Renew(RExC_close_parens, RExC_parens_buf_size,
12212                             regnode_offset);
12213                     Zero(RExC_close_parens + old_size,
12214                             RExC_parens_buf_size - old_size, regnode_offset);
12215                 }
12216             }
12217
12218             ret = reganode(pRExC_state, OPEN, parno);
12219             if (!RExC_nestroot)
12220                 RExC_nestroot = parno;
12221             if (RExC_open_parens && !RExC_open_parens[parno])
12222             {
12223                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12224                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12225                     22, "|    |", (int)(depth * 2 + 1), "",
12226                     (IV)parno, ret));
12227                 RExC_open_parens[parno]= ret;
12228             }
12229
12230             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12231             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12232             is_open = 1;
12233         } else {
12234             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12235             paren = ':';
12236             ret = 0;
12237         }
12238     }
12239     else                        /* ! paren */
12240         ret = 0;
12241
12242    parse_rest:
12243     /* Pick up the branches, linking them together. */
12244     parse_start = RExC_parse;   /* MJD */
12245     br = regbranch(pRExC_state, &flags, 1, depth+1);
12246
12247     /*     branch_len = (paren != 0); */
12248
12249     if (br == 0) {
12250         RETURN_FAIL_ON_RESTART(flags, flagp);
12251         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12252     }
12253     if (*RExC_parse == '|') {
12254         if (RExC_use_BRANCHJ) {
12255             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12256         }
12257         else {                  /* MJD */
12258             reginsert(pRExC_state, BRANCH, br, depth+1);
12259             Set_Node_Length(REGNODE_p(br), paren != 0);
12260             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12261         }
12262         have_branch = 1;
12263     }
12264     else if (paren == ':') {
12265         *flagp |= flags&SIMPLE;
12266     }
12267     if (is_open) {                              /* Starts with OPEN. */
12268         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12269             REQUIRE_BRANCHJ(flagp, 0);
12270         }
12271     }
12272     else if (paren != '?')              /* Not Conditional */
12273         ret = br;
12274     *flagp |= flags & (HASWIDTH | POSTPONED);
12275     lastbr = br;
12276     while (*RExC_parse == '|') {
12277         if (RExC_use_BRANCHJ) {
12278             bool shut_gcc_up;
12279
12280             ender = reganode(pRExC_state, LONGJMP, 0);
12281
12282             /* Append to the previous. */
12283             shut_gcc_up = REGTAIL(pRExC_state,
12284                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12285                          ender);
12286             PERL_UNUSED_VAR(shut_gcc_up);
12287         }
12288         nextchar(pRExC_state);
12289         if (freeze_paren) {
12290             if (RExC_npar > after_freeze)
12291                 after_freeze = RExC_npar;
12292             RExC_npar = freeze_paren;
12293         }
12294         br = regbranch(pRExC_state, &flags, 0, depth+1);
12295
12296         if (br == 0) {
12297             RETURN_FAIL_ON_RESTART(flags, flagp);
12298             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12299         }
12300         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12301             REQUIRE_BRANCHJ(flagp, 0);
12302         }
12303         lastbr = br;
12304         *flagp |= flags & (HASWIDTH | POSTPONED);
12305     }
12306
12307     if (have_branch || paren != ':') {
12308         regnode * br;
12309
12310         /* Make a closing node, and hook it on the end. */
12311         switch (paren) {
12312         case ':':
12313             ender = reg_node(pRExC_state, TAIL);
12314             break;
12315         case 1: case 2:
12316             ender = reganode(pRExC_state, CLOSE, parno);
12317             if ( RExC_close_parens ) {
12318                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12319                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12320                         22, "|    |", (int)(depth * 2 + 1), "",
12321                         (IV)parno, ender));
12322                 RExC_close_parens[parno]= ender;
12323                 if (RExC_nestroot == parno)
12324                     RExC_nestroot = 0;
12325             }
12326             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12327             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12328             break;
12329         case 's':
12330             ender = reg_node(pRExC_state, SRCLOSE);
12331             RExC_in_script_run = 0;
12332             break;
12333         case '<':
12334         case 'a':
12335         case 'A':
12336         case 'b':
12337         case 'B':
12338         case ',':
12339         case '=':
12340         case '!':
12341             *flagp &= ~HASWIDTH;
12342             /* FALLTHROUGH */
12343         case 't':   /* aTomic */
12344         case '>':
12345             ender = reg_node(pRExC_state, SUCCEED);
12346             break;
12347         case 0:
12348             ender = reg_node(pRExC_state, END);
12349             assert(!RExC_end_op); /* there can only be one! */
12350             RExC_end_op = REGNODE_p(ender);
12351             if (RExC_close_parens) {
12352                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12353                     "%*s%*s Setting close paren #0 (END) to %zu\n",
12354                     22, "|    |", (int)(depth * 2 + 1), "",
12355                     ender));
12356
12357                 RExC_close_parens[0]= ender;
12358             }
12359             break;
12360         }
12361         DEBUG_PARSE_r({
12362             DEBUG_PARSE_MSG("lsbr");
12363             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12364             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12365             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12366                           SvPV_nolen_const(RExC_mysv1),
12367                           (IV)lastbr,
12368                           SvPV_nolen_const(RExC_mysv2),
12369                           (IV)ender,
12370                           (IV)(ender - lastbr)
12371             );
12372         });
12373         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12374             REQUIRE_BRANCHJ(flagp, 0);
12375         }
12376
12377         if (have_branch) {
12378             char is_nothing= 1;
12379             if (depth==1)
12380                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12381
12382             /* Hook the tails of the branches to the closing node. */
12383             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12384                 const U8 op = PL_regkind[OP(br)];
12385                 if (op == BRANCH) {
12386                     if (! REGTAIL_STUDY(pRExC_state,
12387                                         REGNODE_OFFSET(NEXTOPER(br)),
12388                                         ender))
12389                     {
12390                         REQUIRE_BRANCHJ(flagp, 0);
12391                     }
12392                     if ( OP(NEXTOPER(br)) != NOTHING
12393                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12394                         is_nothing= 0;
12395                 }
12396                 else if (op == BRANCHJ) {
12397                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12398                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12399                                         ender);
12400                     PERL_UNUSED_VAR(shut_gcc_up);
12401                     /* for now we always disable this optimisation * /
12402                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12403                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12404                     */
12405                         is_nothing= 0;
12406                 }
12407             }
12408             if (is_nothing) {
12409                 regnode * ret_as_regnode = REGNODE_p(ret);
12410                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12411                                ? regnext(ret_as_regnode)
12412                                : ret_as_regnode;
12413                 DEBUG_PARSE_r({
12414                     DEBUG_PARSE_MSG("NADA");
12415                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12416                                      NULL, pRExC_state);
12417                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12418                                      NULL, pRExC_state);
12419                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12420                                   SvPV_nolen_const(RExC_mysv1),
12421                                   (IV)REG_NODE_NUM(ret_as_regnode),
12422                                   SvPV_nolen_const(RExC_mysv2),
12423                                   (IV)ender,
12424                                   (IV)(ender - ret)
12425                     );
12426                 });
12427                 OP(br)= NOTHING;
12428                 if (OP(REGNODE_p(ender)) == TAIL) {
12429                     NEXT_OFF(br)= 0;
12430                     RExC_emit= REGNODE_OFFSET(br) + 1;
12431                 } else {
12432                     regnode *opt;
12433                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12434                         OP(opt)= OPTIMIZED;
12435                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12436                 }
12437             }
12438         }
12439     }
12440
12441     {
12442         const char *p;
12443          /* Even/odd or x=don't care: 010101x10x */
12444         static const char parens[] = "=!aA<,>Bbt";
12445          /* flag below is set to 0 up through 'A'; 1 for larger */
12446
12447         if (paren && (p = strchr(parens, paren))) {
12448             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12449             int flag = (p - parens) > 3;
12450
12451             if (paren == '>' || paren == 't') {
12452                 node = SUSPEND, flag = 0;
12453             }
12454
12455             reginsert(pRExC_state, node, ret, depth+1);
12456             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12457             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12458             FLAGS(REGNODE_p(ret)) = flag;
12459             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12460             {
12461                 REQUIRE_BRANCHJ(flagp, 0);
12462             }
12463         }
12464     }
12465
12466     /* Check for proper termination. */
12467     if (paren) {
12468         /* restore original flags, but keep (?p) and, if we've encountered
12469          * something in the parse that changes /d rules into /u, keep the /u */
12470         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12471         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12472             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12473         }
12474         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12475             RExC_parse = oregcomp_parse;
12476             vFAIL("Unmatched (");
12477         }
12478         nextchar(pRExC_state);
12479     }
12480     else if (!paren && RExC_parse < RExC_end) {
12481         if (*RExC_parse == ')') {
12482             RExC_parse++;
12483             vFAIL("Unmatched )");
12484         }
12485         else
12486             FAIL("Junk on end of regexp");      /* "Can't happen". */
12487         NOT_REACHED; /* NOTREACHED */
12488     }
12489
12490     if (RExC_in_lookbehind) {
12491         RExC_in_lookbehind--;
12492     }
12493     if (RExC_in_lookahead) {
12494         RExC_in_lookahead--;
12495     }
12496     if (after_freeze > RExC_npar)
12497         RExC_npar = after_freeze;
12498     return(ret);
12499 }
12500
12501 /*
12502  - regbranch - one alternative of an | operator
12503  *
12504  * Implements the concatenation operator.
12505  *
12506  * On success, returns the offset at which any next node should be placed into
12507  * the regex engine program being compiled.
12508  *
12509  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12510  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12511  * UTF-8
12512  */
12513 STATIC regnode_offset
12514 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12515 {
12516     regnode_offset ret;
12517     regnode_offset chain = 0;
12518     regnode_offset latest;
12519     I32 flags = 0, c = 0;
12520     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12521
12522     PERL_ARGS_ASSERT_REGBRANCH;
12523
12524     DEBUG_PARSE("brnc");
12525
12526     if (first)
12527         ret = 0;
12528     else {
12529         if (RExC_use_BRANCHJ)
12530             ret = reganode(pRExC_state, BRANCHJ, 0);
12531         else {
12532             ret = reg_node(pRExC_state, BRANCH);
12533             Set_Node_Length(REGNODE_p(ret), 1);
12534         }
12535     }
12536
12537     *flagp = 0;                 /* Initialize. */
12538
12539     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12540                             FALSE /* Don't force to /x */ );
12541     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12542         flags &= ~TRYAGAIN;
12543         latest = regpiece(pRExC_state, &flags, depth+1);
12544         if (latest == 0) {
12545             if (flags & TRYAGAIN)
12546                 continue;
12547             RETURN_FAIL_ON_RESTART(flags, flagp);
12548             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12549         }
12550         else if (ret == 0)
12551             ret = latest;
12552         *flagp |= flags&(HASWIDTH|POSTPONED);
12553         if (chain != 0) {
12554             /* FIXME adding one for every branch after the first is probably
12555              * excessive now we have TRIE support. (hv) */
12556             MARK_NAUGHTY(1);
12557             if (! REGTAIL(pRExC_state, chain, latest)) {
12558                 /* XXX We could just redo this branch, but figuring out what
12559                  * bookkeeping needs to be reset is a pain, and it's likely
12560                  * that other branches that goto END will also be too large */
12561                 REQUIRE_BRANCHJ(flagp, 0);
12562             }
12563         }
12564         chain = latest;
12565         c++;
12566     }
12567     if (chain == 0) {   /* Loop ran zero times. */
12568         chain = reg_node(pRExC_state, NOTHING);
12569         if (ret == 0)
12570             ret = chain;
12571     }
12572     if (c == 1) {
12573         *flagp |= flags&SIMPLE;
12574     }
12575
12576     return ret;
12577 }
12578
12579 /*
12580  - regpiece - something followed by possible quantifier * + ? {n,m}
12581  *
12582  * Note that the branching code sequences used for ? and the general cases
12583  * of * and + are somewhat optimized:  they use the same NOTHING node as
12584  * both the endmarker for their branch list and the body of the last branch.
12585  * It might seem that this node could be dispensed with entirely, but the
12586  * endmarker role is not redundant.
12587  *
12588  * On success, returns the offset at which any next node should be placed into
12589  * the regex engine program being compiled.
12590  *
12591  * Returns 0 otherwise, with *flagp set to indicate why:
12592  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12593  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12594  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12595  */
12596 STATIC regnode_offset
12597 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12598 {
12599     regnode_offset ret;
12600     char op;
12601     char *next;
12602     I32 flags;
12603     const char * const origparse = RExC_parse;
12604     I32 min;
12605     I32 max = REG_INFTY;
12606 #ifdef RE_TRACK_PATTERN_OFFSETS
12607     char *parse_start;
12608 #endif
12609     const char *maxpos = NULL;
12610     UV uv;
12611
12612     /* Save the original in case we change the emitted regop to a FAIL. */
12613     const regnode_offset orig_emit = RExC_emit;
12614
12615     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12616
12617     PERL_ARGS_ASSERT_REGPIECE;
12618
12619     DEBUG_PARSE("piec");
12620
12621     ret = regatom(pRExC_state, &flags, depth+1);
12622     if (ret == 0) {
12623         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12624         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12625     }
12626
12627     if (! ISMULT2(RExC_parse)) {
12628         *flagp = flags;
12629         return(ret);
12630     }
12631
12632     /* Here we know the input is a legal quantifier, including {m,n} */
12633
12634     op = *RExC_parse;
12635
12636 #ifdef RE_TRACK_PATTERN_OFFSETS
12637     parse_start = RExC_parse;
12638 #endif
12639
12640     if (op != '{') {
12641         nextchar(pRExC_state);
12642
12643         if (op == '*') {
12644             min = 0;
12645         }
12646         else if (op == '+') {
12647             min = 1;
12648         }
12649         else if (op == '?') {
12650             min = 0; max = 1;
12651         }
12652     }
12653     else {  /* is '{' */
12654         const char* endptr;
12655
12656         maxpos = NULL;
12657         next = RExC_parse + 1;
12658         while (isDIGIT(*next) || *next == ',') {
12659             if (*next == ',') {
12660                 if (maxpos)
12661                     break;
12662                 else
12663                     maxpos = next;
12664             }
12665             next++;
12666         }
12667
12668         assert(*next == '}');
12669
12670         if (!maxpos)
12671             maxpos = next;
12672         RExC_parse++;
12673         if (isDIGIT(*RExC_parse)) {
12674             endptr = RExC_end;
12675             if (!grok_atoUV(RExC_parse, &uv, &endptr))
12676                 vFAIL("Invalid quantifier in {,}");
12677             if (uv >= REG_INFTY)
12678                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12679             min = (I32)uv;
12680         } else {
12681             min = 0;
12682         }
12683         if (*maxpos == ',')
12684             maxpos++;
12685         else
12686             maxpos = RExC_parse;
12687         if (isDIGIT(*maxpos)) {
12688             endptr = RExC_end;
12689             if (!grok_atoUV(maxpos, &uv, &endptr))
12690                 vFAIL("Invalid quantifier in {,}");
12691             if (uv >= REG_INFTY)
12692                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12693             max = (I32)uv;
12694         } else {
12695             max = REG_INFTY;            /* meaning "infinity" */
12696         }
12697         RExC_parse = next;
12698         nextchar(pRExC_state);
12699         if (max < min) {    /* If can't match, warn and optimize to fail
12700                                unconditionally */
12701             reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12702             ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12703             NEXT_OFF(REGNODE_p(orig_emit)) =
12704                                 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12705             return ret;
12706         }
12707         else if (min == max && *RExC_parse == '?')
12708         {
12709             ckWARN2reg(RExC_parse + 1,
12710                        "Useless use of greediness modifier '%c'",
12711                        *RExC_parse);
12712         }
12713     }
12714
12715     /* Here we have a quantifier, and have calculated 'min' and 'max'.
12716      *
12717      * Check and possibly adjust a zero width operand */
12718     if (! (flags & (HASWIDTH|POSTPONED))) {
12719         if (max > REG_INFTY/3) {
12720             if (origparse[0] == '\\' && origparse[1] == 'K') {
12721                 vFAIL2utf8f(
12722                            "%" UTF8f " is forbidden - matches null string"
12723                            " many times",
12724                            UTF8fARG(UTF, (RExC_parse >= origparse
12725                                          ? RExC_parse - origparse
12726                                          : 0),
12727                            origparse));
12728             } else {
12729                 ckWARN2reg(RExC_parse,
12730                            "%" UTF8f " matches null string many times",
12731                            UTF8fARG(UTF, (RExC_parse >= origparse
12732                                          ? RExC_parse - origparse
12733                                          : 0),
12734                            origparse));
12735             }
12736         }
12737
12738         /* There's no point in trying to match something 0 length more than
12739          * once except for extra side effects, which we don't have here since
12740          * not POSTPONED */
12741         if (max > 1) {
12742             max = 1;
12743             if (min > max) {
12744                 min = max;
12745             }
12746         }
12747     }
12748
12749     /* If this is a code block pass it up */
12750     *flagp |= (flags & POSTPONED);
12751
12752     if (max > 0)
12753         *flagp |= (flags & HASWIDTH);
12754
12755     if ((flags&SIMPLE)) {
12756         if (min == 0 && max == REG_INFTY) {
12757
12758             /* Going from 0..inf is currently forbidden in wildcard
12759              * subpatterns.  The only reason is to make it harder to
12760              * write patterns that take a long long time to halt, and
12761              * because the use of this construct isn't necessary in
12762              * matching Unicode property values */
12763             if (RExC_pm_flags & PMf_WILDCARD) {
12764                 RExC_parse++;
12765                 /* diag_listed_as: Use of %s is not allowed in Unicode
12766                    property wildcard subpatterns in regex; marked by
12767                    <-- HERE in m/%s/ */
12768                 vFAIL("Use of quantifier '*' is not allowed in"
12769                       " Unicode property wildcard subpatterns");
12770                 /* Note, don't need to worry about {0,}, as a '}' isn't
12771                  * legal at all in wildcards, so wouldn't get this far
12772                  * */
12773             }
12774             reginsert(pRExC_state, STAR, ret, depth+1);
12775             MARK_NAUGHTY(4);
12776             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12777             goto done_main_op;
12778         }
12779         if (min == 1 && max == REG_INFTY) {
12780             reginsert(pRExC_state, PLUS, ret, depth+1);
12781             MARK_NAUGHTY(3);
12782             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12783             goto done_main_op;
12784         }
12785         MARK_NAUGHTY_EXP(2, 2);
12786         reginsert(pRExC_state, CURLY, ret, depth+1);
12787         Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12788         Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12789     }
12790     else {
12791         const regnode_offset w = reg_node(pRExC_state, WHILEM);
12792
12793         FLAGS(REGNODE_p(w)) = 0;
12794         if (!  REGTAIL(pRExC_state, ret, w)) {
12795             REQUIRE_BRANCHJ(flagp, 0);
12796         }
12797         if (RExC_use_BRANCHJ) {
12798             reginsert(pRExC_state, LONGJMP, ret, depth+1);
12799             reginsert(pRExC_state, NOTHING, ret, depth+1);
12800             NEXT_OFF(REGNODE_p(ret)) = 3;        /* Go over LONGJMP. */
12801         }
12802         reginsert(pRExC_state, CURLYX, ret, depth+1);
12803                         /* MJD hk */
12804         Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12805         Set_Node_Length(REGNODE_p(ret),
12806                         op == '{' ? (RExC_parse - parse_start) : 1);
12807
12808         if (RExC_use_BRANCHJ)
12809             NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12810                                                LONGJMP. */
12811         if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12812                                                   NOTHING)))
12813         {
12814             REQUIRE_BRANCHJ(flagp, 0);
12815         }
12816         RExC_whilem_seen++;
12817         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12818     }
12819     FLAGS(REGNODE_p(ret)) = 0;
12820
12821     ARG1_SET(REGNODE_p(ret), (U16)min);
12822     ARG2_SET(REGNODE_p(ret), (U16)max);
12823     if (max == REG_INFTY)
12824         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12825
12826   done_main_op:
12827
12828     if (*RExC_parse == '?') {
12829         nextchar(pRExC_state);
12830         reginsert(pRExC_state, MINMOD, ret, depth+1);
12831         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12832             REQUIRE_BRANCHJ(flagp, 0);
12833         }
12834     }
12835     else if (*RExC_parse == '+') {
12836         regnode_offset ender;
12837         nextchar(pRExC_state);
12838         ender = reg_node(pRExC_state, SUCCEED);
12839         if (! REGTAIL(pRExC_state, ret, ender)) {
12840             REQUIRE_BRANCHJ(flagp, 0);
12841         }
12842         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12843         ender = reg_node(pRExC_state, TAIL);
12844         if (! REGTAIL(pRExC_state, ret, ender)) {
12845             REQUIRE_BRANCHJ(flagp, 0);
12846         }
12847     }
12848
12849     if (ISMULT2(RExC_parse)) {
12850         RExC_parse++;
12851         vFAIL("Nested quantifiers");
12852     }
12853
12854     return(ret);
12855 }
12856
12857 STATIC bool
12858 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12859                 regnode_offset * node_p,
12860                 UV * code_point_p,
12861                 int * cp_count,
12862                 I32 * flagp,
12863                 const bool strict,
12864                 const U32 depth
12865     )
12866 {
12867  /* This routine teases apart the various meanings of \N and returns
12868   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12869   * in the current context.
12870   *
12871   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12872   *
12873   * If <code_point_p> is not NULL, the context is expecting the result to be a
12874   * single code point.  If this \N instance turns out to a single code point,
12875   * the function returns TRUE and sets *code_point_p to that code point.
12876   *
12877   * If <node_p> is not NULL, the context is expecting the result to be one of
12878   * the things representable by a regnode.  If this \N instance turns out to be
12879   * one such, the function generates the regnode, returns TRUE and sets *node_p
12880   * to point to the offset of that regnode into the regex engine program being
12881   * compiled.
12882   *
12883   * If this instance of \N isn't legal in any context, this function will
12884   * generate a fatal error and not return.
12885   *
12886   * On input, RExC_parse should point to the first char following the \N at the
12887   * time of the call.  On successful return, RExC_parse will have been updated
12888   * to point to just after the sequence identified by this routine.  Also
12889   * *flagp has been updated as needed.
12890   *
12891   * When there is some problem with the current context and this \N instance,
12892   * the function returns FALSE, without advancing RExC_parse, nor setting
12893   * *node_p, nor *code_point_p, nor *flagp.
12894   *
12895   * If <cp_count> is not NULL, the caller wants to know the length (in code
12896   * points) that this \N sequence matches.  This is set, and the input is
12897   * parsed for errors, even if the function returns FALSE, as detailed below.
12898   *
12899   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12900   *
12901   * Probably the most common case is for the \N to specify a single code point.
12902   * *cp_count will be set to 1, and *code_point_p will be set to that code
12903   * point.
12904   *
12905   * Another possibility is for the input to be an empty \N{}.  This is no
12906   * longer accepted, and will generate a fatal error.
12907   *
12908   * Another possibility is for a custom charnames handler to be in effect which
12909   * translates the input name to an empty string.  *cp_count will be set to 0.
12910   * *node_p will be set to a generated NOTHING node.
12911   *
12912   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12913   * set to 0. *node_p will be set to a generated REG_ANY node.
12914   *
12915   * The fifth possibility is that \N resolves to a sequence of more than one
12916   * code points.  *cp_count will be set to the number of code points in the
12917   * sequence. *node_p will be set to a generated node returned by this
12918   * function calling S_reg().
12919   *
12920   * The final possibility is that it is premature to be calling this function;
12921   * the parse needs to be restarted.  This can happen when this changes from
12922   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12923   * latter occurs only when the fifth possibility would otherwise be in
12924   * effect, and is because one of those code points requires the pattern to be
12925   * recompiled as UTF-8.  The function returns FALSE, and sets the
12926   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12927   * happens, the caller needs to desist from continuing parsing, and return
12928   * this information to its caller.  This is not set for when there is only one
12929   * code point, as this can be called as part of an ANYOF node, and they can
12930   * store above-Latin1 code points without the pattern having to be in UTF-8.
12931   *
12932   * For non-single-quoted regexes, the tokenizer has resolved character and
12933   * sequence names inside \N{...} into their Unicode values, normalizing the
12934   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12935   * hex-represented code points in the sequence.  This is done there because
12936   * the names can vary based on what charnames pragma is in scope at the time,
12937   * so we need a way to take a snapshot of what they resolve to at the time of
12938   * the original parse. [perl #56444].
12939   *
12940   * That parsing is skipped for single-quoted regexes, so here we may get
12941   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12942   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12943   * the native character set for non-ASCII platforms.  The other possibilities
12944   * are already native, so no translation is done. */
12945
12946     char * endbrace;    /* points to '}' following the name */
12947     char* p = RExC_parse; /* Temporary */
12948
12949     SV * substitute_parse = NULL;
12950     char *orig_end;
12951     char *save_start;
12952     I32 flags;
12953
12954     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12955
12956     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12957
12958     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12959     assert(! (node_p && cp_count));               /* At most 1 should be set */
12960
12961     if (cp_count) {     /* Initialize return for the most common case */
12962         *cp_count = 1;
12963     }
12964
12965     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12966      * modifier.  The other meanings do not, so use a temporary until we find
12967      * out which we are being called with */
12968     skip_to_be_ignored_text(pRExC_state, &p,
12969                             FALSE /* Don't force to /x */ );
12970
12971     /* Disambiguate between \N meaning a named character versus \N meaning
12972      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12973      * quantifier, or if there is no '{' at all */
12974     if (*p != '{' || regcurly(p)) {
12975         RExC_parse = p;
12976         if (cp_count) {
12977             *cp_count = -1;
12978         }
12979
12980         if (! node_p) {
12981             return FALSE;
12982         }
12983
12984         *node_p = reg_node(pRExC_state, REG_ANY);
12985         *flagp |= HASWIDTH|SIMPLE;
12986         MARK_NAUGHTY(1);
12987         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12988         return TRUE;
12989     }
12990
12991     /* The test above made sure that the next real character is a '{', but
12992      * under the /x modifier, it could be separated by space (or a comment and
12993      * \n) and this is not allowed (for consistency with \x{...} and the
12994      * tokenizer handling of \N{NAME}). */
12995     if (*RExC_parse != '{') {
12996         vFAIL("Missing braces on \\N{}");
12997     }
12998
12999     RExC_parse++;       /* Skip past the '{' */
13000
13001     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13002     if (! endbrace) { /* no trailing brace */
13003         vFAIL2("Missing right brace on \\%c{}", 'N');
13004     }
13005
13006     /* Here, we have decided it should be a named character or sequence.  These
13007      * imply Unicode semantics */
13008     REQUIRE_UNI_RULES(flagp, FALSE);
13009
13010     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13011      * nothing at all (not allowed under strict) */
13012     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13013         RExC_parse = endbrace;
13014         if (strict) {
13015             RExC_parse++;   /* Position after the "}" */
13016             vFAIL("Zero length \\N{}");
13017         }
13018
13019         if (cp_count) {
13020             *cp_count = 0;
13021         }
13022         nextchar(pRExC_state);
13023         if (! node_p) {
13024             return FALSE;
13025         }
13026
13027         *node_p = reg_node(pRExC_state, NOTHING);
13028         return TRUE;
13029     }
13030
13031     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13032
13033         /* Here, the name isn't of the form  U+....  This can happen if the
13034          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
13035          * is the time to find out what the name means */
13036
13037         const STRLEN name_len = endbrace - RExC_parse;
13038         SV *  value_sv;     /* What does this name evaluate to */
13039         SV ** value_svp;
13040         const U8 * value;   /* string of name's value */
13041         STRLEN value_len;   /* and its length */
13042
13043         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
13044          *  toke.c, and their values. Make sure is initialized */
13045         if (! RExC_unlexed_names) {
13046             RExC_unlexed_names = newHV();
13047         }
13048
13049         /* If we have already seen this name in this pattern, use that.  This
13050          * allows us to only call the charnames handler once per name per
13051          * pattern.  A broken or malicious handler could return something
13052          * different each time, which could cause the results to vary depending
13053          * on if something gets added or subtracted from the pattern that
13054          * causes the number of passes to change, for example */
13055         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13056                                                       name_len, 0)))
13057         {
13058             value_sv = *value_svp;
13059         }
13060         else { /* Otherwise we have to go out and get the name */
13061             const char * error_msg = NULL;
13062             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13063                                                       UTF,
13064                                                       &error_msg);
13065             if (error_msg) {
13066                 RExC_parse = endbrace;
13067                 vFAIL(error_msg);
13068             }
13069
13070             /* If no error message, should have gotten a valid return */
13071             assert (value_sv);
13072
13073             /* Save the name's meaning for later use */
13074             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13075                            value_sv, 0))
13076             {
13077                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13078             }
13079         }
13080
13081         /* Here, we have the value the name evaluates to in 'value_sv' */
13082         value = (U8 *) SvPV(value_sv, value_len);
13083
13084         /* See if the result is one code point vs 0 or multiple */
13085         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13086                                   ? UTF8SKIP(value)
13087                                   : 1)))
13088         {
13089             /* Here, exactly one code point.  If that isn't what is wanted,
13090              * fail */
13091             if (! code_point_p) {
13092                 RExC_parse = p;
13093                 return FALSE;
13094             }
13095
13096             /* Convert from string to numeric code point */
13097             *code_point_p = (SvUTF8(value_sv))
13098                             ? valid_utf8_to_uvchr(value, NULL)
13099                             : *value;
13100
13101             /* Have parsed this entire single code point \N{...}.  *cp_count
13102              * has already been set to 1, so don't do it again. */
13103             RExC_parse = endbrace;
13104             nextchar(pRExC_state);
13105             return TRUE;
13106         } /* End of is a single code point */
13107
13108         /* Count the code points, if caller desires.  The API says to do this
13109          * even if we will later return FALSE */
13110         if (cp_count) {
13111             *cp_count = 0;
13112
13113             *cp_count = (SvUTF8(value_sv))
13114                         ? utf8_length(value, value + value_len)
13115                         : value_len;
13116         }
13117
13118         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13119          * But don't back the pointer up if the caller wants to know how many
13120          * code points there are (they need to handle it themselves in this
13121          * case).  */
13122         if (! node_p) {
13123             if (! cp_count) {
13124                 RExC_parse = p;
13125             }
13126             return FALSE;
13127         }
13128
13129         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13130          * reg recursively to parse it.  That way, it retains its atomicness,
13131          * while not having to worry about any special handling that some code
13132          * points may have. */
13133
13134         substitute_parse = newSVpvs("?:");
13135         sv_catsv(substitute_parse, value_sv);
13136         sv_catpv(substitute_parse, ")");
13137
13138         /* The value should already be native, so no need to convert on EBCDIC
13139          * platforms.*/
13140         assert(! RExC_recode_x_to_native);
13141
13142     }
13143     else {   /* \N{U+...} */
13144         Size_t count = 0;   /* code point count kept internally */
13145
13146         /* We can get to here when the input is \N{U+...} or when toke.c has
13147          * converted a name to the \N{U+...} form.  This include changing a
13148          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13149
13150         RExC_parse += 2;    /* Skip past the 'U+' */
13151
13152         /* Code points are separated by dots.  The '}' terminates the whole
13153          * thing. */
13154
13155         do {    /* Loop until the ending brace */
13156             I32 flags = PERL_SCAN_SILENT_OVERFLOW
13157                       | PERL_SCAN_SILENT_ILLDIGIT
13158                       | PERL_SCAN_NOTIFY_ILLDIGIT
13159                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13160                       | PERL_SCAN_DISALLOW_PREFIX;
13161             STRLEN len = endbrace - RExC_parse;
13162             NV overflow_value;
13163             char * start_digit = RExC_parse;
13164             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13165
13166             if (len == 0) {
13167                 RExC_parse++;
13168               bad_NU:
13169                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13170             }
13171
13172             RExC_parse += len;
13173
13174             if (cp > MAX_LEGAL_CP) {
13175                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13176             }
13177
13178             if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13179                 if (count) {
13180                     goto do_concat;
13181                 }
13182
13183                 /* Here, is a single code point; fail if doesn't want that */
13184                 if (! code_point_p) {
13185                     RExC_parse = p;
13186                     return FALSE;
13187                 }
13188
13189                 /* A single code point is easy to handle; just return it */
13190                 *code_point_p = UNI_TO_NATIVE(cp);
13191                 RExC_parse = endbrace;
13192                 nextchar(pRExC_state);
13193                 return TRUE;
13194             }
13195
13196             /* Here, the parse stopped bfore the ending brace.  This is legal
13197              * only if that character is a dot separating code points, like a
13198              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13199              * So the next character must be a dot (and the one after that
13200              * can't be the endbrace, or we'd have something like \N{U+100.} )
13201              * */
13202             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13203                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13204                               ? UTF8SKIP(RExC_parse)
13205                               : 1;
13206                 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13207                                                           malformed utf8 */
13208                 goto bad_NU;
13209             }
13210
13211             /* Here, looks like its really a multiple character sequence.  Fail
13212              * if that's not what the caller wants.  But continue with counting
13213              * and error checking if they still want a count */
13214             if (! node_p && ! cp_count) {
13215                 return FALSE;
13216             }
13217
13218             /* What is done here is to convert this to a sub-pattern of the
13219              * form \x{char1}\x{char2}...  and then call reg recursively to
13220              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13221              * atomicness, while not having to worry about special handling
13222              * that some code points may have.  We don't create a subpattern,
13223              * but go through the motions of code point counting and error
13224              * checking, if the caller doesn't want a node returned. */
13225
13226             if (node_p && ! substitute_parse) {
13227                 substitute_parse = newSVpvs("?:");
13228             }
13229
13230           do_concat:
13231
13232             if (node_p) {
13233                 /* Convert to notation the rest of the code understands */
13234                 sv_catpvs(substitute_parse, "\\x{");
13235                 sv_catpvn(substitute_parse, start_digit,
13236                                             RExC_parse - start_digit);
13237                 sv_catpvs(substitute_parse, "}");
13238             }
13239
13240             /* Move to after the dot (or ending brace the final time through.)
13241              * */
13242             RExC_parse++;
13243             count++;
13244
13245         } while (RExC_parse < endbrace);
13246
13247         if (! node_p) { /* Doesn't want the node */
13248             assert (cp_count);
13249
13250             *cp_count = count;
13251             return FALSE;
13252         }
13253
13254         sv_catpvs(substitute_parse, ")");
13255
13256         /* The values are Unicode, and therefore have to be converted to native
13257          * on a non-Unicode (meaning non-ASCII) platform. */
13258         SET_recode_x_to_native(1);
13259     }
13260
13261     /* Here, we have the string the name evaluates to, ready to be parsed,
13262      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13263      * constructs.  This can be called from within a substitute parse already.
13264      * The error reporting mechanism doesn't work for 2 levels of this, but the
13265      * code above has validated this new construct, so there should be no
13266      * errors generated by the below.  And this isn' an exact copy, so the
13267      * mechanism to seamlessly deal with this won't work, so turn off warnings
13268      * during it */
13269     save_start = RExC_start;
13270     orig_end = RExC_end;
13271
13272     RExC_parse = RExC_start = SvPVX(substitute_parse);
13273     RExC_end = RExC_parse + SvCUR(substitute_parse);
13274     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13275
13276     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13277
13278     /* Restore the saved values */
13279     RESTORE_WARNINGS;
13280     RExC_start = save_start;
13281     RExC_parse = endbrace;
13282     RExC_end = orig_end;
13283     SET_recode_x_to_native(0);
13284
13285     SvREFCNT_dec_NN(substitute_parse);
13286
13287     if (! *node_p) {
13288         RETURN_FAIL_ON_RESTART(flags, flagp);
13289         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13290             (UV) flags);
13291     }
13292     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13293
13294     nextchar(pRExC_state);
13295
13296     return TRUE;
13297 }
13298
13299
13300 STATIC U8
13301 S_compute_EXACTish(RExC_state_t *pRExC_state)
13302 {
13303     U8 op;
13304
13305     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13306
13307     if (! FOLD) {
13308         return (LOC)
13309                 ? EXACTL
13310                 : EXACT;
13311     }
13312
13313     op = get_regex_charset(RExC_flags);
13314     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13315         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13316                  been, so there is no hole */
13317     }
13318
13319     return op + EXACTF;
13320 }
13321
13322 STATIC bool
13323 S_new_regcurly(const char *s, const char *e)
13324 {
13325     /* This is a temporary function designed to match the most lenient form of
13326      * a {m,n} quantifier we ever envision, with either number omitted, and
13327      * spaces anywhere between/before/after them.
13328      *
13329      * If this function fails, then the string it matches is very unlikely to
13330      * ever be considered a valid quantifier, so we can allow the '{' that
13331      * begins it to be considered as a literal */
13332
13333     bool has_min = FALSE;
13334     bool has_max = FALSE;
13335
13336     PERL_ARGS_ASSERT_NEW_REGCURLY;
13337
13338     if (s >= e || *s++ != '{')
13339         return FALSE;
13340
13341     while (s < e && isSPACE(*s)) {
13342         s++;
13343     }
13344     while (s < e && isDIGIT(*s)) {
13345         has_min = TRUE;
13346         s++;
13347     }
13348     while (s < e && isSPACE(*s)) {
13349         s++;
13350     }
13351
13352     if (*s == ',') {
13353         s++;
13354         while (s < e && isSPACE(*s)) {
13355             s++;
13356         }
13357         while (s < e && isDIGIT(*s)) {
13358             has_max = TRUE;
13359             s++;
13360         }
13361         while (s < e && isSPACE(*s)) {
13362             s++;
13363         }
13364     }
13365
13366     return s < e && *s == '}' && (has_min || has_max);
13367 }
13368
13369 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13370  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13371
13372 static I32
13373 S_backref_value(char *p, char *e)
13374 {
13375     const char* endptr = e;
13376     UV val;
13377     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13378         return (I32)val;
13379     return I32_MAX;
13380 }
13381
13382
13383 /*
13384  - regatom - the lowest level
13385
13386    Try to identify anything special at the start of the current parse position.
13387    If there is, then handle it as required. This may involve generating a
13388    single regop, such as for an assertion; or it may involve recursing, such as
13389    to handle a () structure.
13390
13391    If the string doesn't start with something special then we gobble up
13392    as much literal text as we can.  If we encounter a quantifier, we have to
13393    back off the final literal character, as that quantifier applies to just it
13394    and not to the whole string of literals.
13395
13396    Once we have been able to handle whatever type of thing started the
13397    sequence, we return the offset into the regex engine program being compiled
13398    at which any  next regnode should be placed.
13399
13400    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13401    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13402    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13403    Otherwise does not return 0.
13404
13405    Note: we have to be careful with escapes, as they can be both literal
13406    and special, and in the case of \10 and friends, context determines which.
13407
13408    A summary of the code structure is:
13409
13410    switch (first_byte) {
13411         cases for each special:
13412             handle this special;
13413             break;
13414         case '\\':
13415             switch (2nd byte) {
13416                 cases for each unambiguous special:
13417                     handle this special;
13418                     break;
13419                 cases for each ambigous special/literal:
13420                     disambiguate;
13421                     if (special)  handle here
13422                     else goto defchar;
13423                 default: // unambiguously literal:
13424                     goto defchar;
13425             }
13426         default:  // is a literal char
13427             // FALL THROUGH
13428         defchar:
13429             create EXACTish node for literal;
13430             while (more input and node isn't full) {
13431                 switch (input_byte) {
13432                    cases for each special;
13433                        make sure parse pointer is set so that the next call to
13434                            regatom will see this special first
13435                        goto loopdone; // EXACTish node terminated by prev. char
13436                    default:
13437                        append char to EXACTISH node;
13438                 }
13439                 get next input byte;
13440             }
13441         loopdone:
13442    }
13443    return the generated node;
13444
13445    Specifically there are two separate switches for handling
13446    escape sequences, with the one for handling literal escapes requiring
13447    a dummy entry for all of the special escapes that are actually handled
13448    by the other.
13449
13450 */
13451
13452 STATIC regnode_offset
13453 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13454 {
13455     regnode_offset ret = 0;
13456     I32 flags = 0;
13457     char *parse_start;
13458     U8 op;
13459     int invert = 0;
13460
13461     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13462
13463     *flagp = 0;         /* Initialize. */
13464
13465     DEBUG_PARSE("atom");
13466
13467     PERL_ARGS_ASSERT_REGATOM;
13468
13469   tryagain:
13470     parse_start = RExC_parse;
13471     assert(RExC_parse < RExC_end);
13472     switch ((U8)*RExC_parse) {
13473     case '^':
13474         RExC_seen_zerolen++;
13475         nextchar(pRExC_state);
13476         if (RExC_flags & RXf_PMf_MULTILINE)
13477             ret = reg_node(pRExC_state, MBOL);
13478         else
13479             ret = reg_node(pRExC_state, SBOL);
13480         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13481         break;
13482     case '$':
13483         nextchar(pRExC_state);
13484         if (*RExC_parse)
13485             RExC_seen_zerolen++;
13486         if (RExC_flags & RXf_PMf_MULTILINE)
13487             ret = reg_node(pRExC_state, MEOL);
13488         else
13489             ret = reg_node(pRExC_state, SEOL);
13490         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13491         break;
13492     case '.':
13493         nextchar(pRExC_state);
13494         if (RExC_flags & RXf_PMf_SINGLELINE)
13495             ret = reg_node(pRExC_state, SANY);
13496         else
13497             ret = reg_node(pRExC_state, REG_ANY);
13498         *flagp |= HASWIDTH|SIMPLE;
13499         MARK_NAUGHTY(1);
13500         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13501         break;
13502     case '[':
13503     {
13504         char * const oregcomp_parse = ++RExC_parse;
13505         ret = regclass(pRExC_state, flagp, depth+1,
13506                        FALSE, /* means parse the whole char class */
13507                        TRUE, /* allow multi-char folds */
13508                        FALSE, /* don't silence non-portable warnings. */
13509                        (bool) RExC_strict,
13510                        TRUE, /* Allow an optimized regnode result */
13511                        NULL);
13512         if (ret == 0) {
13513             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13514             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13515                   (UV) *flagp);
13516         }
13517         if (*RExC_parse != ']') {
13518             RExC_parse = oregcomp_parse;
13519             vFAIL("Unmatched [");
13520         }
13521         nextchar(pRExC_state);
13522         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13523         break;
13524     }
13525     case '(':
13526         nextchar(pRExC_state);
13527         ret = reg(pRExC_state, 2, &flags, depth+1);
13528         if (ret == 0) {
13529                 if (flags & TRYAGAIN) {
13530                     if (RExC_parse >= RExC_end) {
13531                          /* Make parent create an empty node if needed. */
13532                         *flagp |= TRYAGAIN;
13533                         return(0);
13534                     }
13535                     goto tryagain;
13536                 }
13537                 RETURN_FAIL_ON_RESTART(flags, flagp);
13538                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13539                                                                  (UV) flags);
13540         }
13541         *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13542         break;
13543     case '|':
13544     case ')':
13545         if (flags & TRYAGAIN) {
13546             *flagp |= TRYAGAIN;
13547             return 0;
13548         }
13549         vFAIL("Internal urp");
13550                                 /* Supposed to be caught earlier. */
13551         break;
13552     case '?':
13553     case '+':
13554     case '*':
13555         RExC_parse++;
13556         vFAIL("Quantifier follows nothing");
13557         break;
13558     case '\\':
13559         /* Special Escapes
13560
13561            This switch handles escape sequences that resolve to some kind
13562            of special regop and not to literal text. Escape sequences that
13563            resolve to literal text are handled below in the switch marked
13564            "Literal Escapes".
13565
13566            Every entry in this switch *must* have a corresponding entry
13567            in the literal escape switch. However, the opposite is not
13568            required, as the default for this switch is to jump to the
13569            literal text handling code.
13570         */
13571         RExC_parse++;
13572         switch ((U8)*RExC_parse) {
13573         /* Special Escapes */
13574         case 'A':
13575             RExC_seen_zerolen++;
13576             /* Under wildcards, this is changed to match \n; should be
13577              * invisible to the user, as they have to compile under /m */
13578             if (RExC_pm_flags & PMf_WILDCARD) {
13579                 ret = reg_node(pRExC_state, MBOL);
13580             }
13581             else {
13582                 ret = reg_node(pRExC_state, SBOL);
13583                 /* SBOL is shared with /^/ so we set the flags so we can tell
13584                  * /\A/ from /^/ in split. */
13585                 FLAGS(REGNODE_p(ret)) = 1;
13586                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13587             }
13588             goto finish_meta_pat;
13589         case 'G':
13590             if (RExC_pm_flags & PMf_WILDCARD) {
13591                 RExC_parse++;
13592                 /* diag_listed_as: Use of %s is not allowed in Unicode property
13593                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13594                  */
13595                 vFAIL("Use of '\\G' is not allowed in Unicode property"
13596                       " wildcard subpatterns");
13597             }
13598             ret = reg_node(pRExC_state, GPOS);
13599             RExC_seen |= REG_GPOS_SEEN;
13600             goto finish_meta_pat;
13601         case 'K':
13602             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13603                 RExC_seen_zerolen++;
13604                 ret = reg_node(pRExC_state, KEEPS);
13605                 /* XXX:dmq : disabling in-place substitution seems to
13606                  * be necessary here to avoid cases of memory corruption, as
13607                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13608                  */
13609                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13610                 goto finish_meta_pat;
13611             }
13612             else {
13613                 ++RExC_parse; /* advance past the 'K' */
13614                 vFAIL("\\K not permitted in lookahead/lookbehind");
13615             }
13616         case 'Z':
13617             if (RExC_pm_flags & PMf_WILDCARD) {
13618                 /* See comment under \A above */
13619                 ret = reg_node(pRExC_state, MEOL);
13620             }
13621             else {
13622                 ret = reg_node(pRExC_state, SEOL);
13623                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13624             }
13625             RExC_seen_zerolen++;                /* Do not optimize RE away */
13626             goto finish_meta_pat;
13627         case 'z':
13628             if (RExC_pm_flags & PMf_WILDCARD) {
13629                 /* See comment under \A above */
13630                 ret = reg_node(pRExC_state, MEOL);
13631             }
13632             else {
13633                 ret = reg_node(pRExC_state, EOS);
13634                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13635             }
13636             RExC_seen_zerolen++;                /* Do not optimize RE away */
13637             goto finish_meta_pat;
13638         case 'C':
13639             vFAIL("\\C no longer supported");
13640         case 'X':
13641             ret = reg_node(pRExC_state, CLUMP);
13642             *flagp |= HASWIDTH;
13643             goto finish_meta_pat;
13644
13645         case 'B':
13646             invert = 1;
13647             /* FALLTHROUGH */
13648         case 'b':
13649           {
13650             U8 flags = 0;
13651             regex_charset charset = get_regex_charset(RExC_flags);
13652
13653             RExC_seen_zerolen++;
13654             RExC_seen |= REG_LOOKBEHIND_SEEN;
13655             op = BOUND + charset;
13656
13657             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13658                 flags = TRADITIONAL_BOUND;
13659                 if (op > BOUNDA) {  /* /aa is same as /a */
13660                     op = BOUNDA;
13661                 }
13662             }
13663             else {
13664                 STRLEN length;
13665                 char name = *RExC_parse;
13666                 char * endbrace = NULL;
13667                 RExC_parse += 2;
13668                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13669
13670                 if (! endbrace) {
13671                     vFAIL2("Missing right brace on \\%c{}", name);
13672                 }
13673                 /* XXX Need to decide whether to take spaces or not.  Should be
13674                  * consistent with \p{}, but that currently is SPACE, which
13675                  * means vertical too, which seems wrong
13676                  * while (isBLANK(*RExC_parse)) {
13677                     RExC_parse++;
13678                 }*/
13679                 if (endbrace == RExC_parse) {
13680                     RExC_parse++;  /* After the '}' */
13681                     vFAIL2("Empty \\%c{}", name);
13682                 }
13683                 length = endbrace - RExC_parse;
13684                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13685                     length--;
13686                 }*/
13687                 switch (*RExC_parse) {
13688                     case 'g':
13689                         if (    length != 1
13690                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13691                         {
13692                             goto bad_bound_type;
13693                         }
13694                         flags = GCB_BOUND;
13695                         break;
13696                     case 'l':
13697                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13698                             goto bad_bound_type;
13699                         }
13700                         flags = LB_BOUND;
13701                         break;
13702                     case 's':
13703                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13704                             goto bad_bound_type;
13705                         }
13706                         flags = SB_BOUND;
13707                         break;
13708                     case 'w':
13709                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13710                             goto bad_bound_type;
13711                         }
13712                         flags = WB_BOUND;
13713                         break;
13714                     default:
13715                       bad_bound_type:
13716                         RExC_parse = endbrace;
13717                         vFAIL2utf8f(
13718                             "'%" UTF8f "' is an unknown bound type",
13719                             UTF8fARG(UTF, length, endbrace - length));
13720                         NOT_REACHED; /*NOTREACHED*/
13721                 }
13722                 RExC_parse = endbrace;
13723                 REQUIRE_UNI_RULES(flagp, 0);
13724
13725                 if (op == BOUND) {
13726                     op = BOUNDU;
13727                 }
13728                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13729                     op = BOUNDU;
13730                     length += 4;
13731
13732                     /* Don't have to worry about UTF-8, in this message because
13733                      * to get here the contents of the \b must be ASCII */
13734                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13735                               "Using /u for '%.*s' instead of /%s",
13736                               (unsigned) length,
13737                               endbrace - length + 1,
13738                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13739                               ? ASCII_RESTRICT_PAT_MODS
13740                               : ASCII_MORE_RESTRICT_PAT_MODS);
13741                 }
13742             }
13743
13744             if (op == BOUND) {
13745                 RExC_seen_d_op = TRUE;
13746             }
13747             else if (op == BOUNDL) {
13748                 RExC_contains_locale = 1;
13749             }
13750
13751             if (invert) {
13752                 op += NBOUND - BOUND;
13753             }
13754
13755             ret = reg_node(pRExC_state, op);
13756             FLAGS(REGNODE_p(ret)) = flags;
13757
13758             goto finish_meta_pat;
13759           }
13760
13761         case 'R':
13762             ret = reg_node(pRExC_state, LNBREAK);
13763             *flagp |= HASWIDTH|SIMPLE;
13764             goto finish_meta_pat;
13765
13766         case 'd':
13767         case 'D':
13768         case 'h':
13769         case 'H':
13770         case 'p':
13771         case 'P':
13772         case 's':
13773         case 'S':
13774         case 'v':
13775         case 'V':
13776         case 'w':
13777         case 'W':
13778             /* These all have the same meaning inside [brackets], and it knows
13779              * how to do the best optimizations for them.  So, pretend we found
13780              * these within brackets, and let it do the work */
13781             RExC_parse--;
13782
13783             ret = regclass(pRExC_state, flagp, depth+1,
13784                            TRUE, /* means just parse this element */
13785                            FALSE, /* don't allow multi-char folds */
13786                            FALSE, /* don't silence non-portable warnings.  It
13787                                      would be a bug if these returned
13788                                      non-portables */
13789                            (bool) RExC_strict,
13790                            TRUE, /* Allow an optimized regnode result */
13791                            NULL);
13792             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13793             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13794              * multi-char folds are allowed.  */
13795             if (!ret)
13796                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13797                       (UV) *flagp);
13798
13799             RExC_parse--;   /* regclass() leaves this one too far ahead */
13800
13801           finish_meta_pat:
13802                    /* The escapes above that don't take a parameter can't be
13803                     * followed by a '{'.  But 'pX', 'p{foo}' and
13804                     * correspondingly 'P' can be */
13805             if (   RExC_parse - parse_start == 1
13806                 && UCHARAT(RExC_parse + 1) == '{'
13807                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13808             {
13809                 RExC_parse += 2;
13810                 vFAIL("Unescaped left brace in regex is illegal here");
13811             }
13812             Set_Node_Offset(REGNODE_p(ret), parse_start);
13813             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13814             nextchar(pRExC_state);
13815             break;
13816         case 'N':
13817             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13818              * \N{...} evaluates to a sequence of more than one code points).
13819              * The function call below returns a regnode, which is our result.
13820              * The parameters cause it to fail if the \N{} evaluates to a
13821              * single code point; we handle those like any other literal.  The
13822              * reason that the multicharacter case is handled here and not as
13823              * part of the EXACtish code is because of quantifiers.  In
13824              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13825              * this way makes that Just Happen. dmq.
13826              * join_exact() will join this up with adjacent EXACTish nodes
13827              * later on, if appropriate. */
13828             ++RExC_parse;
13829             if (grok_bslash_N(pRExC_state,
13830                               &ret,     /* Want a regnode returned */
13831                               NULL,     /* Fail if evaluates to a single code
13832                                            point */
13833                               NULL,     /* Don't need a count of how many code
13834                                            points */
13835                               flagp,
13836                               RExC_strict,
13837                               depth)
13838             ) {
13839                 break;
13840             }
13841
13842             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13843
13844             /* Here, evaluates to a single code point.  Go get that */
13845             RExC_parse = parse_start;
13846             goto defchar;
13847
13848         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13849       parse_named_seq:
13850         {
13851             char ch;
13852             if (   RExC_parse >= RExC_end - 1
13853                 || ((   ch = RExC_parse[1]) != '<'
13854                                       && ch != '\''
13855                                       && ch != '{'))
13856             {
13857                 RExC_parse++;
13858                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13859                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13860             } else {
13861                 RExC_parse += 2;
13862                 ret = handle_named_backref(pRExC_state,
13863                                            flagp,
13864                                            parse_start,
13865                                            (ch == '<')
13866                                            ? '>'
13867                                            : (ch == '{')
13868                                              ? '}'
13869                                              : '\'');
13870             }
13871             break;
13872         }
13873         case 'g':
13874         case '1': case '2': case '3': case '4':
13875         case '5': case '6': case '7': case '8': case '9':
13876             {
13877                 I32 num;
13878                 bool hasbrace = 0;
13879
13880                 if (*RExC_parse == 'g') {
13881                     bool isrel = 0;
13882
13883                     RExC_parse++;
13884                     if (*RExC_parse == '{') {
13885                         RExC_parse++;
13886                         hasbrace = 1;
13887                     }
13888                     if (*RExC_parse == '-') {
13889                         RExC_parse++;
13890                         isrel = 1;
13891                     }
13892                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13893                         if (isrel) RExC_parse--;
13894                         RExC_parse -= 2;
13895                         goto parse_named_seq;
13896                     }
13897
13898                     if (RExC_parse >= RExC_end) {
13899                         goto unterminated_g;
13900                     }
13901                     num = S_backref_value(RExC_parse, RExC_end);
13902                     if (num == 0)
13903                         vFAIL("Reference to invalid group 0");
13904                     else if (num == I32_MAX) {
13905                          if (isDIGIT(*RExC_parse))
13906                             vFAIL("Reference to nonexistent group");
13907                         else
13908                           unterminated_g:
13909                             vFAIL("Unterminated \\g... pattern");
13910                     }
13911
13912                     if (isrel) {
13913                         num = RExC_npar - num;
13914                         if (num < 1)
13915                             vFAIL("Reference to nonexistent or unclosed group");
13916                     }
13917                 }
13918                 else {
13919                     num = S_backref_value(RExC_parse, RExC_end);
13920                     /* bare \NNN might be backref or octal - if it is larger
13921                      * than or equal RExC_npar then it is assumed to be an
13922                      * octal escape. Note RExC_npar is +1 from the actual
13923                      * number of parens. */
13924                     /* Note we do NOT check if num == I32_MAX here, as that is
13925                      * handled by the RExC_npar check */
13926
13927                     if (
13928                         /* any numeric escape < 10 is always a backref */
13929                         num > 9
13930                         /* any numeric escape < RExC_npar is a backref */
13931                         && num >= RExC_npar
13932                         /* cannot be an octal escape if it starts with [89] */
13933                         && ! inRANGE(*RExC_parse, '8', '9')
13934                     ) {
13935                         /* Probably not meant to be a backref, instead likely
13936                          * to be an octal character escape, e.g. \35 or \777.
13937                          * The above logic should make it obvious why using
13938                          * octal escapes in patterns is problematic. - Yves */
13939                         RExC_parse = parse_start;
13940                         goto defchar;
13941                     }
13942                 }
13943
13944                 /* At this point RExC_parse points at a numeric escape like
13945                  * \12 or \88 or something similar, which we should NOT treat
13946                  * as an octal escape. It may or may not be a valid backref
13947                  * escape. For instance \88888888 is unlikely to be a valid
13948                  * backref. */
13949                 while (isDIGIT(*RExC_parse))
13950                     RExC_parse++;
13951                 if (hasbrace) {
13952                     if (*RExC_parse != '}')
13953                         vFAIL("Unterminated \\g{...} pattern");
13954                     RExC_parse++;
13955                 }
13956                 if (num >= (I32)RExC_npar) {
13957
13958                     /* It might be a forward reference; we can't fail until we
13959                      * know, by completing the parse to get all the groups, and
13960                      * then reparsing */
13961                     if (ALL_PARENS_COUNTED)  {
13962                         if (num >= RExC_total_parens)  {
13963                             vFAIL("Reference to nonexistent group");
13964                         }
13965                     }
13966                     else {
13967                         REQUIRE_PARENS_PASS;
13968                     }
13969                 }
13970                 RExC_sawback = 1;
13971                 ret = reganode(pRExC_state,
13972                                ((! FOLD)
13973                                  ? REF
13974                                  : (ASCII_FOLD_RESTRICTED)
13975                                    ? REFFA
13976                                    : (AT_LEAST_UNI_SEMANTICS)
13977                                      ? REFFU
13978                                      : (LOC)
13979                                        ? REFFL
13980                                        : REFF),
13981                                 num);
13982                 if (OP(REGNODE_p(ret)) == REFF) {
13983                     RExC_seen_d_op = TRUE;
13984                 }
13985                 *flagp |= HASWIDTH;
13986
13987                 /* override incorrect value set in reganode MJD */
13988                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13989                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13990                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13991                                         FALSE /* Don't force to /x */ );
13992             }
13993             break;
13994         case '\0':
13995             if (RExC_parse >= RExC_end)
13996                 FAIL("Trailing \\");
13997             /* FALLTHROUGH */
13998         default:
13999             /* Do not generate "unrecognized" warnings here, we fall
14000                back into the quick-grab loop below */
14001             RExC_parse = parse_start;
14002             goto defchar;
14003         } /* end of switch on a \foo sequence */
14004         break;
14005
14006     case '#':
14007
14008         /* '#' comments should have been spaced over before this function was
14009          * called */
14010         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14011         /*
14012         if (RExC_flags & RXf_PMf_EXTENDED) {
14013             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14014             if (RExC_parse < RExC_end)
14015                 goto tryagain;
14016         }
14017         */
14018
14019         /* FALLTHROUGH */
14020
14021     default:
14022           defchar: {
14023
14024             /* Here, we have determined that the next thing is probably a
14025              * literal character.  RExC_parse points to the first byte of its
14026              * definition.  (It still may be an escape sequence that evaluates
14027              * to a single character) */
14028
14029             STRLEN len = 0;
14030             UV ender = 0;
14031             char *p;
14032             char *s, *old_s = NULL, *old_old_s = NULL;
14033             char *s0;
14034             U32 max_string_len = 255;
14035
14036             /* We may have to reparse the node, artificially stopping filling
14037              * it early, based on info gleaned in the first parse.  This
14038              * variable gives where we stop.  Make it above the normal stopping
14039              * place first time through; otherwise it would stop too early */
14040             U32 upper_fill = max_string_len + 1;
14041
14042             /* We start out as an EXACT node, even if under /i, until we find a
14043              * character which is in a fold.  The algorithm now segregates into
14044              * separate nodes, characters that fold from those that don't under
14045              * /i.  (This hopefully will create nodes that are fixed strings
14046              * even under /i, giving the optimizer something to grab on to.)
14047              * So, if a node has something in it and the next character is in
14048              * the opposite category, that node is closed up, and the function
14049              * returns.  Then regatom is called again, and a new node is
14050              * created for the new category. */
14051             U8 node_type = EXACT;
14052
14053             /* Assume the node will be fully used; the excess is given back at
14054              * the end.  Under /i, we may need to temporarily add the fold of
14055              * an extra character or two at the end to check for splitting
14056              * multi-char folds, so allocate extra space for that.   We can't
14057              * make any other length assumptions, as a byte input sequence
14058              * could shrink down. */
14059             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14060                                                  + ((! FOLD)
14061                                                     ? 0
14062                                                     : 2 * ((UTF)
14063                                                            ? UTF8_MAXBYTES_CASE
14064                         /* Max non-UTF-8 expansion is 2 */ : 2)));
14065
14066             bool next_is_quantifier;
14067             char * oldp = NULL;
14068
14069             /* We can convert EXACTF nodes to EXACTFU if they contain only
14070              * characters that match identically regardless of the target
14071              * string's UTF8ness.  The reason to do this is that EXACTF is not
14072              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14073              * runtime.
14074              *
14075              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14076              * contain only above-Latin1 characters (hence must be in UTF8),
14077              * which don't participate in folds with Latin1-range characters,
14078              * as the latter's folds aren't known until runtime. */
14079             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14080
14081             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14082              * allows us to override this as encountered */
14083             U8 maybe_SIMPLE = SIMPLE;
14084
14085             /* Does this node contain something that can't match unless the
14086              * target string is (also) in UTF-8 */
14087             bool requires_utf8_target = FALSE;
14088
14089             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14090             bool has_ss = FALSE;
14091
14092             /* So is the MICRO SIGN */
14093             bool has_micro_sign = FALSE;
14094
14095             /* Set when we fill up the current node and there is still more
14096              * text to process */
14097             bool overflowed;
14098
14099             /* Allocate an EXACT node.  The node_type may change below to
14100              * another EXACTish node, but since the size of the node doesn't
14101              * change, it works */
14102             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14103                                                                     "exact");
14104             FILL_NODE(ret, node_type);
14105             RExC_emit++;
14106
14107             s = STRING(REGNODE_p(ret));
14108
14109             s0 = s;
14110
14111           reparse:
14112
14113             p = RExC_parse;
14114             len = 0;
14115             s = s0;
14116             node_type = EXACT;
14117             oldp = NULL;
14118             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14119             maybe_SIMPLE = SIMPLE;
14120             requires_utf8_target = FALSE;
14121             has_ss = FALSE;
14122             has_micro_sign = FALSE;
14123
14124           continue_parse:
14125
14126             /* This breaks under rare circumstances.  If folding, we do not
14127              * want to split a node at a character that is a non-final in a
14128              * multi-char fold, as an input string could just happen to want to
14129              * match across the node boundary.  The code at the end of the loop
14130              * looks for this, and backs off until it finds not such a
14131              * character, but it is possible (though extremely, extremely
14132              * unlikely) for all characters in the node to be non-final fold
14133              * ones, in which case we just leave the node fully filled, and
14134              * hope that it doesn't match the string in just the wrong place */
14135
14136             assert( ! UTF     /* Is at the beginning of a character */
14137                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14138                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14139
14140             overflowed = FALSE;
14141
14142             /* Here, we have a literal character.  Find the maximal string of
14143              * them in the input that we can fit into a single EXACTish node.
14144              * We quit at the first non-literal or when the node gets full, or
14145              * under /i the categorization of folding/non-folding character
14146              * changes */
14147             while (p < RExC_end && len < upper_fill) {
14148
14149                 /* In most cases each iteration adds one byte to the output.
14150                  * The exceptions override this */
14151                 Size_t added_len = 1;
14152
14153                 oldp = p;
14154                 old_old_s = old_s;
14155                 old_s = s;
14156
14157                 /* White space has already been ignored */
14158                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14159                        || ! is_PATWS_safe((p), RExC_end, UTF));
14160
14161                 switch ((U8)*p) {
14162                   const char* message;
14163                   U32 packed_warn;
14164                   U8 grok_c_char;
14165
14166                 case '^':
14167                 case '$':
14168                 case '.':
14169                 case '[':
14170                 case '(':
14171                 case ')':
14172                 case '|':
14173                     goto loopdone;
14174                 case '\\':
14175                     /* Literal Escapes Switch
14176
14177                        This switch is meant to handle escape sequences that
14178                        resolve to a literal character.
14179
14180                        Every escape sequence that represents something
14181                        else, like an assertion or a char class, is handled
14182                        in the switch marked 'Special Escapes' above in this
14183                        routine, but also has an entry here as anything that
14184                        isn't explicitly mentioned here will be treated as
14185                        an unescaped equivalent literal.
14186                     */
14187
14188                     switch ((U8)*++p) {
14189
14190                     /* These are all the special escapes. */
14191                     case 'A':             /* Start assertion */
14192                     case 'b': case 'B':   /* Word-boundary assertion*/
14193                     case 'C':             /* Single char !DANGEROUS! */
14194                     case 'd': case 'D':   /* digit class */
14195                     case 'g': case 'G':   /* generic-backref, pos assertion */
14196                     case 'h': case 'H':   /* HORIZWS */
14197                     case 'k': case 'K':   /* named backref, keep marker */
14198                     case 'p': case 'P':   /* Unicode property */
14199                               case 'R':   /* LNBREAK */
14200                     case 's': case 'S':   /* space class */
14201                     case 'v': case 'V':   /* VERTWS */
14202                     case 'w': case 'W':   /* word class */
14203                     case 'X':             /* eXtended Unicode "combining
14204                                              character sequence" */
14205                     case 'z': case 'Z':   /* End of line/string assertion */
14206                         --p;
14207                         goto loopdone;
14208
14209                     /* Anything after here is an escape that resolves to a
14210                        literal. (Except digits, which may or may not)
14211                      */
14212                     case 'n':
14213                         ender = '\n';
14214                         p++;
14215                         break;
14216                     case 'N': /* Handle a single-code point named character. */
14217                         RExC_parse = p + 1;
14218                         if (! grok_bslash_N(pRExC_state,
14219                                             NULL,   /* Fail if evaluates to
14220                                                        anything other than a
14221                                                        single code point */
14222                                             &ender, /* The returned single code
14223                                                        point */
14224                                             NULL,   /* Don't need a count of
14225                                                        how many code points */
14226                                             flagp,
14227                                             RExC_strict,
14228                                             depth)
14229                         ) {
14230                             if (*flagp & NEED_UTF8)
14231                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14232                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14233
14234                             /* Here, it wasn't a single code point.  Go close
14235                              * up this EXACTish node.  The switch() prior to
14236                              * this switch handles the other cases */
14237                             RExC_parse = p = oldp;
14238                             goto loopdone;
14239                         }
14240                         p = RExC_parse;
14241                         RExC_parse = parse_start;
14242
14243                         /* The \N{} means the pattern, if previously /d,
14244                          * becomes /u.  That means it can't be an EXACTF node,
14245                          * but an EXACTFU */
14246                         if (node_type == EXACTF) {
14247                             node_type = EXACTFU;
14248
14249                             /* If the node already contains something that
14250                              * differs between EXACTF and EXACTFU, reparse it
14251                              * as EXACTFU */
14252                             if (! maybe_exactfu) {
14253                                 len = 0;
14254                                 s = s0;
14255                                 goto reparse;
14256                             }
14257                         }
14258
14259                         break;
14260                     case 'r':
14261                         ender = '\r';
14262                         p++;
14263                         break;
14264                     case 't':
14265                         ender = '\t';
14266                         p++;
14267                         break;
14268                     case 'f':
14269                         ender = '\f';
14270                         p++;
14271                         break;
14272                     case 'e':
14273                         ender = ESC_NATIVE;
14274                         p++;
14275                         break;
14276                     case 'a':
14277                         ender = '\a';
14278                         p++;
14279                         break;
14280                     case 'o':
14281                         if (! grok_bslash_o(&p,
14282                                             RExC_end,
14283                                             &ender,
14284                                             &message,
14285                                             &packed_warn,
14286                                             (bool) RExC_strict,
14287                                             FALSE, /* No illegal cp's */
14288                                             UTF))
14289                         {
14290                             RExC_parse = p; /* going to die anyway; point to
14291                                                exact spot of failure */
14292                             vFAIL(message);
14293                         }
14294
14295                         if (message && TO_OUTPUT_WARNINGS(p)) {
14296                             warn_non_literal_string(p, packed_warn, message);
14297                         }
14298                         break;
14299                     case 'x':
14300                         if (! grok_bslash_x(&p,
14301                                             RExC_end,
14302                                             &ender,
14303                                             &message,
14304                                             &packed_warn,
14305                                             (bool) RExC_strict,
14306                                             FALSE, /* No illegal cp's */
14307                                             UTF))
14308                         {
14309                             RExC_parse = p;     /* going to die anyway; point
14310                                                    to exact spot of failure */
14311                             vFAIL(message);
14312                         }
14313
14314                         if (message && TO_OUTPUT_WARNINGS(p)) {
14315                             warn_non_literal_string(p, packed_warn, message);
14316                         }
14317
14318 #ifdef EBCDIC
14319                         if (ender < 0x100) {
14320                             if (RExC_recode_x_to_native) {
14321                                 ender = LATIN1_TO_NATIVE(ender);
14322                             }
14323                         }
14324 #endif
14325                         break;
14326                     case 'c':
14327                         p++;
14328                         if (! grok_bslash_c(*p, &grok_c_char,
14329                                             &message, &packed_warn))
14330                         {
14331                             /* going to die anyway; point to exact spot of
14332                              * failure */
14333                             RExC_parse = p + ((UTF)
14334                                               ? UTF8_SAFE_SKIP(p, RExC_end)
14335                                               : 1);
14336                             vFAIL(message);
14337                         }
14338
14339                         ender = grok_c_char;
14340                         p++;
14341                         if (message && TO_OUTPUT_WARNINGS(p)) {
14342                             warn_non_literal_string(p, packed_warn, message);
14343                         }
14344
14345                         break;
14346                     case '8': case '9': /* must be a backreference */
14347                         --p;
14348                         /* we have an escape like \8 which cannot be an octal escape
14349                          * so we exit the loop, and let the outer loop handle this
14350                          * escape which may or may not be a legitimate backref. */
14351                         goto loopdone;
14352                     case '1': case '2': case '3':case '4':
14353                     case '5': case '6': case '7':
14354                         /* When we parse backslash escapes there is ambiguity
14355                          * between backreferences and octal escapes. Any escape
14356                          * from \1 - \9 is a backreference, any multi-digit
14357                          * escape which does not start with 0 and which when
14358                          * evaluated as decimal could refer to an already
14359                          * parsed capture buffer is a back reference. Anything
14360                          * else is octal.
14361                          *
14362                          * Note this implies that \118 could be interpreted as
14363                          * 118 OR as "\11" . "8" depending on whether there
14364                          * were 118 capture buffers defined already in the
14365                          * pattern.  */
14366
14367                         /* NOTE, RExC_npar is 1 more than the actual number of
14368                          * parens we have seen so far, hence the "<" as opposed
14369                          * to "<=" */
14370                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14371                         {  /* Not to be treated as an octal constant, go
14372                                    find backref */
14373                             --p;
14374                             goto loopdone;
14375                         }
14376                         /* FALLTHROUGH */
14377                     case '0':
14378                         {
14379                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14380                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
14381                             STRLEN numlen = 3;
14382                             ender = grok_oct(p, &numlen, &flags, NULL);
14383                             p += numlen;
14384                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14385                                 && isDIGIT(*p)  /* like \08, \178 */
14386                                 && ckWARN(WARN_REGEXP))
14387                             {
14388                                 reg_warn_non_literal_string(
14389                                      p + 1,
14390                                      form_alien_digit_msg(8, numlen, p,
14391                                                         RExC_end, UTF, FALSE));
14392                             }
14393                         }
14394                         break;
14395                     case '\0':
14396                         if (p >= RExC_end)
14397                             FAIL("Trailing \\");
14398                         /* FALLTHROUGH */
14399                     default:
14400                         if (isALPHANUMERIC(*p)) {
14401                             /* An alpha followed by '{' is going to fail next
14402                              * iteration, so don't output this warning in that
14403                              * case */
14404                             if (! isALPHA(*p) || *(p + 1) != '{') {
14405                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14406                                                   " passed through", p);
14407                             }
14408                         }
14409                         goto normal_default;
14410                     } /* End of switch on '\' */
14411                     break;
14412                 case '{':
14413                     /* Trying to gain new uses for '{' without breaking too
14414                      * much existing code is hard.  The solution currently
14415                      * adopted is:
14416                      *  1)  If there is no ambiguity that a '{' should always
14417                      *      be taken literally, at the start of a construct, we
14418                      *      just do so.
14419                      *  2)  If the literal '{' conflicts with our desired use
14420                      *      of it as a metacharacter, we die.  The deprecation
14421                      *      cycles for this have come and gone.
14422                      *  3)  If there is ambiguity, we raise a simple warning.
14423                      *      This could happen, for example, if the user
14424                      *      intended it to introduce a quantifier, but slightly
14425                      *      misspelled the quantifier.  Without this warning,
14426                      *      the quantifier would silently be taken as a literal
14427                      *      string of characters instead of a meta construct */
14428                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14429                         if (      RExC_strict
14430                             || (  p > parse_start + 1
14431                                 && isALPHA_A(*(p - 1))
14432                                 && *(p - 2) == '\\')
14433                             || new_regcurly(p, RExC_end))
14434                         {
14435                             RExC_parse = p + 1;
14436                             vFAIL("Unescaped left brace in regex is "
14437                                   "illegal here");
14438                         }
14439                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14440                                          " passed through");
14441                     }
14442                     goto normal_default;
14443                 case '}':
14444                 case ']':
14445                     if (p > RExC_parse && RExC_strict) {
14446                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14447                     }
14448                     /*FALLTHROUGH*/
14449                 default:    /* A literal character */
14450                   normal_default:
14451                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14452                         STRLEN numlen;
14453                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14454                                                &numlen, UTF8_ALLOW_DEFAULT);
14455                         p += numlen;
14456                     }
14457                     else
14458                         ender = (U8) *p++;
14459                     break;
14460                 } /* End of switch on the literal */
14461
14462                 /* Here, have looked at the literal character, and <ender>
14463                  * contains its ordinal; <p> points to the character after it.
14464                  * */
14465
14466                 if (ender > 255) {
14467                     REQUIRE_UTF8(flagp);
14468                     if (   UNICODE_IS_PERL_EXTENDED(ender)
14469                         && TO_OUTPUT_WARNINGS(p))
14470                     {
14471                         ckWARN2_non_literal_string(p,
14472                                                    packWARN(WARN_PORTABLE),
14473                                                    PL_extended_cp_format,
14474                                                    ender);
14475                     }
14476                 }
14477
14478                 /* We need to check if the next non-ignored thing is a
14479                  * quantifier.  Move <p> to after anything that should be
14480                  * ignored, which, as a side effect, positions <p> for the next
14481                  * loop iteration */
14482                 skip_to_be_ignored_text(pRExC_state, &p,
14483                                         FALSE /* Don't force to /x */ );
14484
14485                 /* If the next thing is a quantifier, it applies to this
14486                  * character only, which means that this character has to be in
14487                  * its own node and can't just be appended to the string in an
14488                  * existing node, so if there are already other characters in
14489                  * the node, close the node with just them, and set up to do
14490                  * this character again next time through, when it will be the
14491                  * only thing in its new node */
14492
14493                 next_is_quantifier =    LIKELY(p < RExC_end)
14494                                      && UNLIKELY(ISMULT2(p));
14495
14496                 if (next_is_quantifier && LIKELY(len)) {
14497                     p = oldp;
14498                     goto loopdone;
14499                 }
14500
14501                 /* Ready to add 'ender' to the node */
14502
14503                 if (! FOLD) {  /* The simple case, just append the literal */
14504                   not_fold_common:
14505
14506                     /* Don't output if it would overflow */
14507                     if (UNLIKELY(len > max_string_len - ((UTF)
14508                                                       ? UVCHR_SKIP(ender)
14509                                                       : 1)))
14510                     {
14511                         overflowed = TRUE;
14512                         break;
14513                     }
14514
14515                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14516                         *(s++) = (char) ender;
14517                     }
14518                     else {
14519                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14520                         added_len = (char *) new_s - s;
14521                         s = (char *) new_s;
14522
14523                         if (ender > 255)  {
14524                             requires_utf8_target = TRUE;
14525                         }
14526                     }
14527                 }
14528                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14529
14530                     /* Here are folding under /l, and the code point is
14531                      * problematic.  If this is the first character in the
14532                      * node, change the node type to folding.   Otherwise, if
14533                      * this is the first problematic character, close up the
14534                      * existing node, so can start a new node with this one */
14535                     if (! len) {
14536                         node_type = EXACTFL;
14537                         RExC_contains_locale = 1;
14538                     }
14539                     else if (node_type == EXACT) {
14540                         p = oldp;
14541                         goto loopdone;
14542                     }
14543
14544                     /* This problematic code point means we can't simplify
14545                      * things */
14546                     maybe_exactfu = FALSE;
14547
14548                     /* Here, we are adding a problematic fold character.
14549                      * "Problematic" in this context means that its fold isn't
14550                      * known until runtime.  (The non-problematic code points
14551                      * are the above-Latin1 ones that fold to also all
14552                      * above-Latin1.  Their folds don't vary no matter what the
14553                      * locale is.) But here we have characters whose fold
14554                      * depends on the locale.  We just add in the unfolded
14555                      * character, and wait until runtime to fold it */
14556                     goto not_fold_common;
14557                 }
14558                 else /* regular fold; see if actually is in a fold */
14559                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14560                          || (ender > 255
14561                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14562                 {
14563                     /* Here, folding, but the character isn't in a fold.
14564                      *
14565                      * Start a new node if previous characters in the node were
14566                      * folded */
14567                     if (len && node_type != EXACT) {
14568                         p = oldp;
14569                         goto loopdone;
14570                     }
14571
14572                     /* Here, continuing a node with non-folded characters.  Add
14573                      * this one */
14574                     goto not_fold_common;
14575                 }
14576                 else {  /* Here, does participate in some fold */
14577
14578                     /* If this is the first character in the node, change its
14579                      * type to folding.  Otherwise, if this is the first
14580                      * folding character in the node, close up the existing
14581                      * node, so can start a new node with this one.  */
14582                     if (! len) {
14583                         node_type = compute_EXACTish(pRExC_state);
14584                     }
14585                     else if (node_type == EXACT) {
14586                         p = oldp;
14587                         goto loopdone;
14588                     }
14589
14590                     if (UTF) {  /* Alway use the folded value for UTF-8
14591                                    patterns */
14592                         if (UVCHR_IS_INVARIANT(ender)) {
14593                             if (UNLIKELY(len + 1 > max_string_len)) {
14594                                 overflowed = TRUE;
14595                                 break;
14596                             }
14597
14598                             *(s)++ = (U8) toFOLD(ender);
14599                         }
14600                         else {
14601                             UV folded = _to_uni_fold_flags(
14602                                     ender,
14603                                     (U8 *) s,  /* We have allocated extra space
14604                                                   in 's' so can't run off the
14605                                                   end */
14606                                     &added_len,
14607                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14608                                                     ? FOLD_FLAGS_NOMIX_ASCII
14609                                                     : 0));
14610                             if (UNLIKELY(len + added_len > max_string_len)) {
14611                                 overflowed = TRUE;
14612                                 break;
14613                             }
14614
14615                             s += added_len;
14616
14617                             if (   folded > 255
14618                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14619                             {
14620                                 /* U+B5 folds to the MU, so its possible for a
14621                                  * non-UTF-8 target to match it */
14622                                 requires_utf8_target = TRUE;
14623                             }
14624                         }
14625                     }
14626                     else { /* Here is non-UTF8. */
14627
14628                         /* The fold will be one or (rarely) two characters.
14629                          * Check that there's room for at least a single one
14630                          * before setting any flags, etc.  Because otherwise an
14631                          * overflowing character could cause a flag to be set
14632                          * even though it doesn't end up in this node.  (For
14633                          * the two character fold, we check again, before
14634                          * setting any flags) */
14635                         if (UNLIKELY(len + 1 > max_string_len)) {
14636                             overflowed = TRUE;
14637                             break;
14638                         }
14639
14640 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14641    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14642                                       || UNICODE_DOT_DOT_VERSION > 0)
14643
14644                         /* On non-ancient Unicodes, check for the only possible
14645                          * multi-char fold  */
14646                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14647
14648                             /* This potential multi-char fold means the node
14649                              * can't be simple (because it could match more
14650                              * than a single char).  And in some cases it will
14651                              * match 'ss', so set that flag */
14652                             maybe_SIMPLE = 0;
14653                             has_ss = TRUE;
14654
14655                             /* It can't change to be an EXACTFU (unless already
14656                              * is one).  We fold it iff under /u rules. */
14657                             if (node_type != EXACTFU) {
14658                                 maybe_exactfu = FALSE;
14659                             }
14660                             else {
14661                                 if (UNLIKELY(len + 2 > max_string_len)) {
14662                                     overflowed = TRUE;
14663                                     break;
14664                                 }
14665
14666                                 *(s++) = 's';
14667                                 *(s++) = 's';
14668                                 added_len = 2;
14669
14670                                 goto done_with_this_char;
14671                             }
14672                         }
14673                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14674                                  && LIKELY(len > 0)
14675                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14676                         {
14677                             /* Also, the sequence 'ss' is special when not
14678                              * under /u.  If the target string is UTF-8, it
14679                              * should match SHARP S; otherwise it won't.  So,
14680                              * here we have to exclude the possibility of this
14681                              * node moving to /u.*/
14682                             has_ss = TRUE;
14683                             maybe_exactfu = FALSE;
14684                         }
14685 #endif
14686                         /* Here, the fold will be a single character */
14687
14688                         if (UNLIKELY(ender == MICRO_SIGN)) {
14689                             has_micro_sign = TRUE;
14690                         }
14691                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14692
14693                             /* If the character's fold differs between /d and
14694                              * /u, this can't change to be an EXACTFU node */
14695                             maybe_exactfu = FALSE;
14696                         }
14697
14698                         *(s++) = (DEPENDS_SEMANTICS)
14699                                  ? (char) toFOLD(ender)
14700
14701                                    /* Under /u, the fold of any character in
14702                                     * the 0-255 range happens to be its
14703                                     * lowercase equivalent, except for LATIN
14704                                     * SMALL LETTER SHARP S, which was handled
14705                                     * above, and the MICRO SIGN, whose fold
14706                                     * requires UTF-8 to represent.  */
14707                                  : (char) toLOWER_L1(ender);
14708                     }
14709                 } /* End of adding current character to the node */
14710
14711               done_with_this_char:
14712
14713                 len += added_len;
14714
14715                 if (next_is_quantifier) {
14716
14717                     /* Here, the next input is a quantifier, and to get here,
14718                      * the current character is the only one in the node. */
14719                     goto loopdone;
14720                 }
14721
14722             } /* End of loop through literal characters */
14723
14724             /* Here we have either exhausted the input or run out of room in
14725              * the node.  If the former, we are done.  (If we encountered a
14726              * character that can't be in the node, transfer is made directly
14727              * to <loopdone>, and so we wouldn't have fallen off the end of the
14728              * loop.)  */
14729             if (LIKELY(! overflowed)) {
14730                 goto loopdone;
14731             }
14732
14733             /* Here we have run out of room.  We can grow plain EXACT and
14734              * LEXACT nodes.  If the pattern is gigantic enough, though,
14735              * eventually we'll have to artificially chunk the pattern into
14736              * multiple nodes. */
14737             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14738                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14739                 Size_t overhead_expansion = 0;
14740                 char temp[256];
14741                 Size_t max_nodes_for_string;
14742                 Size_t achievable;
14743                 SSize_t delta;
14744
14745                 /* Here we couldn't fit the final character in the current
14746                  * node, so it will have to be reparsed, no matter what else we
14747                  * do */
14748                 p = oldp;
14749
14750                 /* If would have overflowed a regular EXACT node, switch
14751                  * instead to an LEXACT.  The code below is structured so that
14752                  * the actual growing code is common to changing from an EXACT
14753                  * or just increasing the LEXACT size.  This means that we have
14754                  * to save the string in the EXACT case before growing, and
14755                  * then copy it afterwards to its new location */
14756                 if (node_type == EXACT) {
14757                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14758                     RExC_emit += overhead_expansion;
14759                     Copy(s0, temp, len, char);
14760                 }
14761
14762                 /* Ready to grow.  If it was a plain EXACT, the string was
14763                  * saved, and the first few bytes of it overwritten by adding
14764                  * an argument field.  We assume, as we do elsewhere in this
14765                  * file, that one byte of remaining input will translate into
14766                  * one byte of output, and if that's too small, we grow again,
14767                  * if too large the excess memory is freed at the end */
14768
14769                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14770                 achievable = MIN(max_nodes_for_string,
14771                                  current_string_nodes + STR_SZ(RExC_end - p));
14772                 delta = achievable - current_string_nodes;
14773
14774                 /* If there is just no more room, go finish up this chunk of
14775                  * the pattern. */
14776                 if (delta <= 0) {
14777                     goto loopdone;
14778                 }
14779
14780                 change_engine_size(pRExC_state, delta + overhead_expansion);
14781                 current_string_nodes += delta;
14782                 max_string_len
14783                            = sizeof(struct regnode) * current_string_nodes;
14784                 upper_fill = max_string_len + 1;
14785
14786                 /* If the length was small, we know this was originally an
14787                  * EXACT node now converted to LEXACT, and the string has to be
14788                  * restored.  Otherwise the string was untouched.  260 is just
14789                  * a number safely above 255 so don't have to worry about
14790                  * getting it precise */
14791                 if (len < 260) {
14792                     node_type = LEXACT;
14793                     FILL_NODE(ret, node_type);
14794                     s0 = STRING(REGNODE_p(ret));
14795                     Copy(temp, s0, len, char);
14796                     s = s0 + len;
14797                 }
14798
14799                 goto continue_parse;
14800             }
14801             else if (FOLD) {
14802                 bool splittable = FALSE;
14803                 bool backed_up = FALSE;
14804                 char * e;       /* should this be U8? */
14805                 char * s_start; /* should this be U8? */
14806
14807                 /* Here is /i.  Running out of room creates a problem if we are
14808                  * folding, and the split happens in the middle of a
14809                  * multi-character fold, as a match that should have occurred,
14810                  * won't, due to the way nodes are matched, and our artificial
14811                  * boundary.  So back off until we aren't splitting such a
14812                  * fold.  If there is no such place to back off to, we end up
14813                  * taking the entire node as-is.  This can happen if the node
14814                  * consists entirely of 'f' or entirely of 's' characters (or
14815                  * things that fold to them) as 'ff' and 'ss' are
14816                  * multi-character folds.
14817                  *
14818                  * The Unicode standard says that multi character folds consist
14819                  * of either two or three characters.  That means we would be
14820                  * splitting one if the final character in the node is at the
14821                  * beginning of either type, or is the second of a three
14822                  * character fold.
14823                  *
14824                  * At this point:
14825                  *  ender     is the code point of the character that won't fit
14826                  *            in the node
14827                  *  s         points to just beyond the final byte in the node.
14828                  *            It's where we would place ender if there were
14829                  *            room, and where in fact we do place ender's fold
14830                  *            in the code below, as we've over-allocated space
14831                  *            for s0 (hence s) to allow for this
14832                  *  e         starts at 's' and advances as we append things.
14833                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14834                  *            have been advanced to beyond it).
14835                  *  old_old_s points to the beginning byte of the final
14836                  *            character in the node
14837                  *  p         points to the beginning byte in the input of the
14838                  *            character beyond 'ender'.
14839                  *  oldp      points to the beginning byte in the input of
14840                  *            'ender'.
14841                  *
14842                  * In the case of /il, we haven't folded anything that could be
14843                  * affected by the locale.  That means only above-Latin1
14844                  * characters that fold to other above-latin1 characters get
14845                  * folded at compile time.  To check where a good place to
14846                  * split nodes is, everything in it will have to be folded.
14847                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14848                  * any unfolded characters in the node. */
14849                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14850
14851                 /* If we do need to fold the node, we need a place to store the
14852                  * folded copy, and a way to map back to the unfolded original
14853                  * */
14854                 char * locfold_buf = NULL;
14855                 Size_t * loc_correspondence = NULL;
14856
14857                 if (! need_to_fold_loc) {   /* The normal case.  Just
14858                                                initialize to the actual node */
14859                     e = s;
14860                     s_start = s0;
14861                     s = old_old_s;  /* Point to the beginning of the final char
14862                                        that fits in the node */
14863                 }
14864                 else {
14865
14866                     /* Here, we have filled a /il node, and there are unfolded
14867                      * characters in it.  If the runtime locale turns out to be
14868                      * UTF-8, there are possible multi-character folds, just
14869                      * like when not under /l.  The node hence can't terminate
14870                      * in the middle of such a fold.  To determine this, we
14871                      * have to create a folded copy of this node.  That means
14872                      * reparsing the node, folding everything assuming a UTF-8
14873                      * locale.  (If at runtime it isn't such a locale, the
14874                      * actions here wouldn't have been necessary, but we have
14875                      * to assume the worst case.)  If we find we need to back
14876                      * off the folded string, we do so, and then map that
14877                      * position back to the original unfolded node, which then
14878                      * gets output, truncated at that spot */
14879
14880                     char * redo_p = RExC_parse;
14881                     char * redo_e;
14882                     char * old_redo_e;
14883
14884                     /* Allow enough space assuming a single byte input folds to
14885                      * a single byte output, plus assume that the two unparsed
14886                      * characters (that we may need) fold to the largest number
14887                      * of bytes possible, plus extra for one more worst case
14888                      * scenario.  In the loop below, if we start eating into
14889                      * that final spare space, we enlarge this initial space */
14890                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14891
14892                     Newxz(locfold_buf, size, char);
14893                     Newxz(loc_correspondence, size, Size_t);
14894
14895                     /* Redo this node's parse, folding into 'locfold_buf' */
14896                     redo_p = RExC_parse;
14897                     old_redo_e = redo_e = locfold_buf;
14898                     while (redo_p <= oldp) {
14899
14900                         old_redo_e = redo_e;
14901                         loc_correspondence[redo_e - locfold_buf]
14902                                                         = redo_p - RExC_parse;
14903
14904                         if (UTF) {
14905                             Size_t added_len;
14906
14907                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14908                                                        (U8 *) RExC_end,
14909                                                        (U8 *) redo_e,
14910                                                        &added_len,
14911                                                        FOLD_FLAGS_FULL);
14912                             redo_e += added_len;
14913                             redo_p += UTF8SKIP(redo_p);
14914                         }
14915                         else {
14916
14917                             /* Note that if this code is run on some ancient
14918                              * Unicode versions, SHARP S doesn't fold to 'ss',
14919                              * but rather than clutter the code with #ifdef's,
14920                              * as is done above, we ignore that possibility.
14921                              * This is ok because this code doesn't affect what
14922                              * gets matched, but merely where the node gets
14923                              * split */
14924                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14925                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14926                             }
14927                             else {
14928                                 *redo_e++ = 's';
14929                                 *redo_e++ = 's';
14930                             }
14931                             redo_p++;
14932                         }
14933
14934
14935                         /* If we're getting so close to the end that a
14936                          * worst-case fold in the next character would cause us
14937                          * to overflow, increase, assuming one byte output byte
14938                          * per one byte input one, plus room for another worst
14939                          * case fold */
14940                         if (   redo_p <= oldp
14941                             && redo_e > locfold_buf + size
14942                                                     - (UTF8_MAXBYTES_CASE + 1))
14943                         {
14944                             Size_t new_size = size
14945                                             + (oldp - redo_p)
14946                                             + UTF8_MAXBYTES_CASE + 1;
14947                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14948
14949                             Renew(locfold_buf, new_size, char);
14950                             Renew(loc_correspondence, new_size, Size_t);
14951                             size = new_size;
14952
14953                             redo_e = locfold_buf + e_offset;
14954                         }
14955                     }
14956
14957                     /* Set so that things are in terms of the folded, temporary
14958                      * string */
14959                     s = old_redo_e;
14960                     s_start = locfold_buf;
14961                     e = redo_e;
14962
14963                 }
14964
14965                 /* Here, we have 's', 's_start' and 'e' set up to point to the
14966                  * input that goes into the node, folded.
14967                  *
14968                  * If the final character of the node and the fold of ender
14969                  * form the first two characters of a three character fold, we
14970                  * need to peek ahead at the next (unparsed) character in the
14971                  * input to determine if the three actually do form such a
14972                  * fold.  Just looking at that character is not generally
14973                  * sufficient, as it could be, for example, an escape sequence
14974                  * that evaluates to something else, and it needs to be folded.
14975                  *
14976                  * khw originally thought to just go through the parse loop one
14977                  * extra time, but that doesn't work easily as that iteration
14978                  * could cause things to think that the parse is over and to
14979                  * goto loopdone.  The character could be a '$' for example, or
14980                  * the character beyond could be a quantifier, and other
14981                  * glitches as well.
14982                  *
14983                  * The solution used here for peeking ahead is to look at that
14984                  * next character.  If it isn't ASCII punctuation, then it will
14985                  * be something that continues in an EXACTish node if there
14986                  * were space.  We append the fold of it to s, having reserved
14987                  * enough room in s0 for the purpose.  If we can't reasonably
14988                  * peek ahead, we instead assume the worst case: that it is
14989                  * something that would form the completion of a multi-char
14990                  * fold.
14991                  *
14992                  * If we can't split between s and ender, we work backwards
14993                  * character-by-character down to s0.  At each current point
14994                  * see if we are at the beginning of a multi-char fold.  If so,
14995                  * that means we would be splitting the fold across nodes, and
14996                  * so we back up one and try again.
14997                  *
14998                  * If we're not at the beginning, we still could be at the
14999                  * final two characters of a (rare) three character fold.  We
15000                  * check if the sequence starting at the character before the
15001                  * current position (and including the current and next
15002                  * characters) is a three character fold.  If not, the node can
15003                  * be split here.  If it is, we have to backup two characters
15004                  * and try again.
15005                  *
15006                  * Otherwise, the node can be split at the current position.
15007                  *
15008                  * The same logic is used for UTF-8 patterns and not */
15009                 if (UTF) {
15010                     Size_t added_len;
15011
15012                     /* Append the fold of ender */
15013                     (void) _to_uni_fold_flags(
15014                         ender,
15015                         (U8 *) e,
15016                         &added_len,
15017                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15018                                         ? FOLD_FLAGS_NOMIX_ASCII
15019                                         : 0));
15020                     e += added_len;
15021
15022                     /* 's' and the character folded to by ender may be the
15023                      * first two of a three-character fold, in which case the
15024                      * node should not be split here.  That may mean examining
15025                      * the so-far unparsed character starting at 'p'.  But if
15026                      * ender folded to more than one character, we already have
15027                      * three characters to look at.  Also, we first check if
15028                      * the sequence consisting of s and the next character form
15029                      * the first two of some three character fold.  If not,
15030                      * there's no need to peek ahead. */
15031                     if (   added_len <= UTF8SKIP(e - added_len)
15032                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15033                     {
15034                         /* Here, the two do form the beginning of a potential
15035                          * three character fold.  The unexamined character may
15036                          * or may not complete it.  Peek at it.  It might be
15037                          * something that ends the node or an escape sequence,
15038                          * in which case we don't know without a lot of work
15039                          * what it evaluates to, so we have to assume the worst
15040                          * case: that it does complete the fold, and so we
15041                          * can't split here.  All such instances  will have
15042                          * that character be an ASCII punctuation character,
15043                          * like a backslash.  So, for that case, backup one and
15044                          * drop down to try at that position */
15045                         if (isPUNCT(*p)) {
15046                             s = (char *) utf8_hop_back((U8 *) s, -1,
15047                                        (U8 *) s_start);
15048                             backed_up = TRUE;
15049                         }
15050                         else {
15051                             /* Here, since it's not punctuation, it must be a
15052                              * real character, and we can append its fold to
15053                              * 'e' (having deliberately reserved enough space
15054                              * for this eventuality) and drop down to check if
15055                              * the three actually do form a folded sequence */
15056                             (void) _to_utf8_fold_flags(
15057                                 (U8 *) p, (U8 *) RExC_end,
15058                                 (U8 *) e,
15059                                 &added_len,
15060                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15061                                                 ? FOLD_FLAGS_NOMIX_ASCII
15062                                                 : 0));
15063                             e += added_len;
15064                         }
15065                     }
15066
15067                     /* Here, we either have three characters available in
15068                      * sequence starting at 's', or we have two characters and
15069                      * know that the following one can't possibly be part of a
15070                      * three character fold.  We go through the node backwards
15071                      * until we find a place where we can split it without
15072                      * breaking apart a multi-character fold.  At any given
15073                      * point we have to worry about if such a fold begins at
15074                      * the current 's', and also if a three-character fold
15075                      * begins at s-1, (containing s and s+1).  Splitting in
15076                      * either case would break apart a fold */
15077                     do {
15078                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15079                                                             (U8 *) s_start);
15080
15081                         /* If is a multi-char fold, can't split here.  Backup
15082                          * one char and try again */
15083                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15084                             s = prev_s;
15085                             backed_up = TRUE;
15086                             continue;
15087                         }
15088
15089                         /* If the two characters beginning at 's' are part of a
15090                          * three character fold starting at the character
15091                          * before s, we can't split either before or after s.
15092                          * Backup two chars and try again */
15093                         if (   LIKELY(s > s_start)
15094                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15095                         {
15096                             s = prev_s;
15097                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15098                             backed_up = TRUE;
15099                             continue;
15100                         }
15101
15102                         /* Here there's no multi-char fold between s and the
15103                          * next character following it.  We can split */
15104                         splittable = TRUE;
15105                         break;
15106
15107                     } while (s > s_start); /* End of loops backing up through the node */
15108
15109                     /* Here we either couldn't find a place to split the node,
15110                      * or else we broke out of the loop setting 'splittable' to
15111                      * true.  In the latter case, the place to split is between
15112                      * the first and second characters in the sequence starting
15113                      * at 's' */
15114                     if (splittable) {
15115                         s += UTF8SKIP(s);
15116                     }
15117                 }
15118                 else {  /* Pattern not UTF-8 */
15119                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
15120                         || ASCII_FOLD_RESTRICTED)
15121                     {
15122                         assert( toLOWER_L1(ender) < 256 );
15123                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15124                     }
15125                     else {
15126                         *e++ = 's';
15127                         *e++ = 's';
15128                     }
15129
15130                     if (   e - s  <= 1
15131                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15132                     {
15133                         if (isPUNCT(*p)) {
15134                             s--;
15135                             backed_up = TRUE;
15136                         }
15137                         else {
15138                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15139                                 || ASCII_FOLD_RESTRICTED)
15140                             {
15141                                 assert( toLOWER_L1(ender) < 256 );
15142                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15143                             }
15144                             else {
15145                                 *e++ = 's';
15146                                 *e++ = 's';
15147                             }
15148                         }
15149                     }
15150
15151                     do {
15152                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15153                             s--;
15154                             backed_up = TRUE;
15155                             continue;
15156                         }
15157
15158                         if (   LIKELY(s > s_start)
15159                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15160                         {
15161                             s -= 2;
15162                             backed_up = TRUE;
15163                             continue;
15164                         }
15165
15166                         splittable = TRUE;
15167                         break;
15168
15169                     } while (s > s_start);
15170
15171                     if (splittable) {
15172                         s++;
15173                     }
15174                 }
15175
15176                 /* Here, we are done backing up.  If we didn't backup at all
15177                  * (the likely case), just proceed */
15178                 if (backed_up) {
15179
15180                    /* If we did find a place to split, reparse the entire node
15181                     * stopping where we have calculated. */
15182                     if (splittable) {
15183
15184                        /* If we created a temporary folded string under /l, we
15185                         * have to map that back to the original */
15186                         if (need_to_fold_loc) {
15187                             upper_fill = loc_correspondence[s - s_start];
15188                             if (upper_fill == 0) {
15189                                 FAIL2("panic: loc_correspondence[%d] is 0",
15190                                       (int) (s - s_start));
15191                             }
15192                             Safefree(locfold_buf);
15193                             Safefree(loc_correspondence);
15194                         }
15195                         else {
15196                             upper_fill = s - s0;
15197                         }
15198                         goto reparse;
15199                     }
15200
15201                     /* Here the node consists entirely of non-final multi-char
15202                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
15203                      * decent place to split it, so give up and just take the
15204                      * whole thing */
15205                     len = old_s - s0;
15206                 }
15207
15208                 if (need_to_fold_loc) {
15209                     Safefree(locfold_buf);
15210                     Safefree(loc_correspondence);
15211                 }
15212             }   /* End of verifying node ends with an appropriate char */
15213
15214             /* We need to start the next node at the character that didn't fit
15215              * in this one */
15216             p = oldp;
15217
15218           loopdone:   /* Jumped to when encounters something that shouldn't be
15219                          in the node */
15220
15221             /* Free up any over-allocated space; cast is to silence bogus
15222              * warning in MS VC */
15223             change_engine_size(pRExC_state,
15224                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15225
15226             /* I (khw) don't know if you can get here with zero length, but the
15227              * old code handled this situation by creating a zero-length EXACT
15228              * node.  Might as well be NOTHING instead */
15229             if (len == 0) {
15230                 OP(REGNODE_p(ret)) = NOTHING;
15231             }
15232             else {
15233
15234                 /* If the node type is EXACT here, check to see if it
15235                  * should be EXACTL, or EXACT_REQ8. */
15236                 if (node_type == EXACT) {
15237                     if (LOC) {
15238                         node_type = EXACTL;
15239                     }
15240                     else if (requires_utf8_target) {
15241                         node_type = EXACT_REQ8;
15242                     }
15243                 }
15244                 else if (node_type == LEXACT) {
15245                     if (requires_utf8_target) {
15246                         node_type = LEXACT_REQ8;
15247                     }
15248                 }
15249                 else if (FOLD) {
15250                     if (    UNLIKELY(has_micro_sign || has_ss)
15251                         && (node_type == EXACTFU || (   node_type == EXACTF
15252                                                      && maybe_exactfu)))
15253                     {   /* These two conditions are problematic in non-UTF-8
15254                            EXACTFU nodes. */
15255                         assert(! UTF);
15256                         node_type = EXACTFUP;
15257                     }
15258                     else if (node_type == EXACTFL) {
15259
15260                         /* 'maybe_exactfu' is deliberately set above to
15261                          * indicate this node type, where all code points in it
15262                          * are above 255 */
15263                         if (maybe_exactfu) {
15264                             node_type = EXACTFLU8;
15265                         }
15266                         else if (UNLIKELY(
15267                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15268                         {
15269                             /* A character that folds to more than one will
15270                              * match multiple characters, so can't be SIMPLE.
15271                              * We don't have to worry about this with EXACTFLU8
15272                              * nodes just above, as they have already been
15273                              * folded (since the fold doesn't vary at run
15274                              * time).  Here, if the final character in the node
15275                              * folds to multiple, it can't be simple.  (This
15276                              * only has an effect if the node has only a single
15277                              * character, hence the final one, as elsewhere we
15278                              * turn off simple for nodes whose length > 1 */
15279                             maybe_SIMPLE = 0;
15280                         }
15281                     }
15282                     else if (node_type == EXACTF) {  /* Means is /di */
15283
15284                         /* This intermediate variable is needed solely because
15285                          * the asserts in the macro where used exceed Win32's
15286                          * literal string capacity */
15287                         char first_char = * STRING(REGNODE_p(ret));
15288
15289                         /* If 'maybe_exactfu' is clear, then we need to stay
15290                          * /di.  If it is set, it means there are no code
15291                          * points that match differently depending on UTF8ness
15292                          * of the target string, so it can become an EXACTFU
15293                          * node */
15294                         if (! maybe_exactfu) {
15295                             RExC_seen_d_op = TRUE;
15296                         }
15297                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15298                                  || isALPHA_FOLD_EQ(ender, 's'))
15299                         {
15300                             /* But, if the node begins or ends in an 's' we
15301                              * have to defer changing it into an EXACTFU, as
15302                              * the node could later get joined with another one
15303                              * that ends or begins with 's' creating an 'ss'
15304                              * sequence which would then wrongly match the
15305                              * sharp s without the target being UTF-8.  We
15306                              * create a special node that we resolve later when
15307                              * we join nodes together */
15308
15309                             node_type = EXACTFU_S_EDGE;
15310                         }
15311                         else {
15312                             node_type = EXACTFU;
15313                         }
15314                     }
15315
15316                     if (requires_utf8_target && node_type == EXACTFU) {
15317                         node_type = EXACTFU_REQ8;
15318                     }
15319                 }
15320
15321                 OP(REGNODE_p(ret)) = node_type;
15322                 setSTR_LEN(REGNODE_p(ret), len);
15323                 RExC_emit += STR_SZ(len);
15324
15325                 /* If the node isn't a single character, it can't be SIMPLE */
15326                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15327                     maybe_SIMPLE = 0;
15328                 }
15329
15330                 *flagp |= HASWIDTH | maybe_SIMPLE;
15331             }
15332
15333             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15334             RExC_parse = p;
15335
15336             {
15337                 /* len is STRLEN which is unsigned, need to copy to signed */
15338                 IV iv = len;
15339                 if (iv < 0)
15340                     vFAIL("Internal disaster");
15341             }
15342
15343         } /* End of label 'defchar:' */
15344         break;
15345     } /* End of giant switch on input character */
15346
15347     /* Position parse to next real character */
15348     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15349                                             FALSE /* Don't force to /x */ );
15350     if (   *RExC_parse == '{'
15351         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15352     {
15353         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15354             RExC_parse++;
15355             vFAIL("Unescaped left brace in regex is illegal here");
15356         }
15357         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15358                                   " passed through");
15359     }
15360
15361     return(ret);
15362 }
15363
15364
15365 STATIC void
15366 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15367 {
15368     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15369      * sets up the bitmap and any flags, removing those code points from the
15370      * inversion list, setting it to NULL should it become completely empty */
15371
15372
15373     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15374     assert(PL_regkind[OP(node)] == ANYOF);
15375
15376     /* There is no bitmap for this node type */
15377     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15378         return;
15379     }
15380
15381     ANYOF_BITMAP_ZERO(node);
15382     if (*invlist_ptr) {
15383
15384         /* This gets set if we actually need to modify things */
15385         bool change_invlist = FALSE;
15386
15387         UV start, end;
15388
15389         /* Start looking through *invlist_ptr */
15390         invlist_iterinit(*invlist_ptr);
15391         while (invlist_iternext(*invlist_ptr, &start, &end)) {
15392             UV high;
15393             int i;
15394
15395             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15396                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15397             }
15398
15399             /* Quit if are above what we should change */
15400             if (start >= NUM_ANYOF_CODE_POINTS) {
15401                 break;
15402             }
15403
15404             change_invlist = TRUE;
15405
15406             /* Set all the bits in the range, up to the max that we are doing */
15407             high = (end < NUM_ANYOF_CODE_POINTS - 1)
15408                    ? end
15409                    : NUM_ANYOF_CODE_POINTS - 1;
15410             for (i = start; i <= (int) high; i++) {
15411                 ANYOF_BITMAP_SET(node, i);
15412             }
15413         }
15414         invlist_iterfinish(*invlist_ptr);
15415
15416         /* Done with loop; remove any code points that are in the bitmap from
15417          * *invlist_ptr; similarly for code points above the bitmap if we have
15418          * a flag to match all of them anyways */
15419         if (change_invlist) {
15420             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15421         }
15422         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15423             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15424         }
15425
15426         /* If have completely emptied it, remove it completely */
15427         if (_invlist_len(*invlist_ptr) == 0) {
15428             SvREFCNT_dec_NN(*invlist_ptr);
15429             *invlist_ptr = NULL;
15430         }
15431     }
15432 }
15433
15434 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15435    Character classes ([:foo:]) can also be negated ([:^foo:]).
15436    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15437    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15438    but trigger failures because they are currently unimplemented. */
15439
15440 #define POSIXCC_DONE(c)   ((c) == ':')
15441 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15442 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15443 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15444
15445 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15446 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15447 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15448
15449 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15450
15451 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15452  * routine. q.v. */
15453 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15454         if (posix_warnings) {                                               \
15455             if (! RExC_warn_text ) RExC_warn_text =                         \
15456                                          (AV *) sv_2mortal((SV *) newAV()); \
15457             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15458                                              WARNING_PREFIX                 \
15459                                              text                           \
15460                                              REPORT_LOCATION,               \
15461                                              REPORT_LOCATION_ARGS(p)));     \
15462         }                                                                   \
15463     } STMT_END
15464 #define CLEAR_POSIX_WARNINGS()                                              \
15465     STMT_START {                                                            \
15466         if (posix_warnings && RExC_warn_text)                               \
15467             av_clear(RExC_warn_text);                                       \
15468     } STMT_END
15469
15470 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15471     STMT_START {                                                            \
15472         CLEAR_POSIX_WARNINGS();                                             \
15473         return ret;                                                         \
15474     } STMT_END
15475
15476 STATIC int
15477 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15478
15479     const char * const s,      /* Where the putative posix class begins.
15480                                   Normally, this is one past the '['.  This
15481                                   parameter exists so it can be somewhere
15482                                   besides RExC_parse. */
15483     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15484                                   NULL */
15485     AV ** posix_warnings,      /* Where to place any generated warnings, or
15486                                   NULL */
15487     const bool check_only      /* Don't die if error */
15488 )
15489 {
15490     /* This parses what the caller thinks may be one of the three POSIX
15491      * constructs:
15492      *  1) a character class, like [:blank:]
15493      *  2) a collating symbol, like [. .]
15494      *  3) an equivalence class, like [= =]
15495      * In the latter two cases, it croaks if it finds a syntactically legal
15496      * one, as these are not handled by Perl.
15497      *
15498      * The main purpose is to look for a POSIX character class.  It returns:
15499      *  a) the class number
15500      *      if it is a completely syntactically and semantically legal class.
15501      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15502      *      closing ']' of the class
15503      *  b) OOB_NAMEDCLASS
15504      *      if it appears that one of the three POSIX constructs was meant, but
15505      *      its specification was somehow defective.  'updated_parse_ptr', if
15506      *      not NULL, is set to point to the character just after the end
15507      *      character of the class.  See below for handling of warnings.
15508      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15509      *      if it  doesn't appear that a POSIX construct was intended.
15510      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15511      *      raised.
15512      *
15513      * In b) there may be errors or warnings generated.  If 'check_only' is
15514      * TRUE, then any errors are discarded.  Warnings are returned to the
15515      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15516      * instead it is NULL, warnings are suppressed.
15517      *
15518      * The reason for this function, and its complexity is that a bracketed
15519      * character class can contain just about anything.  But it's easy to
15520      * mistype the very specific posix class syntax but yielding a valid
15521      * regular bracketed class, so it silently gets compiled into something
15522      * quite unintended.
15523      *
15524      * The solution adopted here maintains backward compatibility except that
15525      * it adds a warning if it looks like a posix class was intended but
15526      * improperly specified.  The warning is not raised unless what is input
15527      * very closely resembles one of the 14 legal posix classes.  To do this,
15528      * it uses fuzzy parsing.  It calculates how many single-character edits it
15529      * would take to transform what was input into a legal posix class.  Only
15530      * if that number is quite small does it think that the intention was a
15531      * posix class.  Obviously these are heuristics, and there will be cases
15532      * where it errs on one side or another, and they can be tweaked as
15533      * experience informs.
15534      *
15535      * The syntax for a legal posix class is:
15536      *
15537      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15538      *
15539      * What this routine considers syntactically to be an intended posix class
15540      * is this (the comments indicate some restrictions that the pattern
15541      * doesn't show):
15542      *
15543      *  qr/(?x: \[?                         # The left bracket, possibly
15544      *                                      # omitted
15545      *          \h*                         # possibly followed by blanks
15546      *          (?: \^ \h* )?               # possibly a misplaced caret
15547      *          [:;]?                       # The opening class character,
15548      *                                      # possibly omitted.  A typo
15549      *                                      # semi-colon can also be used.
15550      *          \h*
15551      *          \^?                         # possibly a correctly placed
15552      *                                      # caret, but not if there was also
15553      *                                      # a misplaced one
15554      *          \h*
15555      *          .{3,15}                     # The class name.  If there are
15556      *                                      # deviations from the legal syntax,
15557      *                                      # its edit distance must be close
15558      *                                      # to a real class name in order
15559      *                                      # for it to be considered to be
15560      *                                      # an intended posix class.
15561      *          \h*
15562      *          [[:punct:]]?                # The closing class character,
15563      *                                      # possibly omitted.  If not a colon
15564      *                                      # nor semi colon, the class name
15565      *                                      # must be even closer to a valid
15566      *                                      # one
15567      *          \h*
15568      *          \]?                         # The right bracket, possibly
15569      *                                      # omitted.
15570      *     )/
15571      *
15572      * In the above, \h must be ASCII-only.
15573      *
15574      * These are heuristics, and can be tweaked as field experience dictates.
15575      * There will be cases when someone didn't intend to specify a posix class
15576      * that this warns as being so.  The goal is to minimize these, while
15577      * maximizing the catching of things intended to be a posix class that
15578      * aren't parsed as such.
15579      */
15580
15581     const char* p             = s;
15582     const char * const e      = RExC_end;
15583     unsigned complement       = 0;      /* If to complement the class */
15584     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15585     bool has_opening_bracket  = FALSE;
15586     bool has_opening_colon    = FALSE;
15587     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15588                                                    valid class */
15589     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15590     const char* name_start;             /* ptr to class name first char */
15591
15592     /* If the number of single-character typos the input name is away from a
15593      * legal name is no more than this number, it is considered to have meant
15594      * the legal name */
15595     int max_distance          = 2;
15596
15597     /* to store the name.  The size determines the maximum length before we
15598      * decide that no posix class was intended.  Should be at least
15599      * sizeof("alphanumeric") */
15600     UV input_text[15];
15601     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15602
15603     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15604
15605     CLEAR_POSIX_WARNINGS();
15606
15607     if (p >= e) {
15608         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15609     }
15610
15611     if (*(p - 1) != '[') {
15612         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15613         found_problem = TRUE;
15614     }
15615     else {
15616         has_opening_bracket = TRUE;
15617     }
15618
15619     /* They could be confused and think you can put spaces between the
15620      * components */
15621     if (isBLANK(*p)) {
15622         found_problem = TRUE;
15623
15624         do {
15625             p++;
15626         } while (p < e && isBLANK(*p));
15627
15628         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15629     }
15630
15631     /* For [. .] and [= =].  These are quite different internally from [: :],
15632      * so they are handled separately.  */
15633     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15634                                             and 1 for at least one char in it
15635                                           */
15636     {
15637         const char open_char  = *p;
15638         const char * temp_ptr = p + 1;
15639
15640         /* These two constructs are not handled by perl, and if we find a
15641          * syntactically valid one, we croak.  khw, who wrote this code, finds
15642          * this explanation of them very unclear:
15643          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15644          * And searching the rest of the internet wasn't very helpful either.
15645          * It looks like just about any byte can be in these constructs,
15646          * depending on the locale.  But unless the pattern is being compiled
15647          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15648          * In that case, it looks like [= =] isn't allowed at all, and that
15649          * [. .] could be any single code point, but for longer strings the
15650          * constituent characters would have to be the ASCII alphabetics plus
15651          * the minus-hyphen.  Any sensible locale definition would limit itself
15652          * to these.  And any portable one definitely should.  Trying to parse
15653          * the general case is a nightmare (see [perl #127604]).  So, this code
15654          * looks only for interiors of these constructs that match:
15655          *      qr/.|[-\w]{2,}/
15656          * Using \w relaxes the apparent rules a little, without adding much
15657          * danger of mistaking something else for one of these constructs.
15658          *
15659          * [. .] in some implementations described on the internet is usable to
15660          * escape a character that otherwise is special in bracketed character
15661          * classes.  For example [.].] means a literal right bracket instead of
15662          * the ending of the class
15663          *
15664          * [= =] can legitimately contain a [. .] construct, but we don't
15665          * handle this case, as that [. .] construct will later get parsed
15666          * itself and croak then.  And [= =] is checked for even when not under
15667          * /l, as Perl has long done so.
15668          *
15669          * The code below relies on there being a trailing NUL, so it doesn't
15670          * have to keep checking if the parse ptr < e.
15671          */
15672         if (temp_ptr[1] == open_char) {
15673             temp_ptr++;
15674         }
15675         else while (    temp_ptr < e
15676                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15677         {
15678             temp_ptr++;
15679         }
15680
15681         if (*temp_ptr == open_char) {
15682             temp_ptr++;
15683             if (*temp_ptr == ']') {
15684                 temp_ptr++;
15685                 if (! found_problem && ! check_only) {
15686                     RExC_parse = (char *) temp_ptr;
15687                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15688                             "extensions", open_char, open_char);
15689                 }
15690
15691                 /* Here, the syntax wasn't completely valid, or else the call
15692                  * is to check-only */
15693                 if (updated_parse_ptr) {
15694                     *updated_parse_ptr = (char *) temp_ptr;
15695                 }
15696
15697                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15698             }
15699         }
15700
15701         /* If we find something that started out to look like one of these
15702          * constructs, but isn't, we continue below so that it can be checked
15703          * for being a class name with a typo of '.' or '=' instead of a colon.
15704          * */
15705     }
15706
15707     /* Here, we think there is a possibility that a [: :] class was meant, and
15708      * we have the first real character.  It could be they think the '^' comes
15709      * first */
15710     if (*p == '^') {
15711         found_problem = TRUE;
15712         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15713         complement = 1;
15714         p++;
15715
15716         if (isBLANK(*p)) {
15717             found_problem = TRUE;
15718
15719             do {
15720                 p++;
15721             } while (p < e && isBLANK(*p));
15722
15723             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15724         }
15725     }
15726
15727     /* But the first character should be a colon, which they could have easily
15728      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15729      * distinguish from a colon, so treat that as a colon).  */
15730     if (*p == ':') {
15731         p++;
15732         has_opening_colon = TRUE;
15733     }
15734     else if (*p == ';') {
15735         found_problem = TRUE;
15736         p++;
15737         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15738         has_opening_colon = TRUE;
15739     }
15740     else {
15741         found_problem = TRUE;
15742         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15743
15744         /* Consider an initial punctuation (not one of the recognized ones) to
15745          * be a left terminator */
15746         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15747             p++;
15748         }
15749     }
15750
15751     /* They may think that you can put spaces between the components */
15752     if (isBLANK(*p)) {
15753         found_problem = TRUE;
15754
15755         do {
15756             p++;
15757         } while (p < e && isBLANK(*p));
15758
15759         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15760     }
15761
15762     if (*p == '^') {
15763
15764         /* We consider something like [^:^alnum:]] to not have been intended to
15765          * be a posix class, but XXX maybe we should */
15766         if (complement) {
15767             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15768         }
15769
15770         complement = 1;
15771         p++;
15772     }
15773
15774     /* Again, they may think that you can put spaces between the components */
15775     if (isBLANK(*p)) {
15776         found_problem = TRUE;
15777
15778         do {
15779             p++;
15780         } while (p < e && isBLANK(*p));
15781
15782         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15783     }
15784
15785     if (*p == ']') {
15786
15787         /* XXX This ']' may be a typo, and something else was meant.  But
15788          * treating it as such creates enough complications, that that
15789          * possibility isn't currently considered here.  So we assume that the
15790          * ']' is what is intended, and if we've already found an initial '[',
15791          * this leaves this construct looking like [:] or [:^], which almost
15792          * certainly weren't intended to be posix classes */
15793         if (has_opening_bracket) {
15794             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15795         }
15796
15797         /* But this function can be called when we parse the colon for
15798          * something like qr/[alpha:]]/, so we back up to look for the
15799          * beginning */
15800         p--;
15801
15802         if (*p == ';') {
15803             found_problem = TRUE;
15804             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15805         }
15806         else if (*p != ':') {
15807
15808             /* XXX We are currently very restrictive here, so this code doesn't
15809              * consider the possibility that, say, /[alpha.]]/ was intended to
15810              * be a posix class. */
15811             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15812         }
15813
15814         /* Here we have something like 'foo:]'.  There was no initial colon,
15815          * and we back up over 'foo.  XXX Unlike the going forward case, we
15816          * don't handle typos of non-word chars in the middle */
15817         has_opening_colon = FALSE;
15818         p--;
15819
15820         while (p > RExC_start && isWORDCHAR(*p)) {
15821             p--;
15822         }
15823         p++;
15824
15825         /* Here, we have positioned ourselves to where we think the first
15826          * character in the potential class is */
15827     }
15828
15829     /* Now the interior really starts.  There are certain key characters that
15830      * can end the interior, or these could just be typos.  To catch both
15831      * cases, we may have to do two passes.  In the first pass, we keep on
15832      * going unless we come to a sequence that matches
15833      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15834      * This means it takes a sequence to end the pass, so two typos in a row if
15835      * that wasn't what was intended.  If the class is perfectly formed, just
15836      * this one pass is needed.  We also stop if there are too many characters
15837      * being accumulated, but this number is deliberately set higher than any
15838      * real class.  It is set high enough so that someone who thinks that
15839      * 'alphanumeric' is a correct name would get warned that it wasn't.
15840      * While doing the pass, we keep track of where the key characters were in
15841      * it.  If we don't find an end to the class, and one of the key characters
15842      * was found, we redo the pass, but stop when we get to that character.
15843      * Thus the key character was considered a typo in the first pass, but a
15844      * terminator in the second.  If two key characters are found, we stop at
15845      * the second one in the first pass.  Again this can miss two typos, but
15846      * catches a single one
15847      *
15848      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15849      * point to the first key character.  For the second pass, it starts as -1.
15850      * */
15851
15852     name_start = p;
15853   parse_name:
15854     {
15855         bool has_blank               = FALSE;
15856         bool has_upper               = FALSE;
15857         bool has_terminating_colon   = FALSE;
15858         bool has_terminating_bracket = FALSE;
15859         bool has_semi_colon          = FALSE;
15860         unsigned int name_len        = 0;
15861         int punct_count              = 0;
15862
15863         while (p < e) {
15864
15865             /* Squeeze out blanks when looking up the class name below */
15866             if (isBLANK(*p) ) {
15867                 has_blank = TRUE;
15868                 found_problem = TRUE;
15869                 p++;
15870                 continue;
15871             }
15872
15873             /* The name will end with a punctuation */
15874             if (isPUNCT(*p)) {
15875                 const char * peek = p + 1;
15876
15877                 /* Treat any non-']' punctuation followed by a ']' (possibly
15878                  * with intervening blanks) as trying to terminate the class.
15879                  * ']]' is very likely to mean a class was intended (but
15880                  * missing the colon), but the warning message that gets
15881                  * generated shows the error position better if we exit the
15882                  * loop at the bottom (eventually), so skip it here. */
15883                 if (*p != ']') {
15884                     if (peek < e && isBLANK(*peek)) {
15885                         has_blank = TRUE;
15886                         found_problem = TRUE;
15887                         do {
15888                             peek++;
15889                         } while (peek < e && isBLANK(*peek));
15890                     }
15891
15892                     if (peek < e && *peek == ']') {
15893                         has_terminating_bracket = TRUE;
15894                         if (*p == ':') {
15895                             has_terminating_colon = TRUE;
15896                         }
15897                         else if (*p == ';') {
15898                             has_semi_colon = TRUE;
15899                             has_terminating_colon = TRUE;
15900                         }
15901                         else {
15902                             found_problem = TRUE;
15903                         }
15904                         p = peek + 1;
15905                         goto try_posix;
15906                     }
15907                 }
15908
15909                 /* Here we have punctuation we thought didn't end the class.
15910                  * Keep track of the position of the key characters that are
15911                  * more likely to have been class-enders */
15912                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15913
15914                     /* Allow just one such possible class-ender not actually
15915                      * ending the class. */
15916                     if (possible_end) {
15917                         break;
15918                     }
15919                     possible_end = p;
15920                 }
15921
15922                 /* If we have too many punctuation characters, no use in
15923                  * keeping going */
15924                 if (++punct_count > max_distance) {
15925                     break;
15926                 }
15927
15928                 /* Treat the punctuation as a typo. */
15929                 input_text[name_len++] = *p;
15930                 p++;
15931             }
15932             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15933                 input_text[name_len++] = toLOWER(*p);
15934                 has_upper = TRUE;
15935                 found_problem = TRUE;
15936                 p++;
15937             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15938                 input_text[name_len++] = *p;
15939                 p++;
15940             }
15941             else {
15942                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15943                 p+= UTF8SKIP(p);
15944             }
15945
15946             /* The declaration of 'input_text' is how long we allow a potential
15947              * class name to be, before saying they didn't mean a class name at
15948              * all */
15949             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15950                 break;
15951             }
15952         }
15953
15954         /* We get to here when the possible class name hasn't been properly
15955          * terminated before:
15956          *   1) we ran off the end of the pattern; or
15957          *   2) found two characters, each of which might have been intended to
15958          *      be the name's terminator
15959          *   3) found so many punctuation characters in the purported name,
15960          *      that the edit distance to a valid one is exceeded
15961          *   4) we decided it was more characters than anyone could have
15962          *      intended to be one. */
15963
15964         found_problem = TRUE;
15965
15966         /* In the final two cases, we know that looking up what we've
15967          * accumulated won't lead to a match, even a fuzzy one. */
15968         if (   name_len >= C_ARRAY_LENGTH(input_text)
15969             || punct_count > max_distance)
15970         {
15971             /* If there was an intermediate key character that could have been
15972              * an intended end, redo the parse, but stop there */
15973             if (possible_end && possible_end != (char *) -1) {
15974                 possible_end = (char *) -1; /* Special signal value to say
15975                                                we've done a first pass */
15976                 p = name_start;
15977                 goto parse_name;
15978             }
15979
15980             /* Otherwise, it can't have meant to have been a class */
15981             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15982         }
15983
15984         /* If we ran off the end, and the final character was a punctuation
15985          * one, back up one, to look at that final one just below.  Later, we
15986          * will restore the parse pointer if appropriate */
15987         if (name_len && p == e && isPUNCT(*(p-1))) {
15988             p--;
15989             name_len--;
15990         }
15991
15992         if (p < e && isPUNCT(*p)) {
15993             if (*p == ']') {
15994                 has_terminating_bracket = TRUE;
15995
15996                 /* If this is a 2nd ']', and the first one is just below this
15997                  * one, consider that to be the real terminator.  This gives a
15998                  * uniform and better positioning for the warning message  */
15999                 if (   possible_end
16000                     && possible_end != (char *) -1
16001                     && *possible_end == ']'
16002                     && name_len && input_text[name_len - 1] == ']')
16003                 {
16004                     name_len--;
16005                     p = possible_end;
16006
16007                     /* And this is actually equivalent to having done the 2nd
16008                      * pass now, so set it to not try again */
16009                     possible_end = (char *) -1;
16010                 }
16011             }
16012             else {
16013                 if (*p == ':') {
16014                     has_terminating_colon = TRUE;
16015                 }
16016                 else if (*p == ';') {
16017                     has_semi_colon = TRUE;
16018                     has_terminating_colon = TRUE;
16019                 }
16020                 p++;
16021             }
16022         }
16023
16024     try_posix:
16025
16026         /* Here, we have a class name to look up.  We can short circuit the
16027          * stuff below for short names that can't possibly be meant to be a
16028          * class name.  (We can do this on the first pass, as any second pass
16029          * will yield an even shorter name) */
16030         if (name_len < 3) {
16031             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16032         }
16033
16034         /* Find which class it is.  Initially switch on the length of the name.
16035          * */
16036         switch (name_len) {
16037             case 4:
16038                 if (memEQs(name_start, 4, "word")) {
16039                     /* this is not POSIX, this is the Perl \w */
16040                     class_number = ANYOF_WORDCHAR;
16041                 }
16042                 break;
16043             case 5:
16044                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16045                  *                        graph lower print punct space upper
16046                  * Offset 4 gives the best switch position.  */
16047                 switch (name_start[4]) {
16048                     case 'a':
16049                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
16050                             class_number = ANYOF_ALPHA;
16051                         break;
16052                     case 'e':
16053                         if (memBEGINs(name_start, 5, "spac")) /* space */
16054                             class_number = ANYOF_SPACE;
16055                         break;
16056                     case 'h':
16057                         if (memBEGINs(name_start, 5, "grap")) /* graph */
16058                             class_number = ANYOF_GRAPH;
16059                         break;
16060                     case 'i':
16061                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
16062                             class_number = ANYOF_ASCII;
16063                         break;
16064                     case 'k':
16065                         if (memBEGINs(name_start, 5, "blan")) /* blank */
16066                             class_number = ANYOF_BLANK;
16067                         break;
16068                     case 'l':
16069                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16070                             class_number = ANYOF_CNTRL;
16071                         break;
16072                     case 'm':
16073                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16074                             class_number = ANYOF_ALPHANUMERIC;
16075                         break;
16076                     case 'r':
16077                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
16078                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16079                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16080                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16081                         break;
16082                     case 't':
16083                         if (memBEGINs(name_start, 5, "digi")) /* digit */
16084                             class_number = ANYOF_DIGIT;
16085                         else if (memBEGINs(name_start, 5, "prin")) /* print */
16086                             class_number = ANYOF_PRINT;
16087                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
16088                             class_number = ANYOF_PUNCT;
16089                         break;
16090                 }
16091                 break;
16092             case 6:
16093                 if (memEQs(name_start, 6, "xdigit"))
16094                     class_number = ANYOF_XDIGIT;
16095                 break;
16096         }
16097
16098         /* If the name exactly matches a posix class name the class number will
16099          * here be set to it, and the input almost certainly was meant to be a
16100          * posix class, so we can skip further checking.  If instead the syntax
16101          * is exactly correct, but the name isn't one of the legal ones, we
16102          * will return that as an error below.  But if neither of these apply,
16103          * it could be that no posix class was intended at all, or that one
16104          * was, but there was a typo.  We tease these apart by doing fuzzy
16105          * matching on the name */
16106         if (class_number == OOB_NAMEDCLASS && found_problem) {
16107             const UV posix_names[][6] = {
16108                                                 { 'a', 'l', 'n', 'u', 'm' },
16109                                                 { 'a', 'l', 'p', 'h', 'a' },
16110                                                 { 'a', 's', 'c', 'i', 'i' },
16111                                                 { 'b', 'l', 'a', 'n', 'k' },
16112                                                 { 'c', 'n', 't', 'r', 'l' },
16113                                                 { 'd', 'i', 'g', 'i', 't' },
16114                                                 { 'g', 'r', 'a', 'p', 'h' },
16115                                                 { 'l', 'o', 'w', 'e', 'r' },
16116                                                 { 'p', 'r', 'i', 'n', 't' },
16117                                                 { 'p', 'u', 'n', 'c', 't' },
16118                                                 { 's', 'p', 'a', 'c', 'e' },
16119                                                 { 'u', 'p', 'p', 'e', 'r' },
16120                                                 { 'w', 'o', 'r', 'd' },
16121                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
16122                                             };
16123             /* The names of the above all have added NULs to make them the same
16124              * size, so we need to also have the real lengths */
16125             const UV posix_name_lengths[] = {
16126                                                 sizeof("alnum") - 1,
16127                                                 sizeof("alpha") - 1,
16128                                                 sizeof("ascii") - 1,
16129                                                 sizeof("blank") - 1,
16130                                                 sizeof("cntrl") - 1,
16131                                                 sizeof("digit") - 1,
16132                                                 sizeof("graph") - 1,
16133                                                 sizeof("lower") - 1,
16134                                                 sizeof("print") - 1,
16135                                                 sizeof("punct") - 1,
16136                                                 sizeof("space") - 1,
16137                                                 sizeof("upper") - 1,
16138                                                 sizeof("word")  - 1,
16139                                                 sizeof("xdigit")- 1
16140                                             };
16141             unsigned int i;
16142             int temp_max = max_distance;    /* Use a temporary, so if we
16143                                                reparse, we haven't changed the
16144                                                outer one */
16145
16146             /* Use a smaller max edit distance if we are missing one of the
16147              * delimiters */
16148             if (   has_opening_bracket + has_opening_colon < 2
16149                 || has_terminating_bracket + has_terminating_colon < 2)
16150             {
16151                 temp_max--;
16152             }
16153
16154             /* See if the input name is close to a legal one */
16155             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16156
16157                 /* Short circuit call if the lengths are too far apart to be
16158                  * able to match */
16159                 if (abs( (int) (name_len - posix_name_lengths[i]))
16160                     > temp_max)
16161                 {
16162                     continue;
16163                 }
16164
16165                 if (edit_distance(input_text,
16166                                   posix_names[i],
16167                                   name_len,
16168                                   posix_name_lengths[i],
16169                                   temp_max
16170                                  )
16171                     > -1)
16172                 { /* If it is close, it probably was intended to be a class */
16173                     goto probably_meant_to_be;
16174                 }
16175             }
16176
16177             /* Here the input name is not close enough to a valid class name
16178              * for us to consider it to be intended to be a posix class.  If
16179              * we haven't already done so, and the parse found a character that
16180              * could have been terminators for the name, but which we absorbed
16181              * as typos during the first pass, repeat the parse, signalling it
16182              * to stop at that character */
16183             if (possible_end && possible_end != (char *) -1) {
16184                 possible_end = (char *) -1;
16185                 p = name_start;
16186                 goto parse_name;
16187             }
16188
16189             /* Here neither pass found a close-enough class name */
16190             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16191         }
16192
16193     probably_meant_to_be:
16194
16195         /* Here we think that a posix specification was intended.  Update any
16196          * parse pointer */
16197         if (updated_parse_ptr) {
16198             *updated_parse_ptr = (char *) p;
16199         }
16200
16201         /* If a posix class name was intended but incorrectly specified, we
16202          * output or return the warnings */
16203         if (found_problem) {
16204
16205             /* We set flags for these issues in the parse loop above instead of
16206              * adding them to the list of warnings, because we can parse it
16207              * twice, and we only want one warning instance */
16208             if (has_upper) {
16209                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16210             }
16211             if (has_blank) {
16212                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16213             }
16214             if (has_semi_colon) {
16215                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16216             }
16217             else if (! has_terminating_colon) {
16218                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16219             }
16220             if (! has_terminating_bracket) {
16221                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16222             }
16223
16224             if (   posix_warnings
16225                 && RExC_warn_text
16226                 && av_count(RExC_warn_text) > 0)
16227             {
16228                 *posix_warnings = RExC_warn_text;
16229             }
16230         }
16231         else if (class_number != OOB_NAMEDCLASS) {
16232             /* If it is a known class, return the class.  The class number
16233              * #defines are structured so each complement is +1 to the normal
16234              * one */
16235             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16236         }
16237         else if (! check_only) {
16238
16239             /* Here, it is an unrecognized class.  This is an error (unless the
16240             * call is to check only, which we've already handled above) */
16241             const char * const complement_string = (complement)
16242                                                    ? "^"
16243                                                    : "";
16244             RExC_parse = (char *) p;
16245             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16246                         complement_string,
16247                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16248         }
16249     }
16250
16251     return OOB_NAMEDCLASS;
16252 }
16253 #undef ADD_POSIX_WARNING
16254
16255 STATIC unsigned  int
16256 S_regex_set_precedence(const U8 my_operator) {
16257
16258     /* Returns the precedence in the (?[...]) construct of the input operator,
16259      * specified by its character representation.  The precedence follows
16260      * general Perl rules, but it extends this so that ')' and ']' have (low)
16261      * precedence even though they aren't really operators */
16262
16263     switch (my_operator) {
16264         case '!':
16265             return 5;
16266         case '&':
16267             return 4;
16268         case '^':
16269         case '|':
16270         case '+':
16271         case '-':
16272             return 3;
16273         case ')':
16274             return 2;
16275         case ']':
16276             return 1;
16277     }
16278
16279     NOT_REACHED; /* NOTREACHED */
16280     return 0;   /* Silence compiler warning */
16281 }
16282
16283 STATIC regnode_offset
16284 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16285                     I32 *flagp, U32 depth,
16286                     char * const oregcomp_parse)
16287 {
16288     /* Handle the (?[...]) construct to do set operations */
16289
16290     U8 curchar;                     /* Current character being parsed */
16291     UV start, end;                  /* End points of code point ranges */
16292     SV* final = NULL;               /* The end result inversion list */
16293     SV* result_string;              /* 'final' stringified */
16294     AV* stack;                      /* stack of operators and operands not yet
16295                                        resolved */
16296     AV* fence_stack = NULL;         /* A stack containing the positions in
16297                                        'stack' of where the undealt-with left
16298                                        parens would be if they were actually
16299                                        put there */
16300     /* The 'volatile' is a workaround for an optimiser bug
16301      * in Solaris Studio 12.3. See RT #127455 */
16302     volatile IV fence = 0;          /* Position of where most recent undealt-
16303                                        with left paren in stack is; -1 if none.
16304                                      */
16305     STRLEN len;                     /* Temporary */
16306     regnode_offset node;            /* Temporary, and final regnode returned by
16307                                        this function */
16308     const bool save_fold = FOLD;    /* Temporary */
16309     char *save_end, *save_parse;    /* Temporaries */
16310     const bool in_locale = LOC;     /* we turn off /l during processing */
16311
16312     DECLARE_AND_GET_RE_DEBUG_FLAGS;
16313
16314     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16315     PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16316
16317     DEBUG_PARSE("xcls");
16318
16319     if (in_locale) {
16320         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16321     }
16322
16323     /* The use of this operator implies /u.  This is required so that the
16324      * compile time values are valid in all runtime cases */
16325     REQUIRE_UNI_RULES(flagp, 0);
16326
16327     ckWARNexperimental(RExC_parse,
16328                        WARN_EXPERIMENTAL__REGEX_SETS,
16329                        "The regex_sets feature is experimental");
16330
16331     /* Everything in this construct is a metacharacter.  Operands begin with
16332      * either a '\' (for an escape sequence), or a '[' for a bracketed
16333      * character class.  Any other character should be an operator, or
16334      * parenthesis for grouping.  Both types of operands are handled by calling
16335      * regclass() to parse them.  It is called with a parameter to indicate to
16336      * return the computed inversion list.  The parsing here is implemented via
16337      * a stack.  Each entry on the stack is a single character representing one
16338      * of the operators; or else a pointer to an operand inversion list. */
16339
16340 #define IS_OPERATOR(a) SvIOK(a)
16341 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16342
16343     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16344      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16345      * with pronouncing it called it Reverse Polish instead, but now that YOU
16346      * know how to pronounce it you can use the correct term, thus giving due
16347      * credit to the person who invented it, and impressing your geek friends.
16348      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16349      * it is now more like an English initial W (as in wonk) than an L.)
16350      *
16351      * This means that, for example, 'a | b & c' is stored on the stack as
16352      *
16353      * c  [4]
16354      * b  [3]
16355      * &  [2]
16356      * a  [1]
16357      * |  [0]
16358      *
16359      * where the numbers in brackets give the stack [array] element number.
16360      * In this implementation, parentheses are not stored on the stack.
16361      * Instead a '(' creates a "fence" so that the part of the stack below the
16362      * fence is invisible except to the corresponding ')' (this allows us to
16363      * replace testing for parens, by using instead subtraction of the fence
16364      * position).  As new operands are processed they are pushed onto the stack
16365      * (except as noted in the next paragraph).  New operators of higher
16366      * precedence than the current final one are inserted on the stack before
16367      * the lhs operand (so that when the rhs is pushed next, everything will be
16368      * in the correct positions shown above.  When an operator of equal or
16369      * lower precedence is encountered in parsing, all the stacked operations
16370      * of equal or higher precedence are evaluated, leaving the result as the
16371      * top entry on the stack.  This makes higher precedence operations
16372      * evaluate before lower precedence ones, and causes operations of equal
16373      * precedence to left associate.
16374      *
16375      * The only unary operator '!' is immediately pushed onto the stack when
16376      * encountered.  When an operand is encountered, if the top of the stack is
16377      * a '!", the complement is immediately performed, and the '!' popped.  The
16378      * resulting value is treated as a new operand, and the logic in the
16379      * previous paragraph is executed.  Thus in the expression
16380      *      [a] + ! [b]
16381      * the stack looks like
16382      *
16383      * !
16384      * a
16385      * +
16386      *
16387      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16388      * becomes
16389      *
16390      * !b
16391      * a
16392      * +
16393      *
16394      * A ')' is treated as an operator with lower precedence than all the
16395      * aforementioned ones, which causes all operations on the stack above the
16396      * corresponding '(' to be evaluated down to a single resultant operand.
16397      * Then the fence for the '(' is removed, and the operand goes through the
16398      * algorithm above, without the fence.
16399      *
16400      * A separate stack is kept of the fence positions, so that the position of
16401      * the latest so-far unbalanced '(' is at the top of it.
16402      *
16403      * The ']' ending the construct is treated as the lowest operator of all,
16404      * so that everything gets evaluated down to a single operand, which is the
16405      * result */
16406
16407     sv_2mortal((SV *)(stack = newAV()));
16408     sv_2mortal((SV *)(fence_stack = newAV()));
16409
16410     while (RExC_parse < RExC_end) {
16411         I32 top_index;              /* Index of top-most element in 'stack' */
16412         SV** top_ptr;               /* Pointer to top 'stack' element */
16413         SV* current = NULL;         /* To contain the current inversion list
16414                                        operand */
16415         SV* only_to_avoid_leaks;
16416
16417         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16418                                 TRUE /* Force /x */ );
16419         if (RExC_parse >= RExC_end) {   /* Fail */
16420             break;
16421         }
16422
16423         curchar = UCHARAT(RExC_parse);
16424
16425 redo_curchar:
16426
16427 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16428                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16429         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16430                                            stack, fence, fence_stack));
16431 #endif
16432
16433         top_index = av_tindex_skip_len_mg(stack);
16434
16435         switch (curchar) {
16436             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16437             char stacked_operator;  /* The topmost operator on the 'stack'. */
16438             SV* lhs;                /* Operand to the left of the operator */
16439             SV* rhs;                /* Operand to the right of the operator */
16440             SV* fence_ptr;          /* Pointer to top element of the fence
16441                                        stack */
16442             case '(':
16443
16444                 if (   RExC_parse < RExC_end - 2
16445                     && UCHARAT(RExC_parse + 1) == '?'
16446                     && UCHARAT(RExC_parse + 2) == '^')
16447                 {
16448                     const regnode_offset orig_emit = RExC_emit;
16449                     SV * resultant_invlist;
16450
16451                     /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16452                      * This happens when we have some thing like
16453                      *
16454                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16455                      *   ...
16456                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16457                      *
16458                      * Here we would be handling the interpolated
16459                      * '$thai_or_lao'.  We handle this by a recursive call to
16460                      * reg which returns the inversion list the
16461                      * interpolated expression evaluates to.  Actually, the
16462                      * return is a special regnode containing a pointer to that
16463                      * inversion list.  If the return isn't that regnode alone,
16464                      * we know that this wasn't such an interpolation, which is
16465                      * an error: we need to get a single inversion list back
16466                      * from the recursion */
16467
16468                     RExC_parse++;
16469                     RExC_sets_depth++;
16470
16471                     node = reg(pRExC_state, 2, flagp, depth+1);
16472                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16473
16474                     if (   OP(REGNODE_p(node)) != REGEX_SET
16475                            /* If more than a single node returned, the nested
16476                             * parens evaluated to more than just a (?[...]),
16477                             * which isn't legal */
16478                         || RExC_emit != orig_emit
16479                                       + NODE_STEP_REGNODE
16480                                       + regarglen[REGEX_SET])
16481                     {
16482                         vFAIL("Expecting interpolated extended charclass");
16483                     }
16484                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16485                     current = invlist_clone(resultant_invlist, NULL);
16486                     SvREFCNT_dec(resultant_invlist);
16487
16488                     RExC_sets_depth--;
16489                     RExC_emit = orig_emit;
16490                     goto handle_operand;
16491                 }
16492
16493                 /* A regular '('.  Look behind for illegal syntax */
16494                 if (top_index - fence >= 0) {
16495                     /* If the top entry on the stack is an operator, it had
16496                      * better be a '!', otherwise the entry below the top
16497                      * operand should be an operator */
16498                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16499                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16500                         || (   IS_OPERAND(*top_ptr)
16501                             && (   top_index - fence < 1
16502                                 || ! (stacked_ptr = av_fetch(stack,
16503                                                              top_index - 1,
16504                                                              FALSE))
16505                                 || ! IS_OPERATOR(*stacked_ptr))))
16506                     {
16507                         RExC_parse++;
16508                         vFAIL("Unexpected '(' with no preceding operator");
16509                     }
16510                 }
16511
16512                 /* Stack the position of this undealt-with left paren */
16513                 av_push(fence_stack, newSViv(fence));
16514                 fence = top_index + 1;
16515                 break;
16516
16517             case '\\':
16518                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16519                  * multi-char folds are allowed.  */
16520                 if (!regclass(pRExC_state, flagp, depth+1,
16521                               TRUE, /* means parse just the next thing */
16522                               FALSE, /* don't allow multi-char folds */
16523                               FALSE, /* don't silence non-portable warnings.  */
16524                               TRUE,  /* strict */
16525                               FALSE, /* Require return to be an ANYOF */
16526                               &current))
16527                 {
16528                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16529                     goto regclass_failed;
16530                 }
16531
16532                 assert(current);
16533
16534                 /* regclass() will return with parsing just the \ sequence,
16535                  * leaving the parse pointer at the next thing to parse */
16536                 RExC_parse--;
16537                 goto handle_operand;
16538
16539             case '[':   /* Is a bracketed character class */
16540             {
16541                 /* See if this is a [:posix:] class. */
16542                 bool is_posix_class = (OOB_NAMEDCLASS
16543                             < handle_possible_posix(pRExC_state,
16544                                                 RExC_parse + 1,
16545                                                 NULL,
16546                                                 NULL,
16547                                                 TRUE /* checking only */));
16548                 /* If it is a posix class, leave the parse pointer at the '['
16549                  * to fool regclass() into thinking it is part of a
16550                  * '[[:posix:]]'. */
16551                 if (! is_posix_class) {
16552                     RExC_parse++;
16553                 }
16554
16555                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16556                  * multi-char folds are allowed.  */
16557                 if (!regclass(pRExC_state, flagp, depth+1,
16558                                 is_posix_class, /* parse the whole char
16559                                                     class only if not a
16560                                                     posix class */
16561                                 FALSE, /* don't allow multi-char folds */
16562                                 TRUE, /* silence non-portable warnings. */
16563                                 TRUE, /* strict */
16564                                 FALSE, /* Require return to be an ANYOF */
16565                                 &current))
16566                 {
16567                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16568                     goto regclass_failed;
16569                 }
16570
16571                 assert(current);
16572
16573                 /* function call leaves parse pointing to the ']', except if we
16574                  * faked it */
16575                 if (is_posix_class) {
16576                     RExC_parse--;
16577                 }
16578
16579                 goto handle_operand;
16580             }
16581
16582             case ']':
16583                 if (top_index >= 1) {
16584                     goto join_operators;
16585                 }
16586
16587                 /* Only a single operand on the stack: are done */
16588                 goto done;
16589
16590             case ')':
16591                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16592                     if (UCHARAT(RExC_parse - 1) == ']')  {
16593                         break;
16594                     }
16595                     RExC_parse++;
16596                     vFAIL("Unexpected ')'");
16597                 }
16598
16599                 /* If nothing after the fence, is missing an operand */
16600                 if (top_index - fence < 0) {
16601                     RExC_parse++;
16602                     goto bad_syntax;
16603                 }
16604                 /* If at least two things on the stack, treat this as an
16605                   * operator */
16606                 if (top_index - fence >= 1) {
16607                     goto join_operators;
16608                 }
16609
16610                 /* Here only a single thing on the fenced stack, and there is a
16611                  * fence.  Get rid of it */
16612                 fence_ptr = av_pop(fence_stack);
16613                 assert(fence_ptr);
16614                 fence = SvIV(fence_ptr);
16615                 SvREFCNT_dec_NN(fence_ptr);
16616                 fence_ptr = NULL;
16617
16618                 if (fence < 0) {
16619                     fence = 0;
16620                 }
16621
16622                 /* Having gotten rid of the fence, we pop the operand at the
16623                  * stack top and process it as a newly encountered operand */
16624                 current = av_pop(stack);
16625                 if (IS_OPERAND(current)) {
16626                     goto handle_operand;
16627                 }
16628
16629                 RExC_parse++;
16630                 goto bad_syntax;
16631
16632             case '&':
16633             case '|':
16634             case '+':
16635             case '-':
16636             case '^':
16637
16638                 /* These binary operators should have a left operand already
16639                  * parsed */
16640                 if (   top_index - fence < 0
16641                     || top_index - fence == 1
16642                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16643                     || ! IS_OPERAND(*top_ptr))
16644                 {
16645                     goto unexpected_binary;
16646                 }
16647
16648                 /* If only the one operand is on the part of the stack visible
16649                  * to us, we just place this operator in the proper position */
16650                 if (top_index - fence < 2) {
16651
16652                     /* Place the operator before the operand */
16653
16654                     SV* lhs = av_pop(stack);
16655                     av_push(stack, newSVuv(curchar));
16656                     av_push(stack, lhs);
16657                     break;
16658                 }
16659
16660                 /* But if there is something else on the stack, we need to
16661                  * process it before this new operator if and only if the
16662                  * stacked operation has equal or higher precedence than the
16663                  * new one */
16664
16665              join_operators:
16666
16667                 /* The operator on the stack is supposed to be below both its
16668                  * operands */
16669                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16670                     || IS_OPERAND(*stacked_ptr))
16671                 {
16672                     /* But if not, it's legal and indicates we are completely
16673                      * done if and only if we're currently processing a ']',
16674                      * which should be the final thing in the expression */
16675                     if (curchar == ']') {
16676                         goto done;
16677                     }
16678
16679                   unexpected_binary:
16680                     RExC_parse++;
16681                     vFAIL2("Unexpected binary operator '%c' with no "
16682                            "preceding operand", curchar);
16683                 }
16684                 stacked_operator = (char) SvUV(*stacked_ptr);
16685
16686                 if (regex_set_precedence(curchar)
16687                     > regex_set_precedence(stacked_operator))
16688                 {
16689                     /* Here, the new operator has higher precedence than the
16690                      * stacked one.  This means we need to add the new one to
16691                      * the stack to await its rhs operand (and maybe more
16692                      * stuff).  We put it before the lhs operand, leaving
16693                      * untouched the stacked operator and everything below it
16694                      * */
16695                     lhs = av_pop(stack);
16696                     assert(IS_OPERAND(lhs));
16697
16698                     av_push(stack, newSVuv(curchar));
16699                     av_push(stack, lhs);
16700                     break;
16701                 }
16702
16703                 /* Here, the new operator has equal or lower precedence than
16704                  * what's already there.  This means the operation already
16705                  * there should be performed now, before the new one. */
16706
16707                 rhs = av_pop(stack);
16708                 if (! IS_OPERAND(rhs)) {
16709
16710                     /* This can happen when a ! is not followed by an operand,
16711                      * like in /(?[\t &!])/ */
16712                     goto bad_syntax;
16713                 }
16714
16715                 lhs = av_pop(stack);
16716
16717                 if (! IS_OPERAND(lhs)) {
16718
16719                     /* This can happen when there is an empty (), like in
16720                      * /(?[[0]+()+])/ */
16721                     goto bad_syntax;
16722                 }
16723
16724                 switch (stacked_operator) {
16725                     case '&':
16726                         _invlist_intersection(lhs, rhs, &rhs);
16727                         break;
16728
16729                     case '|':
16730                     case '+':
16731                         _invlist_union(lhs, rhs, &rhs);
16732                         break;
16733
16734                     case '-':
16735                         _invlist_subtract(lhs, rhs, &rhs);
16736                         break;
16737
16738                     case '^':   /* The union minus the intersection */
16739                     {
16740                         SV* i = NULL;
16741                         SV* u = NULL;
16742
16743                         _invlist_union(lhs, rhs, &u);
16744                         _invlist_intersection(lhs, rhs, &i);
16745                         _invlist_subtract(u, i, &rhs);
16746                         SvREFCNT_dec_NN(i);
16747                         SvREFCNT_dec_NN(u);
16748                         break;
16749                     }
16750                 }
16751                 SvREFCNT_dec(lhs);
16752
16753                 /* Here, the higher precedence operation has been done, and the
16754                  * result is in 'rhs'.  We overwrite the stacked operator with
16755                  * the result.  Then we redo this code to either push the new
16756                  * operator onto the stack or perform any higher precedence
16757                  * stacked operation */
16758                 only_to_avoid_leaks = av_pop(stack);
16759                 SvREFCNT_dec(only_to_avoid_leaks);
16760                 av_push(stack, rhs);
16761                 goto redo_curchar;
16762
16763             case '!':   /* Highest priority, right associative */
16764
16765                 /* If what's already at the top of the stack is another '!",
16766                  * they just cancel each other out */
16767                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16768                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16769                 {
16770                     only_to_avoid_leaks = av_pop(stack);
16771                     SvREFCNT_dec(only_to_avoid_leaks);
16772                 }
16773                 else { /* Otherwise, since it's right associative, just push
16774                           onto the stack */
16775                     av_push(stack, newSVuv(curchar));
16776                 }
16777                 break;
16778
16779             default:
16780                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16781                 if (RExC_parse >= RExC_end) {
16782                     break;
16783                 }
16784                 vFAIL("Unexpected character");
16785
16786           handle_operand:
16787
16788             /* Here 'current' is the operand.  If something is already on the
16789              * stack, we have to check if it is a !.  But first, the code above
16790              * may have altered the stack in the time since we earlier set
16791              * 'top_index'.  */
16792
16793             top_index = av_tindex_skip_len_mg(stack);
16794             if (top_index - fence >= 0) {
16795                 /* If the top entry on the stack is an operator, it had better
16796                  * be a '!', otherwise the entry below the top operand should
16797                  * be an operator */
16798                 top_ptr = av_fetch(stack, top_index, FALSE);
16799                 assert(top_ptr);
16800                 if (IS_OPERATOR(*top_ptr)) {
16801
16802                     /* The only permissible operator at the top of the stack is
16803                      * '!', which is applied immediately to this operand. */
16804                     curchar = (char) SvUV(*top_ptr);
16805                     if (curchar != '!') {
16806                         SvREFCNT_dec(current);
16807                         vFAIL2("Unexpected binary operator '%c' with no "
16808                                 "preceding operand", curchar);
16809                     }
16810
16811                     _invlist_invert(current);
16812
16813                     only_to_avoid_leaks = av_pop(stack);
16814                     SvREFCNT_dec(only_to_avoid_leaks);
16815
16816                     /* And we redo with the inverted operand.  This allows
16817                      * handling multiple ! in a row */
16818                     goto handle_operand;
16819                 }
16820                           /* Single operand is ok only for the non-binary ')'
16821                            * operator */
16822                 else if ((top_index - fence == 0 && curchar != ')')
16823                          || (top_index - fence > 0
16824                              && (! (stacked_ptr = av_fetch(stack,
16825                                                            top_index - 1,
16826                                                            FALSE))
16827                                  || IS_OPERAND(*stacked_ptr))))
16828                 {
16829                     SvREFCNT_dec(current);
16830                     vFAIL("Operand with no preceding operator");
16831                 }
16832             }
16833
16834             /* Here there was nothing on the stack or the top element was
16835              * another operand.  Just add this new one */
16836             av_push(stack, current);
16837
16838         } /* End of switch on next parse token */
16839
16840         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16841     } /* End of loop parsing through the construct */
16842
16843     vFAIL("Syntax error in (?[...])");
16844
16845   done:
16846
16847     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16848         if (RExC_parse < RExC_end) {
16849             RExC_parse++;
16850         }
16851
16852         vFAIL("Unexpected ']' with no following ')' in (?[...");
16853     }
16854
16855     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16856         vFAIL("Unmatched (");
16857     }
16858
16859     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16860         || ((final = av_pop(stack)) == NULL)
16861         || ! IS_OPERAND(final)
16862         || ! is_invlist(final)
16863         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16864     {
16865       bad_syntax:
16866         SvREFCNT_dec(final);
16867         vFAIL("Incomplete expression within '(?[ ])'");
16868     }
16869
16870     /* Here, 'final' is the resultant inversion list from evaluating the
16871      * expression.  Return it if so requested */
16872     if (return_invlist) {
16873         *return_invlist = final;
16874         return END;
16875     }
16876
16877     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
16878                                regnode */
16879         RExC_parse++;
16880         node = regpnode(pRExC_state, REGEX_SET, final);
16881     }
16882     else {
16883
16884         /* Otherwise generate a resultant node, based on 'final'.  regclass()
16885          * is expecting a string of ranges and individual code points */
16886         invlist_iterinit(final);
16887         result_string = newSVpvs("");
16888         while (invlist_iternext(final, &start, &end)) {
16889             if (start == end) {
16890                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16891             }
16892             else {
16893                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16894                                                         UVXf "}", start, end);
16895             }
16896         }
16897
16898         /* About to generate an ANYOF (or similar) node from the inversion list
16899          * we have calculated */
16900         save_parse = RExC_parse;
16901         RExC_parse = SvPV(result_string, len);
16902         save_end = RExC_end;
16903         RExC_end = RExC_parse + len;
16904         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16905
16906         /* We turn off folding around the call, as the class we have
16907          * constructed already has all folding taken into consideration, and we
16908          * don't want regclass() to add to that */
16909         RExC_flags &= ~RXf_PMf_FOLD;
16910         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16911          * folds are allowed.  */
16912         node = regclass(pRExC_state, flagp, depth+1,
16913                         FALSE, /* means parse the whole char class */
16914                         FALSE, /* don't allow multi-char folds */
16915                         TRUE, /* silence non-portable warnings.  The above may
16916                                  very well have generated non-portable code
16917                                  points, but they're valid on this machine */
16918                         FALSE, /* similarly, no need for strict */
16919
16920                         /* We can optimize into something besides an ANYOF,
16921                          * except under /l, which needs to be ANYOF because of
16922                          * runtime checks for locale sanity, etc */
16923                     ! in_locale,
16924                         NULL
16925                     );
16926
16927         RESTORE_WARNINGS;
16928         RExC_parse = save_parse + 1;
16929         RExC_end = save_end;
16930         SvREFCNT_dec_NN(final);
16931         SvREFCNT_dec_NN(result_string);
16932
16933         if (save_fold) {
16934             RExC_flags |= RXf_PMf_FOLD;
16935         }
16936
16937         if (!node) {
16938             RETURN_FAIL_ON_RESTART(*flagp, flagp);
16939             goto regclass_failed;
16940         }
16941
16942         /* Fix up the node type if we are in locale.  (We have pretended we are
16943          * under /u for the purposes of regclass(), as this construct will only
16944          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
16945          * (so as to cause any warnings about bad locales to be output in
16946          * regexec.c), and add the flag that indicates to check if not in a
16947          * UTF-8 locale.  The reason we above forbid optimization into
16948          * something other than an ANYOF node is simply to minimize the number
16949          * of code changes in regexec.c.  Otherwise we would have to create new
16950          * EXACTish node types and deal with them.  This decision could be
16951          * revisited should this construct become popular.
16952          *
16953          * (One might think we could look at the resulting ANYOF node and
16954          * suppress the flag if everything is above 255, as those would be
16955          * UTF-8 only, but this isn't true, as the components that led to that
16956          * result could have been locale-affected, and just happen to cancel
16957          * each other out under UTF-8 locales.) */
16958         if (in_locale) {
16959             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16960
16961             assert(OP(REGNODE_p(node)) == ANYOF);
16962
16963             OP(REGNODE_p(node)) = ANYOFL;
16964             ANYOF_FLAGS(REGNODE_p(node))
16965                     |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16966         }
16967     }
16968
16969     nextchar(pRExC_state);
16970     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16971     return node;
16972
16973   regclass_failed:
16974     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16975                                                                 (UV) *flagp);
16976 }
16977
16978 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16979
16980 STATIC void
16981 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16982                              AV * stack, const IV fence, AV * fence_stack)
16983 {   /* Dumps the stacks in handle_regex_sets() */
16984
16985     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16986     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16987     SSize_t i;
16988
16989     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16990
16991     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16992
16993     if (stack_top < 0) {
16994         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16995     }
16996     else {
16997         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16998         for (i = stack_top; i >= 0; i--) {
16999             SV ** element_ptr = av_fetch(stack, i, FALSE);
17000             if (! element_ptr) {
17001             }
17002
17003             if (IS_OPERATOR(*element_ptr)) {
17004                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17005                                             (int) i, (int) SvIV(*element_ptr));
17006             }
17007             else {
17008                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17009                 sv_dump(*element_ptr);
17010             }
17011         }
17012     }
17013
17014     if (fence_stack_top < 0) {
17015         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17016     }
17017     else {
17018         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17019         for (i = fence_stack_top; i >= 0; i--) {
17020             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17021             if (! element_ptr) {
17022             }
17023
17024             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17025                                             (int) i, (int) SvIV(*element_ptr));
17026         }
17027     }
17028 }
17029
17030 #endif
17031
17032 #undef IS_OPERATOR
17033 #undef IS_OPERAND
17034
17035 STATIC void
17036 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17037 {
17038     /* This adds the Latin1/above-Latin1 folding rules.
17039      *
17040      * This should be called only for a Latin1-range code points, cp, which is
17041      * known to be involved in a simple fold with other code points above
17042      * Latin1.  It would give false results if /aa has been specified.
17043      * Multi-char folds are outside the scope of this, and must be handled
17044      * specially. */
17045
17046     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17047
17048     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17049
17050     /* The rules that are valid for all Unicode versions are hard-coded in */
17051     switch (cp) {
17052         case 'k':
17053         case 'K':
17054           *invlist =
17055              add_cp_to_invlist(*invlist, KELVIN_SIGN);
17056             break;
17057         case 's':
17058         case 'S':
17059           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17060             break;
17061         case MICRO_SIGN:
17062           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17063           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17064             break;
17065         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17066         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17067           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17068             break;
17069         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17070           *invlist = add_cp_to_invlist(*invlist,
17071                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17072             break;
17073
17074         default:    /* Other code points are checked against the data for the
17075                        current Unicode version */
17076           {
17077             Size_t folds_count;
17078             U32 first_fold;
17079             const U32 * remaining_folds;
17080             UV folded_cp;
17081
17082             if (isASCII(cp)) {
17083                 folded_cp = toFOLD(cp);
17084             }
17085             else {
17086                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17087                 Size_t dummy_len;
17088                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17089             }
17090
17091             if (folded_cp > 255) {
17092                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17093             }
17094
17095             folds_count = _inverse_folds(folded_cp, &first_fold,
17096                                                     &remaining_folds);
17097             if (folds_count == 0) {
17098
17099                 /* Use deprecated warning to increase the chances of this being
17100                  * output */
17101                 ckWARN2reg_d(RExC_parse,
17102                         "Perl folding rules are not up-to-date for 0x%02X;"
17103                         " please use the perlbug utility to report;", cp);
17104             }
17105             else {
17106                 unsigned int i;
17107
17108                 if (first_fold > 255) {
17109                     *invlist = add_cp_to_invlist(*invlist, first_fold);
17110                 }
17111                 for (i = 0; i < folds_count - 1; i++) {
17112                     if (remaining_folds[i] > 255) {
17113                         *invlist = add_cp_to_invlist(*invlist,
17114                                                     remaining_folds[i]);
17115                     }
17116                 }
17117             }
17118             break;
17119          }
17120     }
17121 }
17122
17123 STATIC void
17124 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17125 {
17126     /* Output the elements of the array given by '*posix_warnings' as REGEXP
17127      * warnings. */
17128
17129     SV * msg;
17130     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17131
17132     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17133
17134     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17135         CLEAR_POSIX_WARNINGS();
17136         return;
17137     }
17138
17139     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17140         if (first_is_fatal) {           /* Avoid leaking this */
17141             av_undef(posix_warnings);   /* This isn't necessary if the
17142                                             array is mortal, but is a
17143                                             fail-safe */
17144             (void) sv_2mortal(msg);
17145             PREPARE_TO_DIE;
17146         }
17147         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17148         SvREFCNT_dec_NN(msg);
17149     }
17150
17151     UPDATE_WARNINGS_LOC(RExC_parse);
17152 }
17153
17154 PERL_STATIC_INLINE Size_t
17155 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17156 {
17157     const U8 * const start = s1;
17158     const U8 * const send = start + max;
17159
17160     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17161
17162     while (s1 < send && *s1  == *s2) {
17163         s1++; s2++;
17164     }
17165
17166     return s1 - start;
17167 }
17168
17169
17170 STATIC AV *
17171 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17172 {
17173     /* This adds the string scalar <multi_string> to the array
17174      * <multi_char_matches>.  <multi_string> is known to have exactly
17175      * <cp_count> code points in it.  This is used when constructing a
17176      * bracketed character class and we find something that needs to match more
17177      * than a single character.
17178      *
17179      * <multi_char_matches> is actually an array of arrays.  Each top-level
17180      * element is an array that contains all the strings known so far that are
17181      * the same length.  And that length (in number of code points) is the same
17182      * as the index of the top-level array.  Hence, the [2] element is an
17183      * array, each element thereof is a string containing TWO code points;
17184      * while element [3] is for strings of THREE characters, and so on.  Since
17185      * this is for multi-char strings there can never be a [0] nor [1] element.
17186      *
17187      * When we rewrite the character class below, we will do so such that the
17188      * longest strings are written first, so that it prefers the longest
17189      * matching strings first.  This is done even if it turns out that any
17190      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
17191      * Christiansen has agreed that this is ok.  This makes the test for the
17192      * ligature 'ffi' come before the test for 'ff', for example */
17193
17194     AV* this_array;
17195     AV** this_array_ptr;
17196
17197     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17198
17199     if (! multi_char_matches) {
17200         multi_char_matches = newAV();
17201     }
17202
17203     if (av_exists(multi_char_matches, cp_count)) {
17204         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17205         this_array = *this_array_ptr;
17206     }
17207     else {
17208         this_array = newAV();
17209         av_store(multi_char_matches, cp_count,
17210                  (SV*) this_array);
17211     }
17212     av_push(this_array, multi_string);
17213
17214     return multi_char_matches;
17215 }
17216
17217 /* The names of properties whose definitions are not known at compile time are
17218  * stored in this SV, after a constant heading.  So if the length has been
17219  * changed since initialization, then there is a run-time definition. */
17220 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
17221                                         (SvCUR(listsv) != initial_listsv_len)
17222
17223 /* There is a restricted set of white space characters that are legal when
17224  * ignoring white space in a bracketed character class.  This generates the
17225  * code to skip them.
17226  *
17227  * There is a line below that uses the same white space criteria but is outside
17228  * this macro.  Both here and there must use the same definition */
17229 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
17230     STMT_START {                                                        \
17231         if (do_skip) {                                                  \
17232             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
17233             {                                                           \
17234                 p++;                                                    \
17235             }                                                           \
17236         }                                                               \
17237     } STMT_END
17238
17239 STATIC regnode_offset
17240 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17241                  const bool stop_at_1,  /* Just parse the next thing, don't
17242                                            look for a full character class */
17243                  bool allow_mutiple_chars,
17244                  const bool silence_non_portable,   /* Don't output warnings
17245                                                        about too large
17246                                                        characters */
17247                  const bool strict,
17248                  bool optimizable,                  /* ? Allow a non-ANYOF return
17249                                                        node */
17250                  SV** ret_invlist  /* Return an inversion list, not a node */
17251           )
17252 {
17253     /* parse a bracketed class specification.  Most of these will produce an
17254      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17255      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17256      * under /i with multi-character folds: it will be rewritten following the
17257      * paradigm of this example, where the <multi-fold>s are characters which
17258      * fold to multiple character sequences:
17259      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17260      * gets effectively rewritten as:
17261      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17262      * reg() gets called (recursively) on the rewritten version, and this
17263      * function will return what it constructs.  (Actually the <multi-fold>s
17264      * aren't physically removed from the [abcdefghi], it's just that they are
17265      * ignored in the recursion by means of a flag:
17266      * <RExC_in_multi_char_class>.)
17267      *
17268      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17269      * characters, with the corresponding bit set if that character is in the
17270      * list.  For characters above this, an inversion list is used.  There
17271      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17272      * determinable at compile time
17273      *
17274      * On success, returns the offset at which any next node should be placed
17275      * into the regex engine program being compiled.
17276      *
17277      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17278      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17279      * UTF-8
17280      */
17281
17282     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17283     IV range = 0;
17284     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17285     regnode_offset ret = -1;    /* Initialized to an illegal value */
17286     STRLEN numlen;
17287     int namedclass = OOB_NAMEDCLASS;
17288     char *rangebegin = NULL;
17289     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17290                                aren't available at the time this was called */
17291     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17292                                       than just initialized.  */
17293     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17294     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17295                                extended beyond the Latin1 range.  These have to
17296                                be kept separate from other code points for much
17297                                of this function because their handling  is
17298                                different under /i, and for most classes under
17299                                /d as well */
17300     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17301                                separate for a while from the non-complemented
17302                                versions because of complications with /d
17303                                matching */
17304     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17305                                   treated more simply than the general case,
17306                                   leading to less compilation and execution
17307                                   work */
17308     UV element_count = 0;   /* Number of distinct elements in the class.
17309                                Optimizations may be possible if this is tiny */
17310     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17311                                        character; used under /i */
17312     UV n;
17313     char * stop_ptr = RExC_end;    /* where to stop parsing */
17314
17315     /* ignore unescaped whitespace? */
17316     const bool skip_white = cBOOL(   ret_invlist
17317                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17318
17319     /* inversion list of code points this node matches only when the target
17320      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17321      * /d) */
17322     SV* upper_latin1_only_utf8_matches = NULL;
17323
17324     /* Inversion list of code points this node matches regardless of things
17325      * like locale, folding, utf8ness of the target string */
17326     SV* cp_list = NULL;
17327
17328     /* Like cp_list, but code points on this list need to be checked for things
17329      * that fold to/from them under /i */
17330     SV* cp_foldable_list = NULL;
17331
17332     /* Like cp_list, but code points on this list are valid only when the
17333      * runtime locale is UTF-8 */
17334     SV* only_utf8_locale_list = NULL;
17335
17336     /* In a range, if one of the endpoints is non-character-set portable,
17337      * meaning that it hard-codes a code point that may mean a different
17338      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17339      * mnemonic '\t' which each mean the same character no matter which
17340      * character set the platform is on. */
17341     unsigned int non_portable_endpoint = 0;
17342
17343     /* Is the range unicode? which means on a platform that isn't 1-1 native
17344      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17345      * to be a Unicode value.  */
17346     bool unicode_range = FALSE;
17347     bool invert = FALSE;    /* Is this class to be complemented */
17348
17349     bool warn_super = ALWAYS_WARN_SUPER;
17350
17351     const char * orig_parse = RExC_parse;
17352
17353     /* This variable is used to mark where the end in the input is of something
17354      * that looks like a POSIX construct but isn't.  During the parse, when
17355      * something looks like it could be such a construct is encountered, it is
17356      * checked for being one, but not if we've already checked this area of the
17357      * input.  Only after this position is reached do we check again */
17358     char *not_posix_region_end = RExC_parse - 1;
17359
17360     AV* posix_warnings = NULL;
17361     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17362     U8 op = END;    /* The returned node-type, initialized to an impossible
17363                        one.  */
17364     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17365     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17366
17367
17368 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17369  * mutually exclusive.) */
17370 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17371                                             haven't been defined as of yet */
17372 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17373                                             UTF-8 or not */
17374 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17375                                             what gets folded */
17376     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17377
17378     DECLARE_AND_GET_RE_DEBUG_FLAGS;
17379
17380     PERL_ARGS_ASSERT_REGCLASS;
17381 #ifndef DEBUGGING
17382     PERL_UNUSED_ARG(depth);
17383 #endif
17384
17385     assert(! (ret_invlist && allow_mutiple_chars));
17386
17387     /* If wants an inversion list returned, we can't optimize to something
17388      * else. */
17389     if (ret_invlist) {
17390         optimizable = FALSE;
17391     }
17392
17393     DEBUG_PARSE("clas");
17394
17395 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17396     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17397                                    && UNICODE_DOT_DOT_VERSION == 0)
17398     allow_mutiple_chars = FALSE;
17399 #endif
17400
17401     /* We include the /i status at the beginning of this so that we can
17402      * know it at runtime */
17403     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17404     initial_listsv_len = SvCUR(listsv);
17405     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17406
17407     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17408
17409     assert(RExC_parse <= RExC_end);
17410
17411     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
17412         RExC_parse++;
17413         invert = TRUE;
17414         allow_mutiple_chars = FALSE;
17415         MARK_NAUGHTY(1);
17416         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17417     }
17418
17419     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17420     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17421         int maybe_class = handle_possible_posix(pRExC_state,
17422                                                 RExC_parse,
17423                                                 &not_posix_region_end,
17424                                                 NULL,
17425                                                 TRUE /* checking only */);
17426         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17427             ckWARN4reg(not_posix_region_end,
17428                     "POSIX syntax [%c %c] belongs inside character classes%s",
17429                     *RExC_parse, *RExC_parse,
17430                     (maybe_class == OOB_NAMEDCLASS)
17431                     ? ((POSIXCC_NOTYET(*RExC_parse))
17432                         ? " (but this one isn't implemented)"
17433                         : " (but this one isn't fully valid)")
17434                     : ""
17435                     );
17436         }
17437     }
17438
17439     /* If the caller wants us to just parse a single element, accomplish this
17440      * by faking the loop ending condition */
17441     if (stop_at_1 && RExC_end > RExC_parse) {
17442         stop_ptr = RExC_parse + 1;
17443     }
17444
17445     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17446     if (UCHARAT(RExC_parse) == ']')
17447         goto charclassloop;
17448
17449     while (1) {
17450
17451         if (   posix_warnings
17452             && av_tindex_skip_len_mg(posix_warnings) >= 0
17453             && RExC_parse > not_posix_region_end)
17454         {
17455             /* Warnings about posix class issues are considered tentative until
17456              * we are far enough along in the parse that we can no longer
17457              * change our mind, at which point we output them.  This is done
17458              * each time through the loop so that a later class won't zap them
17459              * before they have been dealt with. */
17460             output_posix_warnings(pRExC_state, posix_warnings);
17461         }
17462
17463         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17464
17465         if  (RExC_parse >= stop_ptr) {
17466             break;
17467         }
17468
17469         if  (UCHARAT(RExC_parse) == ']') {
17470             break;
17471         }
17472
17473       charclassloop:
17474
17475         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17476         save_value = value;
17477         save_prevvalue = prevvalue;
17478
17479         if (!range) {
17480             rangebegin = RExC_parse;
17481             element_count++;
17482             non_portable_endpoint = 0;
17483         }
17484         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17485             value = utf8n_to_uvchr((U8*)RExC_parse,
17486                                    RExC_end - RExC_parse,
17487                                    &numlen, UTF8_ALLOW_DEFAULT);
17488             RExC_parse += numlen;
17489         }
17490         else
17491             value = UCHARAT(RExC_parse++);
17492
17493         if (value == '[') {
17494             char * posix_class_end;
17495             namedclass = handle_possible_posix(pRExC_state,
17496                                                RExC_parse,
17497                                                &posix_class_end,
17498                                                do_posix_warnings ? &posix_warnings : NULL,
17499                                                FALSE    /* die if error */);
17500             if (namedclass > OOB_NAMEDCLASS) {
17501
17502                 /* If there was an earlier attempt to parse this particular
17503                  * posix class, and it failed, it was a false alarm, as this
17504                  * successful one proves */
17505                 if (   posix_warnings
17506                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17507                     && not_posix_region_end >= RExC_parse
17508                     && not_posix_region_end <= posix_class_end)
17509                 {
17510                     av_undef(posix_warnings);
17511                 }
17512
17513                 RExC_parse = posix_class_end;
17514             }
17515             else if (namedclass == OOB_NAMEDCLASS) {
17516                 not_posix_region_end = posix_class_end;
17517             }
17518             else {
17519                 namedclass = OOB_NAMEDCLASS;
17520             }
17521         }
17522         else if (   RExC_parse - 1 > not_posix_region_end
17523                  && MAYBE_POSIXCC(value))
17524         {
17525             (void) handle_possible_posix(
17526                         pRExC_state,
17527                         RExC_parse - 1,  /* -1 because parse has already been
17528                                             advanced */
17529                         &not_posix_region_end,
17530                         do_posix_warnings ? &posix_warnings : NULL,
17531                         TRUE /* checking only */);
17532         }
17533         else if (  strict && ! skip_white
17534                  && (   _generic_isCC(value, _CC_VERTSPACE)
17535                      || is_VERTWS_cp_high(value)))
17536         {
17537             vFAIL("Literal vertical space in [] is illegal except under /x");
17538         }
17539         else if (value == '\\') {
17540             /* Is a backslash; get the code point of the char after it */
17541
17542             if (RExC_parse >= RExC_end) {
17543                 vFAIL("Unmatched [");
17544             }
17545
17546             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17547                 value = utf8n_to_uvchr((U8*)RExC_parse,
17548                                    RExC_end - RExC_parse,
17549                                    &numlen, UTF8_ALLOW_DEFAULT);
17550                 RExC_parse += numlen;
17551             }
17552             else
17553                 value = UCHARAT(RExC_parse++);
17554
17555             /* Some compilers cannot handle switching on 64-bit integer
17556              * values, therefore value cannot be an UV.  Yes, this will
17557              * be a problem later if we want switch on Unicode.
17558              * A similar issue a little bit later when switching on
17559              * namedclass. --jhi */
17560
17561             /* If the \ is escaping white space when white space is being
17562              * skipped, it means that that white space is wanted literally, and
17563              * is already in 'value'.  Otherwise, need to translate the escape
17564              * into what it signifies. */
17565             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17566                 const char * message;
17567                 U32 packed_warn;
17568                 U8 grok_c_char;
17569
17570             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17571             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17572             case 's':   namedclass = ANYOF_SPACE;       break;
17573             case 'S':   namedclass = ANYOF_NSPACE;      break;
17574             case 'd':   namedclass = ANYOF_DIGIT;       break;
17575             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17576             case 'v':   namedclass = ANYOF_VERTWS;      break;
17577             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17578             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17579             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17580             case 'N':  /* Handle \N{NAME} in class */
17581                 {
17582                     const char * const backslash_N_beg = RExC_parse - 2;
17583                     int cp_count;
17584
17585                     if (! grok_bslash_N(pRExC_state,
17586                                         NULL,      /* No regnode */
17587                                         &value,    /* Yes single value */
17588                                         &cp_count, /* Multiple code pt count */
17589                                         flagp,
17590                                         strict,
17591                                         depth)
17592                     ) {
17593
17594                         if (*flagp & NEED_UTF8)
17595                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17596
17597                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17598
17599                         if (cp_count < 0) {
17600                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17601                         }
17602                         else if (cp_count == 0) {
17603                             ckWARNreg(RExC_parse,
17604                               "Ignoring zero length \\N{} in character class");
17605                         }
17606                         else { /* cp_count > 1 */
17607                             assert(cp_count > 1);
17608                             if (! RExC_in_multi_char_class) {
17609                                 if ( ! allow_mutiple_chars
17610                                     || invert
17611                                     || range
17612                                     || *RExC_parse == '-')
17613                                 {
17614                                     if (strict) {
17615                                         RExC_parse--;
17616                                         vFAIL("\\N{} here is restricted to one character");
17617                                     }
17618                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17619                                     break; /* <value> contains the first code
17620                                               point. Drop out of the switch to
17621                                               process it */
17622                                 }
17623                                 else {
17624                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17625                                                  RExC_parse - backslash_N_beg);
17626                                     multi_char_matches
17627                                         = add_multi_match(multi_char_matches,
17628                                                           multi_char_N,
17629                                                           cp_count);
17630                                 }
17631                             }
17632                         } /* End of cp_count != 1 */
17633
17634                         /* This element should not be processed further in this
17635                          * class */
17636                         element_count--;
17637                         value = save_value;
17638                         prevvalue = save_prevvalue;
17639                         continue;   /* Back to top of loop to get next char */
17640                     }
17641
17642                     /* Here, is a single code point, and <value> contains it */
17643                     unicode_range = TRUE;   /* \N{} are Unicode */
17644                 }
17645                 break;
17646             case 'p':
17647             case 'P':
17648                 {
17649                 char *e;
17650
17651                 if (RExC_pm_flags & PMf_WILDCARD) {
17652                     RExC_parse++;
17653                     /* diag_listed_as: Use of %s is not allowed in Unicode
17654                        property wildcard subpatterns in regex; marked by <--
17655                        HERE in m/%s/ */
17656                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17657                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17658                 }
17659
17660                 /* \p means they want Unicode semantics */
17661                 REQUIRE_UNI_RULES(flagp, 0);
17662
17663                 if (RExC_parse >= RExC_end)
17664                     vFAIL2("Empty \\%c", (U8)value);
17665                 if (*RExC_parse == '{') {
17666                     const U8 c = (U8)value;
17667                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17668                     if (!e) {
17669                         RExC_parse++;
17670                         vFAIL2("Missing right brace on \\%c{}", c);
17671                     }
17672
17673                     RExC_parse++;
17674
17675                     /* White space is allowed adjacent to the braces and after
17676                      * any '^', even when not under /x */
17677                     while (isSPACE(*RExC_parse)) {
17678                          RExC_parse++;
17679                     }
17680
17681                     if (UCHARAT(RExC_parse) == '^') {
17682
17683                         /* toggle.  (The rhs xor gets the single bit that
17684                          * differs between P and p; the other xor inverts just
17685                          * that bit) */
17686                         value ^= 'P' ^ 'p';
17687
17688                         RExC_parse++;
17689                         while (isSPACE(*RExC_parse)) {
17690                             RExC_parse++;
17691                         }
17692                     }
17693
17694                     if (e == RExC_parse)
17695                         vFAIL2("Empty \\%c{}", c);
17696
17697                     n = e - RExC_parse;
17698                     while (isSPACE(*(RExC_parse + n - 1)))
17699                         n--;
17700
17701                 }   /* The \p isn't immediately followed by a '{' */
17702                 else if (! isALPHA(*RExC_parse)) {
17703                     RExC_parse += (UTF)
17704                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17705                                   : 1;
17706                     vFAIL2("Character following \\%c must be '{' or a "
17707                            "single-character Unicode property name",
17708                            (U8) value);
17709                 }
17710                 else {
17711                     e = RExC_parse;
17712                     n = 1;
17713                 }
17714                 {
17715                     char* name = RExC_parse;
17716
17717                     /* Any message returned about expanding the definition */
17718                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17719
17720                     /* If set TRUE, the property is user-defined as opposed to
17721                      * official Unicode */
17722                     bool user_defined = FALSE;
17723                     AV * strings = NULL;
17724
17725                     SV * prop_definition = parse_uniprop_string(
17726                                             name, n, UTF, FOLD,
17727                                             FALSE, /* This is compile-time */
17728
17729                                             /* We can't defer this defn when
17730                                              * the full result is required in
17731                                              * this call */
17732                                             ! cBOOL(ret_invlist),
17733
17734                                             &strings,
17735                                             &user_defined,
17736                                             msg,
17737                                             0 /* Base level */
17738                                            );
17739                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17740                         assert(prop_definition == NULL);
17741                         RExC_parse = e + 1;
17742                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17743                                                thing so, or else the display is
17744                                                mojibake */
17745                             RExC_utf8 = TRUE;
17746                         }
17747                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17748                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17749                                     SvCUR(msg), SvPVX(msg)));
17750                     }
17751
17752                     assert(prop_definition || strings);
17753
17754                     if (strings) {
17755                         if (ret_invlist) {
17756                             if (! prop_definition) {
17757                                 RExC_parse = e + 1;
17758                                 vFAIL("Unicode string properties are not implemented in (?[...])");
17759                             }
17760                             else {
17761                                 ckWARNreg(e + 1,
17762                                     "Using just the single character results"
17763                                     " returned by \\p{} in (?[...])");
17764                             }
17765                         }
17766                         else if (! RExC_in_multi_char_class) {
17767                             if (invert ^ (value == 'P')) {
17768                                 RExC_parse = e + 1;
17769                                 vFAIL("Inverting a character class which contains"
17770                                     " a multi-character sequence is illegal");
17771                             }
17772
17773                             /* For each multi-character string ... */
17774                             while (av_count(strings) > 0) {
17775                                 /* ... Each entry is itself an array of code
17776                                 * points. */
17777                                 AV * this_string = (AV *) av_shift( strings);
17778                                 STRLEN cp_count = av_count(this_string);
17779                                 SV * final = newSV(cp_count * 4);
17780                                 SvPVCLEAR(final);
17781
17782                                 /* Create another string of sequences of \x{...} */
17783                                 while (av_count(this_string) > 0) {
17784                                     SV * character = av_shift(this_string);
17785                                     UV cp = SvUV(character);
17786
17787                                     if (cp > 255) {
17788                                         REQUIRE_UTF8(flagp);
17789                                     }
17790                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17791                                                                         cp);
17792                                     SvREFCNT_dec_NN(character);
17793                                 }
17794                                 SvREFCNT_dec_NN(this_string);
17795
17796                                 /* And add that to the list of such things */
17797                                 multi_char_matches
17798                                             = add_multi_match(multi_char_matches,
17799                                                             final,
17800                                                             cp_count);
17801                             }
17802                         }
17803                         SvREFCNT_dec_NN(strings);
17804                     }
17805
17806                     if (! prop_definition) {    /* If we got only a string,
17807                                                    this iteration didn't really
17808                                                    find a character */
17809                         element_count--;
17810                     }
17811                     else if (! is_invlist(prop_definition)) {
17812
17813                         /* Here, the definition isn't known, so we have gotten
17814                          * returned a string that will be evaluated if and when
17815                          * encountered at runtime.  We add it to the list of
17816                          * such properties, along with whether it should be
17817                          * complemented or not */
17818                         if (value == 'P') {
17819                             sv_catpvs(listsv, "!");
17820                         }
17821                         else {
17822                             sv_catpvs(listsv, "+");
17823                         }
17824                         sv_catsv(listsv, prop_definition);
17825
17826                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17827
17828                         /* We don't know yet what this matches, so have to flag
17829                          * it */
17830                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17831                     }
17832                     else {
17833                         assert (prop_definition && is_invlist(prop_definition));
17834
17835                         /* Here we do have the complete property definition
17836                          *
17837                          * Temporary workaround for [perl #133136].  For this
17838                          * precise input that is in the .t that is failing,
17839                          * load utf8.pm, which is what the test wants, so that
17840                          * that .t passes */
17841                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17842                                         "foo\\p{Alnum}")
17843                             && ! hv_common(GvHVn(PL_incgv),
17844                                            NULL,
17845                                            "utf8.pm", sizeof("utf8.pm") - 1,
17846                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17847                         {
17848                             require_pv("utf8.pm");
17849                         }
17850
17851                         if (! user_defined &&
17852                             /* We warn on matching an above-Unicode code point
17853                              * if the match would return true, except don't
17854                              * warn for \p{All}, which has exactly one element
17855                              * = 0 */
17856                             (_invlist_contains_cp(prop_definition, 0x110000)
17857                                 && (! (_invlist_len(prop_definition) == 1
17858                                        && *invlist_array(prop_definition) == 0))))
17859                         {
17860                             warn_super = TRUE;
17861                         }
17862
17863                         /* Invert if asking for the complement */
17864                         if (value == 'P') {
17865                             _invlist_union_complement_2nd(properties,
17866                                                           prop_definition,
17867                                                           &properties);
17868                         }
17869                         else {
17870                             _invlist_union(properties, prop_definition, &properties);
17871                         }
17872                     }
17873                 }
17874
17875                 RExC_parse = e + 1;
17876                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17877                                                 named */
17878                 }
17879                 break;
17880             case 'n':   value = '\n';                   break;
17881             case 'r':   value = '\r';                   break;
17882             case 't':   value = '\t';                   break;
17883             case 'f':   value = '\f';                   break;
17884             case 'b':   value = '\b';                   break;
17885             case 'e':   value = ESC_NATIVE;             break;
17886             case 'a':   value = '\a';                   break;
17887             case 'o':
17888                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17889                 if (! grok_bslash_o(&RExC_parse,
17890                                             RExC_end,
17891                                             &value,
17892                                             &message,
17893                                             &packed_warn,
17894                                             strict,
17895                                             cBOOL(range), /* MAX_UV allowed for range
17896                                                       upper limit */
17897                                             UTF))
17898                 {
17899                     vFAIL(message);
17900                 }
17901                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17902                     warn_non_literal_string(RExC_parse, packed_warn, message);
17903                 }
17904
17905                 if (value < 256) {
17906                     non_portable_endpoint++;
17907                 }
17908                 break;
17909             case 'x':
17910                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17911                 if (!  grok_bslash_x(&RExC_parse,
17912                                             RExC_end,
17913                                             &value,
17914                                             &message,
17915                                             &packed_warn,
17916                                             strict,
17917                                             cBOOL(range), /* MAX_UV allowed for range
17918                                                       upper limit */
17919                                             UTF))
17920                 {
17921                     vFAIL(message);
17922                 }
17923                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17924                     warn_non_literal_string(RExC_parse, packed_warn, message);
17925                 }
17926
17927                 if (value < 256) {
17928                     non_portable_endpoint++;
17929                 }
17930                 break;
17931             case 'c':
17932                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17933                                                                 &packed_warn))
17934                 {
17935                     /* going to die anyway; point to exact spot of
17936                         * failure */
17937                     RExC_parse += (UTF)
17938                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17939                                   : 1;
17940                     vFAIL(message);
17941                 }
17942
17943                 value = grok_c_char;
17944                 RExC_parse++;
17945                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17946                     warn_non_literal_string(RExC_parse, packed_warn, message);
17947                 }
17948
17949                 non_portable_endpoint++;
17950                 break;
17951             case '0': case '1': case '2': case '3': case '4':
17952             case '5': case '6': case '7':
17953                 {
17954                     /* Take 1-3 octal digits */
17955                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17956                               | PERL_SCAN_NOTIFY_ILLDIGIT;
17957                     numlen = (strict) ? 4 : 3;
17958                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17959                     RExC_parse += numlen;
17960                     if (numlen != 3) {
17961                         if (strict) {
17962                             RExC_parse += (UTF)
17963                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17964                                           : 1;
17965                             vFAIL("Need exactly 3 octal digits");
17966                         }
17967                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17968                                  && RExC_parse < RExC_end
17969                                  && isDIGIT(*RExC_parse)
17970                                  && ckWARN(WARN_REGEXP))
17971                         {
17972                             reg_warn_non_literal_string(
17973                                  RExC_parse + 1,
17974                                  form_alien_digit_msg(8, numlen, RExC_parse,
17975                                                         RExC_end, UTF, FALSE));
17976                         }
17977                     }
17978                     if (value < 256) {
17979                         non_portable_endpoint++;
17980                     }
17981                     break;
17982                 }
17983             default:
17984                 /* Allow \_ to not give an error */
17985                 if (isWORDCHAR(value) && value != '_') {
17986                     if (strict) {
17987                         vFAIL2("Unrecognized escape \\%c in character class",
17988                                (int)value);
17989                     }
17990                     else {
17991                         ckWARN2reg(RExC_parse,
17992                             "Unrecognized escape \\%c in character class passed through",
17993                             (int)value);
17994                     }
17995                 }
17996                 break;
17997             }   /* End of switch on char following backslash */
17998         } /* end of handling backslash escape sequences */
17999
18000         /* Here, we have the current token in 'value' */
18001
18002         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18003             U8 classnum;
18004
18005             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
18006              * literal, as is the character that began the false range, i.e.
18007              * the 'a' in the examples */
18008             if (range) {
18009                 const int w = (RExC_parse >= rangebegin)
18010                                 ? RExC_parse - rangebegin
18011                                 : 0;
18012                 if (strict) {
18013                     vFAIL2utf8f(
18014                         "False [] range \"%" UTF8f "\"",
18015                         UTF8fARG(UTF, w, rangebegin));
18016                 }
18017                 else {
18018                     ckWARN2reg(RExC_parse,
18019                         "False [] range \"%" UTF8f "\"",
18020                         UTF8fARG(UTF, w, rangebegin));
18021                     cp_list = add_cp_to_invlist(cp_list, '-');
18022                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18023                                                             prevvalue);
18024                 }
18025
18026                 range = 0; /* this was not a true range */
18027                 element_count += 2; /* So counts for three values */
18028             }
18029
18030             classnum = namedclass_to_classnum(namedclass);
18031
18032             if (LOC && namedclass < ANYOF_POSIXL_MAX
18033 #ifndef HAS_ISASCII
18034                 && classnum != _CC_ASCII
18035 #endif
18036             ) {
18037                 SV* scratch_list = NULL;
18038
18039                 /* What the Posix classes (like \w, [:space:]) match isn't
18040                  * generally knowable under locale until actual match time.  A
18041                  * special node is used for these which has extra space for a
18042                  * bitmap, with a bit reserved for each named class that is to
18043                  * be matched against.  (This isn't needed for \p{} and
18044                  * pseudo-classes, as they are not affected by locale, and
18045                  * hence are dealt with separately.)  However, if a named class
18046                  * and its complement are both present, then it matches
18047                  * everything, and there is no runtime dependency.  Odd numbers
18048                  * are the complements of the next lower number, so xor works.
18049                  * (Note that something like [\w\D] should match everything,
18050                  * because \d should be a proper subset of \w.  But rather than
18051                  * trust that the locale is well behaved, we leave this to
18052                  * runtime to sort out) */
18053                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18054                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18055                     POSIXL_ZERO(posixl);
18056                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18057                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18058                     continue;   /* We could ignore the rest of the class, but
18059                                    best to parse it for any errors */
18060                 }
18061                 else { /* Here, isn't the complement of any already parsed
18062                           class */
18063                     POSIXL_SET(posixl, namedclass);
18064                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18065                     anyof_flags |= ANYOF_MATCHES_POSIXL;
18066
18067                     /* The above-Latin1 characters are not subject to locale
18068                      * rules.  Just add them to the unconditionally-matched
18069                      * list */
18070
18071                     /* Get the list of the above-Latin1 code points this
18072                      * matches */
18073                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18074                                             PL_XPosix_ptrs[classnum],
18075
18076                                             /* Odd numbers are complements,
18077                                              * like NDIGIT, NASCII, ... */
18078                                             namedclass % 2 != 0,
18079                                             &scratch_list);
18080                     /* Checking if 'cp_list' is NULL first saves an extra
18081                      * clone.  Its reference count will be decremented at the
18082                      * next union, etc, or if this is the only instance, at the
18083                      * end of the routine */
18084                     if (! cp_list) {
18085                         cp_list = scratch_list;
18086                     }
18087                     else {
18088                         _invlist_union(cp_list, scratch_list, &cp_list);
18089                         SvREFCNT_dec_NN(scratch_list);
18090                     }
18091                     continue;   /* Go get next character */
18092                 }
18093             }
18094             else {
18095
18096                 /* Here, is not /l, or is a POSIX class for which /l doesn't
18097                  * matter (or is a Unicode property, which is skipped here). */
18098                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
18099                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18100
18101                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
18102                          * nor /l make a difference in what these match,
18103                          * therefore we just add what they match to cp_list. */
18104                         if (classnum != _CC_VERTSPACE) {
18105                             assert(   namedclass == ANYOF_HORIZWS
18106                                    || namedclass == ANYOF_NHORIZWS);
18107
18108                             /* It turns out that \h is just a synonym for
18109                              * XPosixBlank */
18110                             classnum = _CC_BLANK;
18111                         }
18112
18113                         _invlist_union_maybe_complement_2nd(
18114                                 cp_list,
18115                                 PL_XPosix_ptrs[classnum],
18116                                 namedclass % 2 != 0,    /* Complement if odd
18117                                                           (NHORIZWS, NVERTWS)
18118                                                         */
18119                                 &cp_list);
18120                     }
18121                 }
18122                 else if (   AT_LEAST_UNI_SEMANTICS
18123                          || classnum == _CC_ASCII
18124                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
18125                                                    || classnum == _CC_XDIGIT)))
18126                 {
18127                     /* We usually have to worry about /d affecting what POSIX
18128                      * classes match, with special code needed because we won't
18129                      * know until runtime what all matches.  But there is no
18130                      * extra work needed under /u and /a; and [:ascii:] is
18131                      * unaffected by /d; and :digit: and :xdigit: don't have
18132                      * runtime differences under /d.  So we can special case
18133                      * these, and avoid some extra work below, and at runtime.
18134                      * */
18135                     _invlist_union_maybe_complement_2nd(
18136                                                      simple_posixes,
18137                                                       ((AT_LEAST_ASCII_RESTRICTED)
18138                                                        ? PL_Posix_ptrs[classnum]
18139                                                        : PL_XPosix_ptrs[classnum]),
18140                                                      namedclass % 2 != 0,
18141                                                      &simple_posixes);
18142                 }
18143                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
18144                            complement and use nposixes */
18145                     SV** posixes_ptr = namedclass % 2 == 0
18146                                        ? &posixes
18147                                        : &nposixes;
18148                     _invlist_union_maybe_complement_2nd(
18149                                                      *posixes_ptr,
18150                                                      PL_XPosix_ptrs[classnum],
18151                                                      namedclass % 2 != 0,
18152                                                      posixes_ptr);
18153                 }
18154             }
18155         } /* end of namedclass \blah */
18156
18157         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18158
18159         /* If 'range' is set, 'value' is the ending of a range--check its
18160          * validity.  (If value isn't a single code point in the case of a
18161          * range, we should have figured that out above in the code that
18162          * catches false ranges).  Later, we will handle each individual code
18163          * point in the range.  If 'range' isn't set, this could be the
18164          * beginning of a range, so check for that by looking ahead to see if
18165          * the next real character to be processed is the range indicator--the
18166          * minus sign */
18167
18168         if (range) {
18169 #ifdef EBCDIC
18170             /* For unicode ranges, we have to test that the Unicode as opposed
18171              * to the native values are not decreasing.  (Above 255, there is
18172              * no difference between native and Unicode) */
18173             if (unicode_range && prevvalue < 255 && value < 255) {
18174                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18175                     goto backwards_range;
18176                 }
18177             }
18178             else
18179 #endif
18180             if (prevvalue > value) /* b-a */ {
18181                 int w;
18182 #ifdef EBCDIC
18183               backwards_range:
18184 #endif
18185                 w = RExC_parse - rangebegin;
18186                 vFAIL2utf8f(
18187                     "Invalid [] range \"%" UTF8f "\"",
18188                     UTF8fARG(UTF, w, rangebegin));
18189                 NOT_REACHED; /* NOTREACHED */
18190             }
18191         }
18192         else {
18193             prevvalue = value; /* save the beginning of the potential range */
18194             if (! stop_at_1     /* Can't be a range if parsing just one thing */
18195                 && *RExC_parse == '-')
18196             {
18197                 char* next_char_ptr = RExC_parse + 1;
18198
18199                 /* Get the next real char after the '-' */
18200                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18201
18202                 /* If the '-' is at the end of the class (just before the ']',
18203                  * it is a literal minus; otherwise it is a range */
18204                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18205                     RExC_parse = next_char_ptr;
18206
18207                     /* a bad range like \w-, [:word:]- ? */
18208                     if (namedclass > OOB_NAMEDCLASS) {
18209                         if (strict || ckWARN(WARN_REGEXP)) {
18210                             const int w = RExC_parse >= rangebegin
18211                                           ?  RExC_parse - rangebegin
18212                                           : 0;
18213                             if (strict) {
18214                                 vFAIL4("False [] range \"%*.*s\"",
18215                                     w, w, rangebegin);
18216                             }
18217                             else {
18218                                 vWARN4(RExC_parse,
18219                                     "False [] range \"%*.*s\"",
18220                                     w, w, rangebegin);
18221                             }
18222                         }
18223                         cp_list = add_cp_to_invlist(cp_list, '-');
18224                         element_count++;
18225                     } else
18226                         range = 1;      /* yeah, it's a range! */
18227                     continue;   /* but do it the next time */
18228                 }
18229             }
18230         }
18231
18232         if (namedclass > OOB_NAMEDCLASS) {
18233             continue;
18234         }
18235
18236         /* Here, we have a single value this time through the loop, and
18237          * <prevvalue> is the beginning of the range, if any; or <value> if
18238          * not. */
18239
18240         /* non-Latin1 code point implies unicode semantics. */
18241         if (value > 255) {
18242             if (value > MAX_LEGAL_CP && (   value != UV_MAX
18243                                          || prevvalue > MAX_LEGAL_CP))
18244             {
18245                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18246             }
18247             REQUIRE_UNI_RULES(flagp, 0);
18248             if (  ! silence_non_portable
18249                 &&  UNICODE_IS_PERL_EXTENDED(value)
18250                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
18251             {
18252                 ckWARN2_non_literal_string(RExC_parse,
18253                                            packWARN(WARN_PORTABLE),
18254                                            PL_extended_cp_format,
18255                                            value);
18256             }
18257         }
18258
18259         /* Ready to process either the single value, or the completed range.
18260          * For single-valued non-inverted ranges, we consider the possibility
18261          * of multi-char folds.  (We made a conscious decision to not do this
18262          * for the other cases because it can often lead to non-intuitive
18263          * results.  For example, you have the peculiar case that:
18264          *  "s s" =~ /^[^\xDF]+$/i => Y
18265          *  "ss"  =~ /^[^\xDF]+$/i => N
18266          *
18267          * See [perl #89750] */
18268         if (FOLD && allow_mutiple_chars && value == prevvalue) {
18269             if (    value == LATIN_SMALL_LETTER_SHARP_S
18270                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18271                                                         value)))
18272             {
18273                 /* Here <value> is indeed a multi-char fold.  Get what it is */
18274
18275                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18276                 STRLEN foldlen;
18277
18278                 UV folded = _to_uni_fold_flags(
18279                                 value,
18280                                 foldbuf,
18281                                 &foldlen,
18282                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18283                                                    ? FOLD_FLAGS_NOMIX_ASCII
18284                                                    : 0)
18285                                 );
18286
18287                 /* Here, <folded> should be the first character of the
18288                  * multi-char fold of <value>, with <foldbuf> containing the
18289                  * whole thing.  But, if this fold is not allowed (because of
18290                  * the flags), <fold> will be the same as <value>, and should
18291                  * be processed like any other character, so skip the special
18292                  * handling */
18293                 if (folded != value) {
18294
18295                     /* Skip if we are recursed, currently parsing the class
18296                      * again.  Otherwise add this character to the list of
18297                      * multi-char folds. */
18298                     if (! RExC_in_multi_char_class) {
18299                         STRLEN cp_count = utf8_length(foldbuf,
18300                                                       foldbuf + foldlen);
18301                         SV* multi_fold = sv_2mortal(newSVpvs(""));
18302
18303                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18304
18305                         multi_char_matches
18306                                         = add_multi_match(multi_char_matches,
18307                                                           multi_fold,
18308                                                           cp_count);
18309
18310                     }
18311
18312                     /* This element should not be processed further in this
18313                      * class */
18314                     element_count--;
18315                     value = save_value;
18316                     prevvalue = save_prevvalue;
18317                     continue;
18318                 }
18319             }
18320         }
18321
18322         if (strict && ckWARN(WARN_REGEXP)) {
18323             if (range) {
18324
18325                 /* If the range starts above 255, everything is portable and
18326                  * likely to be so for any forseeable character set, so don't
18327                  * warn. */
18328                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18329                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18330                 }
18331                 else if (prevvalue != value) {
18332
18333                     /* Under strict, ranges that stop and/or end in an ASCII
18334                      * printable should have each end point be a portable value
18335                      * for it (preferably like 'A', but we don't warn if it is
18336                      * a (portable) Unicode name or code point), and the range
18337                      * must be all digits or all letters of the same case.
18338                      * Otherwise, the range is non-portable and unclear as to
18339                      * what it contains */
18340                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18341                         && (          non_portable_endpoint
18342                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18343                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18344                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18345                     ))) {
18346                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18347                                           " be some subset of \"0-9\","
18348                                           " \"A-Z\", or \"a-z\"");
18349                     }
18350                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18351                         SSize_t index_start;
18352                         SSize_t index_final;
18353
18354                         /* But the nature of Unicode and languages mean we
18355                          * can't do the same checks for above-ASCII ranges,
18356                          * except in the case of digit ones.  These should
18357                          * contain only digits from the same group of 10.  The
18358                          * ASCII case is handled just above.  Hence here, the
18359                          * range could be a range of digits.  First some
18360                          * unlikely special cases.  Grandfather in that a range
18361                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18362                          * if its starting value is one of the 10 digits prior
18363                          * to it.  This is because it is an alternate way of
18364                          * writing 19D1, and some people may expect it to be in
18365                          * that group.  But it is bad, because it won't give
18366                          * the expected results.  In Unicode 5.2 it was
18367                          * considered to be in that group (of 11, hence), but
18368                          * this was fixed in the next version */
18369
18370                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18371                             goto warn_bad_digit_range;
18372                         }
18373                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18374                                           &&     value <= 0x1D7FF))
18375                         {
18376                             /* This is the only other case currently in Unicode
18377                              * where the algorithm below fails.  The code
18378                              * points just above are the end points of a single
18379                              * range containing only decimal digits.  It is 5
18380                              * different series of 0-9.  All other ranges of
18381                              * digits currently in Unicode are just a single
18382                              * series.  (And mktables will notify us if a later
18383                              * Unicode version breaks this.)
18384                              *
18385                              * If the range being checked is at most 9 long,
18386                              * and the digit values represented are in
18387                              * numerical order, they are from the same series.
18388                              * */
18389                             if (         value - prevvalue > 9
18390                                 ||    (((    value - 0x1D7CE) % 10)
18391                                      <= (prevvalue - 0x1D7CE) % 10))
18392                             {
18393                                 goto warn_bad_digit_range;
18394                             }
18395                         }
18396                         else {
18397
18398                             /* For all other ranges of digits in Unicode, the
18399                              * algorithm is just to check if both end points
18400                              * are in the same series, which is the same range.
18401                              * */
18402                             index_start = _invlist_search(
18403                                                     PL_XPosix_ptrs[_CC_DIGIT],
18404                                                     prevvalue);
18405
18406                             /* Warn if the range starts and ends with a digit,
18407                              * and they are not in the same group of 10. */
18408                             if (   index_start >= 0
18409                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18410                                 && (index_final =
18411                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18412                                                     value)) != index_start
18413                                 && index_final >= 0
18414                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18415                             {
18416                               warn_bad_digit_range:
18417                                 vWARN(RExC_parse, "Ranges of digits should be"
18418                                                   " from the same group of"
18419                                                   " 10");
18420                             }
18421                         }
18422                     }
18423                 }
18424             }
18425             if ((! range || prevvalue == value) && non_portable_endpoint) {
18426                 if (isPRINT_A(value)) {
18427                     char literal[3];
18428                     unsigned d = 0;
18429                     if (isBACKSLASHED_PUNCT(value)) {
18430                         literal[d++] = '\\';
18431                     }
18432                     literal[d++] = (char) value;
18433                     literal[d++] = '\0';
18434
18435                     vWARN4(RExC_parse,
18436                            "\"%.*s\" is more clearly written simply as \"%s\"",
18437                            (int) (RExC_parse - rangebegin),
18438                            rangebegin,
18439                            literal
18440                         );
18441                 }
18442                 else if (isMNEMONIC_CNTRL(value)) {
18443                     vWARN4(RExC_parse,
18444                            "\"%.*s\" is more clearly written simply as \"%s\"",
18445                            (int) (RExC_parse - rangebegin),
18446                            rangebegin,
18447                            cntrl_to_mnemonic((U8) value)
18448                         );
18449                 }
18450             }
18451         }
18452
18453         /* Deal with this element of the class */
18454
18455 #ifndef EBCDIC
18456         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18457                                                     prevvalue, value);
18458 #else
18459         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18460          * that don't require special handling, we can just add the range like
18461          * we do for ASCII platforms */
18462         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18463             || ! (prevvalue < 256
18464                     && (unicode_range
18465                         || (! non_portable_endpoint
18466                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18467                                 || (isUPPER_A(prevvalue)
18468                                     && isUPPER_A(value)))))))
18469         {
18470             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18471                                                         prevvalue, value);
18472         }
18473         else {
18474             /* Here, requires special handling.  This can be because it is a
18475              * range whose code points are considered to be Unicode, and so
18476              * must be individually translated into native, or because its a
18477              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18478              * EBCDIC, but we have defined them to include only the "expected"
18479              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18480              * the same in native and Unicode, so can be added as a range */
18481             U8 start = NATIVE_TO_LATIN1(prevvalue);
18482             unsigned j;
18483             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18484             for (j = start; j <= end; j++) {
18485                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18486             }
18487             if (value > 255) {
18488                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18489                                                             256, value);
18490             }
18491         }
18492 #endif
18493
18494         range = 0; /* this range (if it was one) is done now */
18495     } /* End of loop through all the text within the brackets */
18496
18497     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18498         output_posix_warnings(pRExC_state, posix_warnings);
18499     }
18500
18501     /* If anything in the class expands to more than one character, we have to
18502      * deal with them by building up a substitute parse string, and recursively
18503      * calling reg() on it, instead of proceeding */
18504     if (multi_char_matches) {
18505         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18506         I32 cp_count;
18507         STRLEN len;
18508         char *save_end = RExC_end;
18509         char *save_parse = RExC_parse;
18510         char *save_start = RExC_start;
18511         Size_t constructed_prefix_len = 0; /* This gives the length of the
18512                                               constructed portion of the
18513                                               substitute parse. */
18514         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18515                                        a "|" */
18516         I32 reg_flags;
18517
18518         assert(! invert);
18519         /* Only one level of recursion allowed */
18520         assert(RExC_copy_start_in_constructed == RExC_precomp);
18521
18522 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18523            because too confusing */
18524         if (invert) {
18525             sv_catpvs(substitute_parse, "(?:");
18526         }
18527 #endif
18528
18529         /* Look at the longest strings first */
18530         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18531                         cp_count > 0;
18532                         cp_count--)
18533         {
18534
18535             if (av_exists(multi_char_matches, cp_count)) {
18536                 AV** this_array_ptr;
18537                 SV* this_sequence;
18538
18539                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18540                                                  cp_count, FALSE);
18541                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18542                                                                 &PL_sv_undef)
18543                 {
18544                     if (! first_time) {
18545                         sv_catpvs(substitute_parse, "|");
18546                     }
18547                     first_time = FALSE;
18548
18549                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18550                 }
18551             }
18552         }
18553
18554         /* If the character class contains anything else besides these
18555          * multi-character strings, have to include it in recursive parsing */
18556         if (element_count) {
18557             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18558
18559             sv_catpvs(substitute_parse, "|");
18560             if (has_l_bracket) {    /* Add an [ if the original had one */
18561                 sv_catpvs(substitute_parse, "[");
18562             }
18563             constructed_prefix_len = SvCUR(substitute_parse);
18564             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18565
18566             /* Put in a closing ']' to match any opening one, but not if going
18567              * off the end, as otherwise we are adding something that really
18568              * isn't there */
18569             if (has_l_bracket && RExC_parse < RExC_end) {
18570                 sv_catpvs(substitute_parse, "]");
18571             }
18572         }
18573
18574         sv_catpvs(substitute_parse, ")");
18575 #if 0
18576         if (invert) {
18577             /* This is a way to get the parse to skip forward a whole named
18578              * sequence instead of matching the 2nd character when it fails the
18579              * first */
18580             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18581         }
18582 #endif
18583
18584         /* Set up the data structure so that any errors will be properly
18585          * reported.  See the comments at the definition of
18586          * REPORT_LOCATION_ARGS for details */
18587         RExC_copy_start_in_input = (char *) orig_parse;
18588         RExC_start = RExC_parse = SvPV(substitute_parse, len);
18589         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18590         RExC_end = RExC_parse + len;
18591         RExC_in_multi_char_class = 1;
18592
18593         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18594
18595         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18596
18597         /* And restore so can parse the rest of the pattern */
18598         RExC_parse = save_parse;
18599         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18600         RExC_end = save_end;
18601         RExC_in_multi_char_class = 0;
18602         SvREFCNT_dec_NN(multi_char_matches);
18603         return ret;
18604     }
18605
18606     /* If folding, we calculate all characters that could fold to or from the
18607      * ones already on the list */
18608     if (cp_foldable_list) {
18609         if (FOLD) {
18610             UV start, end;      /* End points of code point ranges */
18611
18612             SV* fold_intersection = NULL;
18613             SV** use_list;
18614
18615             /* Our calculated list will be for Unicode rules.  For locale
18616              * matching, we have to keep a separate list that is consulted at
18617              * runtime only when the locale indicates Unicode rules (and we
18618              * don't include potential matches in the ASCII/Latin1 range, as
18619              * any code point could fold to any other, based on the run-time
18620              * locale).   For non-locale, we just use the general list */
18621             if (LOC) {
18622                 use_list = &only_utf8_locale_list;
18623             }
18624             else {
18625                 use_list = &cp_list;
18626             }
18627
18628             /* Only the characters in this class that participate in folds need
18629              * be checked.  Get the intersection of this class and all the
18630              * possible characters that are foldable.  This can quickly narrow
18631              * down a large class */
18632             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18633                                   &fold_intersection);
18634
18635             /* Now look at the foldable characters in this class individually */
18636             invlist_iterinit(fold_intersection);
18637             while (invlist_iternext(fold_intersection, &start, &end)) {
18638                 UV j;
18639                 UV folded;
18640
18641                 /* Look at every character in the range */
18642                 for (j = start; j <= end; j++) {
18643                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18644                     STRLEN foldlen;
18645                     unsigned int k;
18646                     Size_t folds_count;
18647                     U32 first_fold;
18648                     const U32 * remaining_folds;
18649
18650                     if (j < 256) {
18651
18652                         /* Under /l, we don't know what code points below 256
18653                          * fold to, except we do know the MICRO SIGN folds to
18654                          * an above-255 character if the locale is UTF-8, so we
18655                          * add it to the special list (in *use_list)  Otherwise
18656                          * we know now what things can match, though some folds
18657                          * are valid under /d only if the target is UTF-8.
18658                          * Those go in a separate list */
18659                         if (      IS_IN_SOME_FOLD_L1(j)
18660                             && ! (LOC && j != MICRO_SIGN))
18661                         {
18662
18663                             /* ASCII is always matched; non-ASCII is matched
18664                              * only under Unicode rules (which could happen
18665                              * under /l if the locale is a UTF-8 one */
18666                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18667                                 *use_list = add_cp_to_invlist(*use_list,
18668                                                             PL_fold_latin1[j]);
18669                             }
18670                             else if (j != PL_fold_latin1[j]) {
18671                                 upper_latin1_only_utf8_matches
18672                                         = add_cp_to_invlist(
18673                                                 upper_latin1_only_utf8_matches,
18674                                                 PL_fold_latin1[j]);
18675                             }
18676                         }
18677
18678                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18679                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18680                         {
18681                             add_above_Latin1_folds(pRExC_state,
18682                                                    (U8) j,
18683                                                    use_list);
18684                         }
18685                         continue;
18686                     }
18687
18688                     /* Here is an above Latin1 character.  We don't have the
18689                      * rules hard-coded for it.  First, get its fold.  This is
18690                      * the simple fold, as the multi-character folds have been
18691                      * handled earlier and separated out */
18692                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18693                                                         (ASCII_FOLD_RESTRICTED)
18694                                                         ? FOLD_FLAGS_NOMIX_ASCII
18695                                                         : 0);
18696
18697                     /* Single character fold of above Latin1.  Add everything
18698                      * in its fold closure to the list that this node should
18699                      * match. */
18700                     folds_count = _inverse_folds(folded, &first_fold,
18701                                                     &remaining_folds);
18702                     for (k = 0; k <= folds_count; k++) {
18703                         UV c = (k == 0)     /* First time through use itself */
18704                                 ? folded
18705                                 : (k == 1)  /* 2nd time use, the first fold */
18706                                    ? first_fold
18707
18708                                      /* Then the remaining ones */
18709                                    : remaining_folds[k-2];
18710
18711                         /* /aa doesn't allow folds between ASCII and non- */
18712                         if ((   ASCII_FOLD_RESTRICTED
18713                             && (isASCII(c) != isASCII(j))))
18714                         {
18715                             continue;
18716                         }
18717
18718                         /* Folds under /l which cross the 255/256 boundary are
18719                          * added to a separate list.  (These are valid only
18720                          * when the locale is UTF-8.) */
18721                         if (c < 256 && LOC) {
18722                             *use_list = add_cp_to_invlist(*use_list, c);
18723                             continue;
18724                         }
18725
18726                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18727                         {
18728                             cp_list = add_cp_to_invlist(cp_list, c);
18729                         }
18730                         else {
18731                             /* Similarly folds involving non-ascii Latin1
18732                              * characters under /d are added to their list */
18733                             upper_latin1_only_utf8_matches
18734                                     = add_cp_to_invlist(
18735                                                 upper_latin1_only_utf8_matches,
18736                                                 c);
18737                         }
18738                     }
18739                 }
18740             }
18741             SvREFCNT_dec_NN(fold_intersection);
18742         }
18743
18744         /* Now that we have finished adding all the folds, there is no reason
18745          * to keep the foldable list separate */
18746         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18747         SvREFCNT_dec_NN(cp_foldable_list);
18748     }
18749
18750     /* And combine the result (if any) with any inversion lists from posix
18751      * classes.  The lists are kept separate up to now because we don't want to
18752      * fold the classes */
18753     if (simple_posixes) {   /* These are the classes known to be unaffected by
18754                                /a, /aa, and /d */
18755         if (cp_list) {
18756             _invlist_union(cp_list, simple_posixes, &cp_list);
18757             SvREFCNT_dec_NN(simple_posixes);
18758         }
18759         else {
18760             cp_list = simple_posixes;
18761         }
18762     }
18763     if (posixes || nposixes) {
18764         if (! DEPENDS_SEMANTICS) {
18765
18766             /* For everything but /d, we can just add the current 'posixes' and
18767              * 'nposixes' to the main list */
18768             if (posixes) {
18769                 if (cp_list) {
18770                     _invlist_union(cp_list, posixes, &cp_list);
18771                     SvREFCNT_dec_NN(posixes);
18772                 }
18773                 else {
18774                     cp_list = posixes;
18775                 }
18776             }
18777             if (nposixes) {
18778                 if (cp_list) {
18779                     _invlist_union(cp_list, nposixes, &cp_list);
18780                     SvREFCNT_dec_NN(nposixes);
18781                 }
18782                 else {
18783                     cp_list = nposixes;
18784                 }
18785             }
18786         }
18787         else {
18788             /* Under /d, things like \w match upper Latin1 characters only if
18789              * the target string is in UTF-8.  But things like \W match all the
18790              * upper Latin1 characters if the target string is not in UTF-8.
18791              *
18792              * Handle the case with something like \W separately */
18793             if (nposixes) {
18794                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18795
18796                 /* A complemented posix class matches all upper Latin1
18797                  * characters if not in UTF-8.  And it matches just certain
18798                  * ones when in UTF-8.  That means those certain ones are
18799                  * matched regardless, so can just be added to the
18800                  * unconditional list */
18801                 if (cp_list) {
18802                     _invlist_union(cp_list, nposixes, &cp_list);
18803                     SvREFCNT_dec_NN(nposixes);
18804                     nposixes = NULL;
18805                 }
18806                 else {
18807                     cp_list = nposixes;
18808                 }
18809
18810                 /* Likewise for 'posixes' */
18811                 _invlist_union(posixes, cp_list, &cp_list);
18812                 SvREFCNT_dec(posixes);
18813
18814                 /* Likewise for anything else in the range that matched only
18815                  * under UTF-8 */
18816                 if (upper_latin1_only_utf8_matches) {
18817                     _invlist_union(cp_list,
18818                                    upper_latin1_only_utf8_matches,
18819                                    &cp_list);
18820                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18821                     upper_latin1_only_utf8_matches = NULL;
18822                 }
18823
18824                 /* If we don't match all the upper Latin1 characters regardless
18825                  * of UTF-8ness, we have to set a flag to match the rest when
18826                  * not in UTF-8 */
18827                 _invlist_subtract(only_non_utf8_list, cp_list,
18828                                   &only_non_utf8_list);
18829                 if (_invlist_len(only_non_utf8_list) != 0) {
18830                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18831                 }
18832                 SvREFCNT_dec_NN(only_non_utf8_list);
18833             }
18834             else {
18835                 /* Here there were no complemented posix classes.  That means
18836                  * the upper Latin1 characters in 'posixes' match only when the
18837                  * target string is in UTF-8.  So we have to add them to the
18838                  * list of those types of code points, while adding the
18839                  * remainder to the unconditional list.
18840                  *
18841                  * First calculate what they are */
18842                 SV* nonascii_but_latin1_properties = NULL;
18843                 _invlist_intersection(posixes, PL_UpperLatin1,
18844                                       &nonascii_but_latin1_properties);
18845
18846                 /* And add them to the final list of such characters. */
18847                 _invlist_union(upper_latin1_only_utf8_matches,
18848                                nonascii_but_latin1_properties,
18849                                &upper_latin1_only_utf8_matches);
18850
18851                 /* Remove them from what now becomes the unconditional list */
18852                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18853                                   &posixes);
18854
18855                 /* And add those unconditional ones to the final list */
18856                 if (cp_list) {
18857                     _invlist_union(cp_list, posixes, &cp_list);
18858                     SvREFCNT_dec_NN(posixes);
18859                     posixes = NULL;
18860                 }
18861                 else {
18862                     cp_list = posixes;
18863                 }
18864
18865                 SvREFCNT_dec(nonascii_but_latin1_properties);
18866
18867                 /* Get rid of any characters from the conditional list that we
18868                  * now know are matched unconditionally, which may make that
18869                  * list empty */
18870                 _invlist_subtract(upper_latin1_only_utf8_matches,
18871                                   cp_list,
18872                                   &upper_latin1_only_utf8_matches);
18873                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18874                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18875                     upper_latin1_only_utf8_matches = NULL;
18876                 }
18877             }
18878         }
18879     }
18880
18881     /* And combine the result (if any) with any inversion list from properties.
18882      * The lists are kept separate up to now so that we can distinguish the two
18883      * in regards to matching above-Unicode.  A run-time warning is generated
18884      * if a Unicode property is matched against a non-Unicode code point. But,
18885      * we allow user-defined properties to match anything, without any warning,
18886      * and we also suppress the warning if there is a portion of the character
18887      * class that isn't a Unicode property, and which matches above Unicode, \W
18888      * or [\x{110000}] for example.
18889      * (Note that in this case, unlike the Posix one above, there is no
18890      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18891      * forces Unicode semantics */
18892     if (properties) {
18893         if (cp_list) {
18894
18895             /* If it matters to the final outcome, see if a non-property
18896              * component of the class matches above Unicode.  If so, the
18897              * warning gets suppressed.  This is true even if just a single
18898              * such code point is specified, as, though not strictly correct if
18899              * another such code point is matched against, the fact that they
18900              * are using above-Unicode code points indicates they should know
18901              * the issues involved */
18902             if (warn_super) {
18903                 warn_super = ! (invert
18904                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18905             }
18906
18907             _invlist_union(properties, cp_list, &cp_list);
18908             SvREFCNT_dec_NN(properties);
18909         }
18910         else {
18911             cp_list = properties;
18912         }
18913
18914         if (warn_super) {
18915             anyof_flags
18916              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18917
18918             /* Because an ANYOF node is the only one that warns, this node
18919              * can't be optimized into something else */
18920             optimizable = FALSE;
18921         }
18922     }
18923
18924     /* Here, we have calculated what code points should be in the character
18925      * class.
18926      *
18927      * Now we can see about various optimizations.  Fold calculation (which we
18928      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18929      * would invert to include K, which under /i would match k, which it
18930      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18931      * folded until runtime */
18932
18933     /* If we didn't do folding, it's because some information isn't available
18934      * until runtime; set the run-time fold flag for these  We know to set the
18935      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18936      * at least one 0-255 range code point */
18937     if (LOC && FOLD) {
18938
18939         /* Some things on the list might be unconditionally included because of
18940          * other components.  Remove them, and clean up the list if it goes to
18941          * 0 elements */
18942         if (only_utf8_locale_list && cp_list) {
18943             _invlist_subtract(only_utf8_locale_list, cp_list,
18944                               &only_utf8_locale_list);
18945
18946             if (_invlist_len(only_utf8_locale_list) == 0) {
18947                 SvREFCNT_dec_NN(only_utf8_locale_list);
18948                 only_utf8_locale_list = NULL;
18949             }
18950         }
18951         if (    only_utf8_locale_list
18952             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18953                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18954         {
18955             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18956             anyof_flags
18957                  |= ANYOFL_FOLD
18958                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18959         }
18960         else if (cp_list && invlist_lowest(cp_list) < 256) {
18961             /* If nothing is below 256, has no locale dependency; otherwise it
18962              * does */
18963             anyof_flags |= ANYOFL_FOLD;
18964             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18965         }
18966     }
18967     else if (   DEPENDS_SEMANTICS
18968              && (    upper_latin1_only_utf8_matches
18969                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18970     {
18971         RExC_seen_d_op = TRUE;
18972         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18973     }
18974
18975     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18976      * compile time. */
18977     if (     cp_list
18978         &&   invert
18979         && ! has_runtime_dependency)
18980     {
18981         _invlist_invert(cp_list);
18982
18983         /* Clear the invert flag since have just done it here */
18984         invert = FALSE;
18985     }
18986
18987     /* All possible optimizations below still have these characteristics.
18988      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18989      * routine) */
18990     *flagp |= HASWIDTH|SIMPLE;
18991
18992     if (ret_invlist) {
18993         *ret_invlist = cp_list;
18994
18995         return (cp_list) ? RExC_emit : 0;
18996     }
18997
18998     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18999         RExC_contains_locale = 1;
19000     }
19001
19002     /* Some character classes are equivalent to other nodes.  Such nodes take
19003      * up less room, and some nodes require fewer operations to execute, than
19004      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
19005      * improve efficiency. */
19006
19007     if (optimizable) {
19008         PERL_UINT_FAST8_T i;
19009         UV partial_cp_count = 0;
19010         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19011         UV   end[MAX_FOLD_FROMS+1] = { 0 };
19012         bool single_range = FALSE;
19013
19014         if (cp_list) { /* Count the code points in enough ranges that we would
19015                           see all the ones possible in any fold in this version
19016                           of Unicode */
19017
19018             invlist_iterinit(cp_list);
19019             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19020                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19021                     break;
19022                 }
19023                 partial_cp_count += end[i] - start[i] + 1;
19024             }
19025
19026             if (i == 1) {
19027                 single_range = TRUE;
19028             }
19029             invlist_iterfinish(cp_list);
19030         }
19031
19032         /* If we know at compile time that this matches every possible code
19033          * point, any run-time dependencies don't matter */
19034         if (start[0] == 0 && end[0] == UV_MAX) {
19035             if (invert) {
19036                 ret = reganode(pRExC_state, OPFAIL, 0);
19037             }
19038             else {
19039                 ret = reg_node(pRExC_state, SANY);
19040                 MARK_NAUGHTY(1);
19041             }
19042             goto not_anyof;
19043         }
19044
19045         /* Similarly, for /l posix classes, if both a class and its
19046          * complement match, any run-time dependencies don't matter */
19047         if (posixl) {
19048             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19049                                                         namedclass += 2)
19050             {
19051                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
19052                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19053                 {
19054                     if (invert) {
19055                         ret = reganode(pRExC_state, OPFAIL, 0);
19056                     }
19057                     else {
19058                         ret = reg_node(pRExC_state, SANY);
19059                         MARK_NAUGHTY(1);
19060                     }
19061                     goto not_anyof;
19062                 }
19063             }
19064
19065             /* For well-behaved locales, some classes are subsets of others,
19066              * so complementing the subset and including the non-complemented
19067              * superset should match everything, like [\D[:alnum:]], and
19068              * [[:^alpha:][:alnum:]], but some implementations of locales are
19069              * buggy, and khw thinks its a bad idea to have optimization change
19070              * behavior, even if it avoids an OS bug in a given case */
19071
19072 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19073
19074             /* If is a single posix /l class, can optimize to just that op.
19075              * Such a node will not match anything in the Latin1 range, as that
19076              * is not determinable until runtime, but will match whatever the
19077              * class does outside that range.  (Note that some classes won't
19078              * match anything outside the range, like [:ascii:]) */
19079             if (    isSINGLE_BIT_SET(posixl)
19080                 && (partial_cp_count == 0 || start[0] > 255))
19081             {
19082                 U8 classnum;
19083                 SV * class_above_latin1 = NULL;
19084                 bool already_inverted;
19085                 bool are_equivalent;
19086
19087                 /* Compute which bit is set, which is the same thing as, e.g.,
19088                  * ANYOF_CNTRL.  From
19089                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19090                  * */
19091                 static const int MultiplyDeBruijnBitPosition2[32] =
19092                     {
19093                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19094                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19095                     };
19096
19097                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19098                                                           * 0x077CB531U) >> 27];
19099                 classnum = namedclass_to_classnum(namedclass);
19100
19101                 /* The named classes are such that the inverted number is one
19102                  * larger than the non-inverted one */
19103                 already_inverted = namedclass
19104                                  - classnum_to_namedclass(classnum);
19105
19106                 /* Create an inversion list of the official property, inverted
19107                  * if the constructed node list is inverted, and restricted to
19108                  * only the above latin1 code points, which are the only ones
19109                  * known at compile time */
19110                 _invlist_intersection_maybe_complement_2nd(
19111                                                     PL_AboveLatin1,
19112                                                     PL_XPosix_ptrs[classnum],
19113                                                     already_inverted,
19114                                                     &class_above_latin1);
19115                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19116                                                                         FALSE);
19117                 SvREFCNT_dec_NN(class_above_latin1);
19118
19119                 if (are_equivalent) {
19120
19121                     /* Resolve the run-time inversion flag with this possibly
19122                      * inverted class */
19123                     invert = invert ^ already_inverted;
19124
19125                     ret = reg_node(pRExC_state,
19126                                    POSIXL + invert * (NPOSIXL - POSIXL));
19127                     FLAGS(REGNODE_p(ret)) = classnum;
19128                     goto not_anyof;
19129                 }
19130             }
19131         }
19132
19133         /* khw can't think of any other possible transformation involving
19134          * these. */
19135         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19136             goto is_anyof;
19137         }
19138
19139         if (! has_runtime_dependency) {
19140
19141             /* If the list is empty, nothing matches.  This happens, for
19142              * example, when a Unicode property that doesn't match anything is
19143              * the only element in the character class (perluniprops.pod notes
19144              * such properties). */
19145             if (partial_cp_count == 0) {
19146                 if (invert) {
19147                     ret = reg_node(pRExC_state, SANY);
19148                 }
19149                 else {
19150                     ret = reganode(pRExC_state, OPFAIL, 0);
19151                 }
19152
19153                 goto not_anyof;
19154             }
19155
19156             /* If matches everything but \n */
19157             if (   start[0] == 0 && end[0] == '\n' - 1
19158                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19159             {
19160                 assert (! invert);
19161                 ret = reg_node(pRExC_state, REG_ANY);
19162                 MARK_NAUGHTY(1);
19163                 goto not_anyof;
19164             }
19165         }
19166
19167         /* Next see if can optimize classes that contain just a few code points
19168          * into an EXACTish node.  The reason to do this is to let the
19169          * optimizer join this node with adjacent EXACTish ones, and ANYOF
19170          * nodes require conversion to code point from UTF-8.
19171          *
19172          * An EXACTFish node can be generated even if not under /i, and vice
19173          * versa.  But care must be taken.  An EXACTFish node has to be such
19174          * that it only matches precisely the code points in the class, but we
19175          * want to generate the least restrictive one that does that, to
19176          * increase the odds of being able to join with an adjacent node.  For
19177          * example, if the class contains [kK], we have to make it an EXACTFAA
19178          * node to prevent the KELVIN SIGN from matching.  Whether we are under
19179          * /i or not is irrelevant in this case.  Less obvious is the pattern
19180          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
19181          * supposed to match the single character U+0149 LATIN SMALL LETTER N
19182          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
19183          * that includes \X{02BC}, there is a multi-char fold that does, and so
19184          * the node generated for it must be an EXACTFish one.  On the other
19185          * hand qr/:/i should generate a plain EXACT node since the colon
19186          * participates in no fold whatsoever, and having it EXACT tells the
19187          * optimizer the target string cannot match unless it has a colon in
19188          * it.
19189          */
19190         if (   ! posixl
19191             && ! invert
19192
19193                 /* Only try if there are no more code points in the class than
19194                  * in the max possible fold */
19195             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19196         {
19197             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19198             {
19199                 /* We can always make a single code point class into an
19200                  * EXACTish node. */
19201
19202                 if (LOC) {
19203
19204                     /* Here is /l:  Use EXACTL, except if there is a fold not
19205                      * known until runtime so shows as only a single code point
19206                      * here.  For code points above 255, we know which can
19207                      * cause problems by having a potential fold to the Latin1
19208                      * range. */
19209                     if (  ! FOLD
19210                         || (     start[0] > 255
19211                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19212                     {
19213                         op = EXACTL;
19214                     }
19215                     else {
19216                         op = EXACTFL;
19217                     }
19218                 }
19219                 else if (! FOLD) { /* Not /l and not /i */
19220                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19221                 }
19222                 else if (start[0] < 256) { /* /i, not /l, and the code point is
19223                                               small */
19224
19225                     /* Under /i, it gets a little tricky.  A code point that
19226                      * doesn't participate in a fold should be an EXACT node.
19227                      * We know this one isn't the result of a simple fold, or
19228                      * there'd be more than one code point in the list, but it
19229                      * could be part of a multi- character fold.  In that case
19230                      * we better not create an EXACT node, as we would wrongly
19231                      * be telling the optimizer that this code point must be in
19232                      * the target string, and that is wrong.  This is because
19233                      * if the sequence around this code point forms a
19234                      * multi-char fold, what needs to be in the string could be
19235                      * the code point that folds to the sequence.
19236                      *
19237                      * This handles the case of below-255 code points, as we
19238                      * have an easy look up for those.  The next clause handles
19239                      * the above-256 one */
19240                     op = IS_IN_SOME_FOLD_L1(start[0])
19241                          ? EXACTFU
19242                          : EXACT;
19243                 }
19244                 else {  /* /i, larger code point.  Since we are under /i, and
19245                            have just this code point, we know that it can't
19246                            fold to something else, so PL_InMultiCharFold
19247                            applies to it */
19248                     op = _invlist_contains_cp(PL_InMultiCharFold,
19249                                               start[0])
19250                          ? EXACTFU_REQ8
19251                          : EXACT_REQ8;
19252                 }
19253
19254                 value = start[0];
19255             }
19256             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19257                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
19258             {
19259                 /* Here, the only runtime dependency, if any, is from /d, and
19260                  * the class matches more than one code point, and the lowest
19261                  * code point participates in some fold.  It might be that the
19262                  * other code points are /i equivalent to this one, and hence
19263                  * they would representable by an EXACTFish node.  Above, we
19264                  * eliminated classes that contain too many code points to be
19265                  * EXACTFish, with the test for MAX_FOLD_FROMS
19266                  *
19267                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
19268                  * We do this because we have EXACTFAA at our disposal for the
19269                  * ASCII range */
19270                 if (partial_cp_count == 2 && isASCII(start[0])) {
19271
19272                     /* The only ASCII characters that participate in folds are
19273                      * alphabetics */
19274                     assert(isALPHA(start[0]));
19275                     if (   end[0] == start[0]   /* First range is a single
19276                                                    character, so 2nd exists */
19277                         && isALPHA_FOLD_EQ(start[0], start[1]))
19278                     {
19279
19280                         /* Here, is part of an ASCII fold pair */
19281
19282                         if (   ASCII_FOLD_RESTRICTED
19283                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19284                         {
19285                             /* If the second clause just above was true, it
19286                              * means we can't be under /i, or else the list
19287                              * would have included more than this fold pair.
19288                              * Therefore we have to exclude the possibility of
19289                              * whatever else it is that folds to these, by
19290                              * using EXACTFAA */
19291                             op = EXACTFAA;
19292                         }
19293                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19294
19295                             /* Here, there's no simple fold that start[0] is part
19296                              * of, but there is a multi-character one.  If we
19297                              * are not under /i, we want to exclude that
19298                              * possibility; if under /i, we want to include it
19299                              * */
19300                             op = (FOLD) ? EXACTFU : EXACTFAA;
19301                         }
19302                         else {
19303
19304                             /* Here, the only possible fold start[0] particpates in
19305                              * is with start[1].  /i or not isn't relevant */
19306                             op = EXACTFU;
19307                         }
19308
19309                         value = toFOLD(start[0]);
19310                     }
19311                 }
19312                 else if (  ! upper_latin1_only_utf8_matches
19313                          || (   _invlist_len(upper_latin1_only_utf8_matches)
19314                                                                           == 2
19315                              && PL_fold_latin1[
19316                                invlist_highest(upper_latin1_only_utf8_matches)]
19317                              == start[0]))
19318                 {
19319                     /* Here, the smallest character is non-ascii or there are
19320                      * more than 2 code points matched by this node.  Also, we
19321                      * either don't have /d UTF-8 dependent matches, or if we
19322                      * do, they look like they could be a single character that
19323                      * is the fold of the lowest one in the always-match list.
19324                      * This test quickly excludes most of the false positives
19325                      * when there are /d UTF-8 depdendent matches.  These are
19326                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19327                      * SMALL LETTER A WITH GRAVE iff the target string is
19328                      * UTF-8.  (We don't have to worry above about exceeding
19329                      * the array bounds of PL_fold_latin1[] because any code
19330                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
19331                      *
19332                      * EXACTFAA would apply only to pairs (hence exactly 2 code
19333                      * points) in the ASCII range, so we can't use it here to
19334                      * artificially restrict the fold domain, so we check if
19335                      * the class does or does not match some EXACTFish node.
19336                      * Further, if we aren't under /i, and the folded-to
19337                      * character is part of a multi-character fold, we can't do
19338                      * this optimization, as the sequence around it could be
19339                      * that multi-character fold, and we don't here know the
19340                      * context, so we have to assume it is that multi-char
19341                      * fold, to prevent potential bugs.
19342                      *
19343                      * To do the general case, we first find the fold of the
19344                      * lowest code point (which may be higher than the lowest
19345                      * one), then find everything that folds to it.  (The data
19346                      * structure we have only maps from the folded code points,
19347                      * so we have to do the earlier step.) */
19348
19349                     Size_t foldlen;
19350                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19351                     UV folded = _to_uni_fold_flags(start[0],
19352                                                         foldbuf, &foldlen, 0);
19353                     U32 first_fold;
19354                     const U32 * remaining_folds;
19355                     Size_t folds_to_this_cp_count = _inverse_folds(
19356                                                             folded,
19357                                                             &first_fold,
19358                                                             &remaining_folds);
19359                     Size_t folds_count = folds_to_this_cp_count + 1;
19360                     SV * fold_list = _new_invlist(folds_count);
19361                     unsigned int i;
19362
19363                     /* If there are UTF-8 dependent matches, create a temporary
19364                      * list of what this node matches, including them. */
19365                     SV * all_cp_list = NULL;
19366                     SV ** use_this_list = &cp_list;
19367
19368                     if (upper_latin1_only_utf8_matches) {
19369                         all_cp_list = _new_invlist(0);
19370                         use_this_list = &all_cp_list;
19371                         _invlist_union(cp_list,
19372                                        upper_latin1_only_utf8_matches,
19373                                        use_this_list);
19374                     }
19375
19376                     /* Having gotten everything that participates in the fold
19377                      * containing the lowest code point, we turn that into an
19378                      * inversion list, making sure everything is included. */
19379                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19380                     fold_list = add_cp_to_invlist(fold_list, folded);
19381                     if (folds_to_this_cp_count > 0) {
19382                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19383                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19384                             fold_list = add_cp_to_invlist(fold_list,
19385                                                         remaining_folds[i]);
19386                         }
19387                     }
19388
19389                     /* If the fold list is identical to what's in this ANYOF
19390                      * node, the node can be represented by an EXACTFish one
19391                      * instead */
19392                     if (_invlistEQ(*use_this_list, fold_list,
19393                                    0 /* Don't complement */ )
19394                     ) {
19395
19396                         /* But, we have to be careful, as mentioned above.
19397                          * Just the right sequence of characters could match
19398                          * this if it is part of a multi-character fold.  That
19399                          * IS what we want if we are under /i.  But it ISN'T
19400                          * what we want if not under /i, as it could match when
19401                          * it shouldn't.  So, when we aren't under /i and this
19402                          * character participates in a multi-char fold, we
19403                          * don't optimize into an EXACTFish node.  So, for each
19404                          * case below we have to check if we are folding
19405                          * and if not, if it is not part of a multi-char fold.
19406                          * */
19407                         if (start[0] > 255) {    /* Highish code point */
19408                             if (FOLD || ! _invlist_contains_cp(
19409                                             PL_InMultiCharFold, folded))
19410                             {
19411                                 op = (LOC)
19412                                      ? EXACTFLU8
19413                                      : (ASCII_FOLD_RESTRICTED)
19414                                        ? EXACTFAA
19415                                        : EXACTFU_REQ8;
19416                                 value = folded;
19417                             }
19418                         }   /* Below, the lowest code point < 256 */
19419                         else if (    FOLD
19420                                  &&  folded == 's'
19421                                  &&  DEPENDS_SEMANTICS)
19422                         {   /* An EXACTF node containing a single character
19423                                 's', can be an EXACTFU if it doesn't get
19424                                 joined with an adjacent 's' */
19425                             op = EXACTFU_S_EDGE;
19426                             value = folded;
19427                         }
19428                         else if (    FOLD
19429                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19430                         {
19431                             if (upper_latin1_only_utf8_matches) {
19432                                 op = EXACTF;
19433
19434                                 /* We can't use the fold, as that only matches
19435                                  * under UTF-8 */
19436                                 value = start[0];
19437                             }
19438                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19439                                      && ! UTF)
19440                             {   /* EXACTFUP is a special node for this
19441                                    character */
19442                                 op = (ASCII_FOLD_RESTRICTED)
19443                                      ? EXACTFAA
19444                                      : EXACTFUP;
19445                                 value = MICRO_SIGN;
19446                             }
19447                             else if (     ASCII_FOLD_RESTRICTED
19448                                      && ! isASCII(start[0]))
19449                             {   /* For ASCII under /iaa, we can use EXACTFU
19450                                    below */
19451                                 op = EXACTFAA;
19452                                 value = folded;
19453                             }
19454                             else {
19455                                 op = EXACTFU;
19456                                 value = folded;
19457                             }
19458                         }
19459                     }
19460
19461                     SvREFCNT_dec_NN(fold_list);
19462                     SvREFCNT_dec(all_cp_list);
19463                 }
19464             }
19465
19466             if (op != END) {
19467                 U8 len;
19468
19469                 /* Here, we have calculated what EXACTish node to use.  Have to
19470                  * convert to UTF-8 if not already there */
19471                 if (value > 255) {
19472                     if (! UTF) {
19473                         SvREFCNT_dec(cp_list);;
19474                         REQUIRE_UTF8(flagp);
19475                     }
19476
19477                     /* This is a kludge to the special casing issues with this
19478                      * ligature under /aa.  FB05 should fold to FB06, but the
19479                      * call above to _to_uni_fold_flags() didn't find this, as
19480                      * it didn't use the /aa restriction in order to not miss
19481                      * other folds that would be affected.  This is the only
19482                      * instance likely to ever be a problem in all of Unicode.
19483                      * So special case it. */
19484                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19485                         && ASCII_FOLD_RESTRICTED)
19486                     {
19487                         value = LATIN_SMALL_LIGATURE_ST;
19488                     }
19489                 }
19490
19491                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19492
19493                 ret = regnode_guts(pRExC_state, op, len, "exact");
19494                 FILL_NODE(ret, op);
19495                 RExC_emit += 1 + STR_SZ(len);
19496                 setSTR_LEN(REGNODE_p(ret), len);
19497                 if (len == 1) {
19498                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19499                 }
19500                 else {
19501                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19502                 }
19503                 goto not_anyof;
19504             }
19505         }
19506
19507         if (! has_runtime_dependency) {
19508
19509             /* See if this can be turned into an ANYOFM node.  Think about the
19510              * bit patterns in two different bytes.  In some positions, the
19511              * bits in each will be 1; and in other positions both will be 0;
19512              * and in some positions the bit will be 1 in one byte, and 0 in
19513              * the other.  Let 'n' be the number of positions where the bits
19514              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19515              * a position where the two bytes differ.  Now take the set of all
19516              * bytes that when ANDed with the mask yield the same result.  That
19517              * set has 2**n elements, and is representable by just two 8 bit
19518              * numbers: the result and the mask.  Importantly, matching the set
19519              * can be vectorized by creating a word full of the result bytes,
19520              * and a word full of the mask bytes, yielding a significant speed
19521              * up.  Here, see if this node matches such a set.  As a concrete
19522              * example consider [01], and the byte representing '0' which is
19523              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19524              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19525              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19526              * which is a common usage, is optimizable into ANYOFM, and can
19527              * benefit from the speed up.  We can only do this on UTF-8
19528              * invariant bytes, because they have the same bit patterns under
19529              * UTF-8 as not. */
19530             PERL_UINT_FAST8_T inverted = 0;
19531 #ifdef EBCDIC
19532             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19533 #else
19534             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19535 #endif
19536             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19537              * If that works we will instead later generate an NANYOFM, and
19538              * invert back when through */
19539             if (invlist_highest(cp_list) > max_permissible) {
19540                 _invlist_invert(cp_list);
19541                 inverted = 1;
19542             }
19543
19544             if (invlist_highest(cp_list) <= max_permissible) {
19545                 UV this_start, this_end;
19546                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19547                 U8 bits_differing = 0;
19548                 Size_t full_cp_count = 0;
19549                 bool first_time = TRUE;
19550
19551                 /* Go through the bytes and find the bit positions that differ
19552                  * */
19553                 invlist_iterinit(cp_list);
19554                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19555                     unsigned int i = this_start;
19556
19557                     if (first_time) {
19558                         if (! UVCHR_IS_INVARIANT(i)) {
19559                             goto done_anyofm;
19560                         }
19561
19562                         first_time = FALSE;
19563                         lowest_cp = this_start;
19564
19565                         /* We have set up the code point to compare with.
19566                          * Don't compare it with itself */
19567                         i++;
19568                     }
19569
19570                     /* Find the bit positions that differ from the lowest code
19571                      * point in the node.  Keep track of all such positions by
19572                      * OR'ing */
19573                     for (; i <= this_end; i++) {
19574                         if (! UVCHR_IS_INVARIANT(i)) {
19575                             goto done_anyofm;
19576                         }
19577
19578                         bits_differing  |= i ^ lowest_cp;
19579                     }
19580
19581                     full_cp_count += this_end - this_start + 1;
19582                 }
19583
19584                 /* At the end of the loop, we count how many bits differ from
19585                  * the bits in lowest code point, call the count 'd'.  If the
19586                  * set we found contains 2**d elements, it is the closure of
19587                  * all code points that differ only in those bit positions.  To
19588                  * convince yourself of that, first note that the number in the
19589                  * closure must be a power of 2, which we test for.  The only
19590                  * way we could have that count and it be some differing set,
19591                  * is if we got some code points that don't differ from the
19592                  * lowest code point in any position, but do differ from each
19593                  * other in some other position.  That means one code point has
19594                  * a 1 in that position, and another has a 0.  But that would
19595                  * mean that one of them differs from the lowest code point in
19596                  * that position, which possibility we've already excluded.  */
19597                 if (  (inverted || full_cp_count > 1)
19598                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19599                 {
19600                     U8 ANYOFM_mask;
19601
19602                     op = ANYOFM + inverted;;
19603
19604                     /* We need to make the bits that differ be 0's */
19605                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19606
19607                     /* The argument is the lowest code point */
19608                     ret = reganode(pRExC_state, op, lowest_cp);
19609                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19610                 }
19611
19612               done_anyofm:
19613                 invlist_iterfinish(cp_list);
19614             }
19615
19616             if (inverted) {
19617                 _invlist_invert(cp_list);
19618             }
19619
19620             if (op != END) {
19621                 goto not_anyof;
19622             }
19623
19624             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19625              * all were invariants, it wasn't inverted, and there is a single
19626              * range.  This would be faster than some of the posix nodes we
19627              * create below like /\d/a, but would be twice the size.  Without
19628              * having actually measured the gain, khw doesn't think the
19629              * tradeoff is really worth it */
19630         }
19631
19632         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19633             PERL_UINT_FAST8_T type;
19634             SV * intersection = NULL;
19635             SV* d_invlist = NULL;
19636
19637             /* See if this matches any of the POSIX classes.  The POSIXA and
19638              * POSIXD ones are about the same speed as ANYOF ops, but take less
19639              * room; the ones that have above-Latin1 code point matches are
19640              * somewhat faster than ANYOF.  */
19641
19642             for (type = POSIXA; type >= POSIXD; type--) {
19643                 int posix_class;
19644
19645                 if (type == POSIXL) {   /* But not /l posix classes */
19646                     continue;
19647                 }
19648
19649                 for (posix_class = 0;
19650                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19651                      posix_class++)
19652                 {
19653                     SV** our_code_points = &cp_list;
19654                     SV** official_code_points;
19655                     int try_inverted;
19656
19657                     if (type == POSIXA) {
19658                         official_code_points = &PL_Posix_ptrs[posix_class];
19659                     }
19660                     else {
19661                         official_code_points = &PL_XPosix_ptrs[posix_class];
19662                     }
19663
19664                     /* Skip non-existent classes of this type.  e.g. \v only
19665                      * has an entry in PL_XPosix_ptrs */
19666                     if (! *official_code_points) {
19667                         continue;
19668                     }
19669
19670                     /* Try both the regular class, and its inversion */
19671                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19672                         bool this_inverted = invert ^ try_inverted;
19673
19674                         if (type != POSIXD) {
19675
19676                             /* This class that isn't /d can't match if we have
19677                              * /d dependencies */
19678                             if (has_runtime_dependency
19679                                                     & HAS_D_RUNTIME_DEPENDENCY)
19680                             {
19681                                 continue;
19682                             }
19683                         }
19684                         else /* is /d */ if (! this_inverted) {
19685
19686                             /* /d classes don't match anything non-ASCII below
19687                              * 256 unconditionally (which cp_list contains) */
19688                             _invlist_intersection(cp_list, PL_UpperLatin1,
19689                                                            &intersection);
19690                             if (_invlist_len(intersection) != 0) {
19691                                 continue;
19692                             }
19693
19694                             SvREFCNT_dec(d_invlist);
19695                             d_invlist = invlist_clone(cp_list, NULL);
19696
19697                             /* But under UTF-8 it turns into using /u rules.
19698                              * Add the things it matches under these conditions
19699                              * so that we check below that these are identical
19700                              * to what the tested class should match */
19701                             if (upper_latin1_only_utf8_matches) {
19702                                 _invlist_union(
19703                                             d_invlist,
19704                                             upper_latin1_only_utf8_matches,
19705                                             &d_invlist);
19706                             }
19707                             our_code_points = &d_invlist;
19708                         }
19709                         else {  /* POSIXD, inverted.  If this doesn't have this
19710                                    flag set, it isn't /d. */
19711                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19712                             {
19713                                 continue;
19714                             }
19715                             our_code_points = &cp_list;
19716                         }
19717
19718                         /* Here, have weeded out some things.  We want to see
19719                          * if the list of characters this node contains
19720                          * ('*our_code_points') precisely matches those of the
19721                          * class we are currently checking against
19722                          * ('*official_code_points'). */
19723                         if (_invlistEQ(*our_code_points,
19724                                        *official_code_points,
19725                                        try_inverted))
19726                         {
19727                             /* Here, they precisely match.  Optimize this ANYOF
19728                              * node into its equivalent POSIX one of the
19729                              * correct type, possibly inverted */
19730                             ret = reg_node(pRExC_state, (try_inverted)
19731                                                         ? type + NPOSIXA
19732                                                                 - POSIXA
19733                                                         : type);
19734                             FLAGS(REGNODE_p(ret)) = posix_class;
19735                             SvREFCNT_dec(d_invlist);
19736                             SvREFCNT_dec(intersection);
19737                             goto not_anyof;
19738                         }
19739                     }
19740                 }
19741             }
19742             SvREFCNT_dec(d_invlist);
19743             SvREFCNT_dec(intersection);
19744         }
19745
19746         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19747          * both in size and speed.  Currently, a 20 bit range base (smallest
19748          * code point in the range), and a 12 bit maximum delta are packed into
19749          * a 32 bit word.  This allows for using it on all of the Unicode code
19750          * points except for the highest plane, which is only for private use
19751          * code points.  khw doubts that a bigger delta is likely in real world
19752          * applications */
19753         if (     single_range
19754             && ! has_runtime_dependency
19755             &&   anyof_flags == 0
19756             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19757             &&   end[0] - start[0]
19758                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19759                                    * CHARBITS - ANYOFR_BASE_BITS))))
19760
19761         {
19762             U8 low_utf8[UTF8_MAXBYTES+1];
19763             U8 high_utf8[UTF8_MAXBYTES+1];
19764
19765             ret = reganode(pRExC_state, ANYOFR,
19766                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19767
19768             /* Place the lowest UTF-8 start byte in the flags field, so as to
19769              * allow efficient ruling out at run time of many possible inputs.
19770              * */
19771             (void) uvchr_to_utf8(low_utf8, start[0]);
19772             (void) uvchr_to_utf8(high_utf8, end[0]);
19773
19774             /* If all code points share the same first byte, this can be an
19775              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19776              * quickly rule out many inputs at run-time without having to
19777              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19778              * not doing that transformation would not rule out nearly so many
19779              * things */
19780             if (low_utf8[0] == high_utf8[0]) {
19781                 OP(REGNODE_p(ret)) = ANYOFRb;
19782                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19783             }
19784             else {
19785                 ANYOF_FLAGS(REGNODE_p(ret))
19786                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19787             }
19788
19789             goto not_anyof;
19790         }
19791
19792         /* If didn't find an optimization and there is no need for a bitmap,
19793          * optimize to indicate that */
19794         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19795             && ! LOC
19796             && ! upper_latin1_only_utf8_matches
19797             &&   anyof_flags == 0)
19798         {
19799             U8 low_utf8[UTF8_MAXBYTES+1];
19800             UV highest_cp = invlist_highest(cp_list);
19801
19802             /* Currently the maximum allowed code point by the system is
19803              * IV_MAX.  Higher ones are reserved for future internal use.  This
19804              * particular regnode can be used for higher ones, but we can't
19805              * calculate the code point of those.  IV_MAX suffices though, as
19806              * it will be a large first byte */
19807             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19808                            - low_utf8;
19809
19810             /* We store the lowest possible first byte of the UTF-8
19811              * representation, using the flags field.  This allows for quick
19812              * ruling out of some inputs without having to convert from UTF-8
19813              * to code point.  For EBCDIC, we use I8, as not doing that
19814              * transformation would not rule out nearly so many things */
19815             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19816
19817             op = ANYOFH;
19818
19819             /* If the first UTF-8 start byte for the highest code point in the
19820              * range is suitably small, we may be able to get an upper bound as
19821              * well */
19822             if (highest_cp <= IV_MAX) {
19823                 U8 high_utf8[UTF8_MAXBYTES+1];
19824                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19825                                 - high_utf8;
19826
19827                 /* If the lowest and highest are the same, we can get an exact
19828                  * first byte instead of a just minimum or even a sequence of
19829                  * exact leading bytes.  We signal these with different
19830                  * regnodes */
19831                 if (low_utf8[0] == high_utf8[0]) {
19832                     Size_t len = find_first_differing_byte_pos(low_utf8,
19833                                                                high_utf8,
19834                                                        MIN(low_len, high_len));
19835
19836                     if (len == 1) {
19837
19838                         /* No need to convert to I8 for EBCDIC as this is an
19839                          * exact match */
19840                         anyof_flags = low_utf8[0];
19841                         op = ANYOFHb;
19842                     }
19843                     else {
19844                         op = ANYOFHs;
19845                         ret = regnode_guts(pRExC_state, op,
19846                                            regarglen[op] + STR_SZ(len),
19847                                            "anyofhs");
19848                         FILL_NODE(ret, op);
19849                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19850                                                                         = len;
19851                         Copy(low_utf8,  /* Add the common bytes */
19852                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19853                            len, U8);
19854                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19855                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19856                                                   NULL, only_utf8_locale_list);
19857                         goto not_anyof;
19858                     }
19859                 }
19860                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19861                 {
19862
19863                     /* Here, the high byte is not the same as the low, but is
19864                      * small enough that its reasonable to have a loose upper
19865                      * bound, which is packed in with the strict lower bound.
19866                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19867                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19868                      * is the same thing as UTF-8 */
19869
19870                     U8 bits = 0;
19871                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19872                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19873                                   - anyof_flags;
19874
19875                     if (range_diff <= max_range_diff / 8) {
19876                         bits = 3;
19877                     }
19878                     else if (range_diff <= max_range_diff / 4) {
19879                         bits = 2;
19880                     }
19881                     else if (range_diff <= max_range_diff / 2) {
19882                         bits = 1;
19883                     }
19884                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19885                     op = ANYOFHr;
19886                 }
19887             }
19888
19889             goto done_finding_op;
19890         }
19891     }   /* End of seeing if can optimize it into a different node */
19892
19893   is_anyof: /* It's going to be an ANYOF node. */
19894     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19895          ? ANYOFD
19896          : ((posixl)
19897             ? ANYOFPOSIXL
19898             : ((LOC)
19899                ? ANYOFL
19900                : ANYOF));
19901
19902   done_finding_op:
19903
19904     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19905     FILL_NODE(ret, op);        /* We set the argument later */
19906     RExC_emit += 1 + regarglen[op];
19907     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19908
19909     /* Here, <cp_list> contains all the code points we can determine at
19910      * compile time that match under all conditions.  Go through it, and
19911      * for things that belong in the bitmap, put them there, and delete from
19912      * <cp_list>.  While we are at it, see if everything above 255 is in the
19913      * list, and if so, set a flag to speed up execution */
19914
19915     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19916
19917     if (posixl) {
19918         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19919     }
19920
19921     if (invert) {
19922         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19923     }
19924
19925     /* Here, the bitmap has been populated with all the Latin1 code points that
19926      * always match.  Can now add to the overall list those that match only
19927      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19928      * */
19929     if (upper_latin1_only_utf8_matches) {
19930         if (cp_list) {
19931             _invlist_union(cp_list,
19932                            upper_latin1_only_utf8_matches,
19933                            &cp_list);
19934             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19935         }
19936         else {
19937             cp_list = upper_latin1_only_utf8_matches;
19938         }
19939         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19940     }
19941
19942     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19943                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19944                    ? listsv
19945                    : NULL,
19946                   only_utf8_locale_list);
19947     SvREFCNT_dec(cp_list);;
19948     SvREFCNT_dec(only_utf8_locale_list);
19949     return ret;
19950
19951   not_anyof:
19952
19953     /* Here, the node is getting optimized into something that's not an ANYOF
19954      * one.  Finish up. */
19955
19956     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19957                                            RExC_parse - orig_parse);;
19958     SvREFCNT_dec(cp_list);;
19959     SvREFCNT_dec(only_utf8_locale_list);
19960     return ret;
19961 }
19962
19963 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19964
19965 STATIC void
19966 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19967                 regnode* const node,
19968                 SV* const cp_list,
19969                 SV* const runtime_defns,
19970                 SV* const only_utf8_locale_list)
19971 {
19972     /* Sets the arg field of an ANYOF-type node 'node', using information about
19973      * the node passed-in.  If there is nothing outside the node's bitmap, the
19974      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19975      * the count returned by add_data(), having allocated and stored an array,
19976      * av, as follows:
19977      *
19978      *  av[0] stores the inversion list defining this class as far as known at
19979      *        this time, or PL_sv_undef if nothing definite is now known.
19980      *  av[1] stores the inversion list of code points that match only if the
19981      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
19982      *        av[2], or no entry otherwise.
19983      *  av[2] stores the list of user-defined properties whose subroutine
19984      *        definitions aren't known at this time, or no entry if none. */
19985
19986     UV n;
19987
19988     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19989
19990     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19991         assert(! (ANYOF_FLAGS(node)
19992                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19993         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19994     }
19995     else {
19996         AV * const av = newAV();
19997         SV *rv;
19998
19999         if (cp_list) {
20000             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20001         }
20002
20003         /* (Note that if any of this changes, the size calculations in
20004          * S_optimize_regclass() might need to be updated.) */
20005
20006         if (only_utf8_locale_list) {
20007             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20008                                      SvREFCNT_inc_NN(only_utf8_locale_list));
20009         }
20010
20011         if (runtime_defns) {
20012             av_store(av, DEFERRED_USER_DEFINED_INDEX,
20013                          SvREFCNT_inc_NN(runtime_defns));
20014         }
20015
20016         rv = newRV_noinc(MUTABLE_SV(av));
20017         n = add_data(pRExC_state, STR_WITH_LEN("s"));
20018         RExC_rxi->data->data[n] = (void*)rv;
20019         ARG_SET(node, n);
20020     }
20021 }
20022
20023 SV *
20024
20025 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20026 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20027 #else
20028 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20029 #endif
20030
20031 {
20032     /* For internal core use only.
20033      * Returns the inversion list for the input 'node' in the regex 'prog'.
20034      * If <doinit> is 'true', will attempt to create the inversion list if not
20035      *    already done.
20036      * If <listsvp> is non-null, will return the printable contents of the
20037      *    property definition.  This can be used to get debugging information
20038      *    even before the inversion list exists, by calling this function with
20039      *    'doinit' set to false, in which case the components that will be used
20040      *    to eventually create the inversion list are returned  (in a printable
20041      *    form).
20042      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20043      *    store an inversion list of code points that should match only if the
20044      *    execution-time locale is a UTF-8 one.
20045      * If <output_invlist> is not NULL, it is where this routine is to store an
20046      *    inversion list of the code points that would be instead returned in
20047      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
20048      *    when this parameter is used, is just the non-code point data that
20049      *    will go into creating the inversion list.  This currently should be just
20050      *    user-defined properties whose definitions were not known at compile
20051      *    time.  Using this parameter allows for easier manipulation of the
20052      *    inversion list's data by the caller.  It is illegal to call this
20053      *    function with this parameter set, but not <listsvp>
20054      *
20055      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
20056      * that, in spite of this function's name, the inversion list it returns
20057      * may include the bitmap data as well */
20058
20059     SV *si  = NULL;         /* Input initialization string */
20060     SV* invlist = NULL;
20061
20062     RXi_GET_DECL(prog, progi);
20063     const struct reg_data * const data = prog ? progi->data : NULL;
20064
20065 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20066     PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20067 #else
20068     PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20069 #endif
20070     assert(! output_invlist || listsvp);
20071
20072     if (data && data->count) {
20073         const U32 n = ARG(node);
20074
20075         if (data->what[n] == 's') {
20076             SV * const rv = MUTABLE_SV(data->data[n]);
20077             AV * const av = MUTABLE_AV(SvRV(rv));
20078             SV **const ary = AvARRAY(av);
20079
20080             invlist = ary[INVLIST_INDEX];
20081
20082             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20083                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20084             }
20085
20086             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20087                 si = ary[DEFERRED_USER_DEFINED_INDEX];
20088             }
20089
20090             if (doinit && (si || invlist)) {
20091                 if (si) {
20092                     bool user_defined;
20093                     SV * msg = newSVpvs_flags("", SVs_TEMP);
20094
20095                     SV * prop_definition = handle_user_defined_property(
20096                             "", 0, FALSE,   /* There is no \p{}, \P{} */
20097                             SvPVX_const(si)[1] - '0',   /* /i or not has been
20098                                                            stored here for just
20099                                                            this occasion */
20100                             TRUE,           /* run time */
20101                             FALSE,          /* This call must find the defn */
20102                             si,             /* The property definition  */
20103                             &user_defined,
20104                             msg,
20105                             0               /* base level call */
20106                            );
20107
20108                     if (SvCUR(msg)) {
20109                         assert(prop_definition == NULL);
20110
20111                         Perl_croak(aTHX_ "%" UTF8f,
20112                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20113                     }
20114
20115                     if (invlist) {
20116                         _invlist_union(invlist, prop_definition, &invlist);
20117                         SvREFCNT_dec_NN(prop_definition);
20118                     }
20119                     else {
20120                         invlist = prop_definition;
20121                     }
20122
20123                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20124                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20125
20126                     ary[INVLIST_INDEX] = invlist;
20127                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20128                                  ? ONLY_LOCALE_MATCHES_INDEX
20129                                  : INVLIST_INDEX);
20130                     si = NULL;
20131                 }
20132             }
20133         }
20134     }
20135
20136     /* If requested, return a printable version of what this ANYOF node matches
20137      * */
20138     if (listsvp) {
20139         SV* matches_string = NULL;
20140
20141         /* This function can be called at compile-time, before everything gets
20142          * resolved, in which case we return the currently best available
20143          * information, which is the string that will eventually be used to do
20144          * that resolving, 'si' */
20145         if (si) {
20146             /* Here, we only have 'si' (and possibly some passed-in data in
20147              * 'invlist', which is handled below)  If the caller only wants
20148              * 'si', use that.  */
20149             if (! output_invlist) {
20150                 matches_string = newSVsv(si);
20151             }
20152             else {
20153                 /* But if the caller wants an inversion list of the node, we
20154                  * need to parse 'si' and place as much as possible in the
20155                  * desired output inversion list, making 'matches_string' only
20156                  * contain the currently unresolvable things */
20157                 const char *si_string = SvPVX(si);
20158                 STRLEN remaining = SvCUR(si);
20159                 UV prev_cp = 0;
20160                 U8 count = 0;
20161
20162                 /* Ignore everything before and including the first new-line */
20163                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20164                 assert (si_string != NULL);
20165                 si_string++;
20166                 remaining = SvPVX(si) + SvCUR(si) - si_string;
20167
20168                 while (remaining > 0) {
20169
20170                     /* The data consists of just strings defining user-defined
20171                      * property names, but in prior incarnations, and perhaps
20172                      * somehow from pluggable regex engines, it could still
20173                      * hold hex code point definitions, all of which should be
20174                      * legal (or it wouldn't have gotten this far).  Each
20175                      * component of a range would be separated by a tab, and
20176                      * each range by a new-line.  If these are found, instead
20177                      * add them to the inversion list */
20178                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
20179                                      |PERL_SCAN_SILENT_NON_PORTABLE;
20180                     STRLEN len = remaining;
20181                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20182
20183                     /* If the hex decode routine found something, it should go
20184                      * up to the next \n */
20185                     if (   *(si_string + len) == '\n') {
20186                         if (count) {    /* 2nd code point on line */
20187                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20188                         }
20189                         else {
20190                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20191                         }
20192                         count = 0;
20193                         goto prepare_for_next_iteration;
20194                     }
20195
20196                     /* If the hex decode was instead for the lower range limit,
20197                      * save it, and go parse the upper range limit */
20198                     if (*(si_string + len) == '\t') {
20199                         assert(count == 0);
20200
20201                         prev_cp = cp;
20202                         count = 1;
20203                       prepare_for_next_iteration:
20204                         si_string += len + 1;
20205                         remaining -= len + 1;
20206                         continue;
20207                     }
20208
20209                     /* Here, didn't find a legal hex number.  Just add the text
20210                      * from here up to the next \n, omitting any trailing
20211                      * markers. */
20212
20213                     remaining -= len;
20214                     len = strcspn(si_string,
20215                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20216                     remaining -= len;
20217                     if (matches_string) {
20218                         sv_catpvn(matches_string, si_string, len);
20219                     }
20220                     else {
20221                         matches_string = newSVpvn(si_string, len);
20222                     }
20223                     sv_catpvs(matches_string, " ");
20224
20225                     si_string += len;
20226                     if (   remaining
20227                         && UCHARAT(si_string)
20228                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20229                     {
20230                         si_string++;
20231                         remaining--;
20232                     }
20233                     if (remaining && UCHARAT(si_string) == '\n') {
20234                         si_string++;
20235                         remaining--;
20236                     }
20237                 } /* end of loop through the text */
20238
20239                 assert(matches_string);
20240                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
20241                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20242                 }
20243             } /* end of has an 'si' */
20244         }
20245
20246         /* Add the stuff that's already known */
20247         if (invlist) {
20248
20249             /* Again, if the caller doesn't want the output inversion list, put
20250              * everything in 'matches-string' */
20251             if (! output_invlist) {
20252                 if ( ! matches_string) {
20253                     matches_string = newSVpvs("\n");
20254                 }
20255                 sv_catsv(matches_string, invlist_contents(invlist,
20256                                                   TRUE /* traditional style */
20257                                                   ));
20258             }
20259             else if (! *output_invlist) {
20260                 *output_invlist = invlist_clone(invlist, NULL);
20261             }
20262             else {
20263                 _invlist_union(*output_invlist, invlist, output_invlist);
20264             }
20265         }
20266
20267         *listsvp = matches_string;
20268     }
20269
20270     return invlist;
20271 }
20272
20273 /* reg_skipcomment()
20274
20275    Absorbs an /x style # comment from the input stream,
20276    returning a pointer to the first character beyond the comment, or if the
20277    comment terminates the pattern without anything following it, this returns
20278    one past the final character of the pattern (in other words, RExC_end) and
20279    sets the REG_RUN_ON_COMMENT_SEEN flag.
20280
20281    Note it's the callers responsibility to ensure that we are
20282    actually in /x mode
20283
20284 */
20285
20286 PERL_STATIC_INLINE char*
20287 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20288 {
20289     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20290
20291     assert(*p == '#');
20292
20293     while (p < RExC_end) {
20294         if (*(++p) == '\n') {
20295             return p+1;
20296         }
20297     }
20298
20299     /* we ran off the end of the pattern without ending the comment, so we have
20300      * to add an \n when wrapping */
20301     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20302     return p;
20303 }
20304
20305 STATIC void
20306 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20307                                 char ** p,
20308                                 const bool force_to_xmod
20309                          )
20310 {
20311     /* If the text at the current parse position '*p' is a '(?#...)' comment,
20312      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20313      * is /x whitespace, advance '*p' so that on exit it points to the first
20314      * byte past all such white space and comments */
20315
20316     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20317
20318     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20319
20320     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20321
20322     for (;;) {
20323         if (RExC_end - (*p) >= 3
20324             && *(*p)     == '('
20325             && *(*p + 1) == '?'
20326             && *(*p + 2) == '#')
20327         {
20328             while (*(*p) != ')') {
20329                 if ((*p) == RExC_end)
20330                     FAIL("Sequence (?#... not terminated");
20331                 (*p)++;
20332             }
20333             (*p)++;
20334             continue;
20335         }
20336
20337         if (use_xmod) {
20338             const char * save_p = *p;
20339             while ((*p) < RExC_end) {
20340                 STRLEN len;
20341                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20342                     (*p) += len;
20343                 }
20344                 else if (*(*p) == '#') {
20345                     (*p) = reg_skipcomment(pRExC_state, (*p));
20346                 }
20347                 else {
20348                     break;
20349                 }
20350             }
20351             if (*p != save_p) {
20352                 continue;
20353             }
20354         }
20355
20356         break;
20357     }
20358
20359     return;
20360 }
20361
20362 /* nextchar()
20363
20364    Advances the parse position by one byte, unless that byte is the beginning
20365    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20366    those two cases, the parse position is advanced beyond all such comments and
20367    white space.
20368
20369    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20370 */
20371
20372 STATIC void
20373 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20374 {
20375     PERL_ARGS_ASSERT_NEXTCHAR;
20376
20377     if (RExC_parse < RExC_end) {
20378         assert(   ! UTF
20379                || UTF8_IS_INVARIANT(*RExC_parse)
20380                || UTF8_IS_START(*RExC_parse));
20381
20382         RExC_parse += (UTF)
20383                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20384                       : 1;
20385
20386         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20387                                 FALSE /* Don't force /x */ );
20388     }
20389 }
20390
20391 STATIC void
20392 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20393 {
20394     /* 'size' is the delta number of smallest regnode equivalents to add or
20395      * subtract from the current memory allocated to the regex engine being
20396      * constructed. */
20397
20398     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20399
20400     RExC_size += size;
20401
20402     Renewc(RExC_rxi,
20403            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20404                                                 /* +1 for REG_MAGIC */
20405            char,
20406            regexp_internal);
20407     if ( RExC_rxi == NULL )
20408         FAIL("Regexp out of space");
20409     RXi_SET(RExC_rx, RExC_rxi);
20410
20411     RExC_emit_start = RExC_rxi->program;
20412     if (size > 0) {
20413         Zero(REGNODE_p(RExC_emit), size, regnode);
20414     }
20415
20416 #ifdef RE_TRACK_PATTERN_OFFSETS
20417     Renew(RExC_offsets, 2*RExC_size+1, U32);
20418     if (size > 0) {
20419         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20420     }
20421     RExC_offsets[0] = RExC_size;
20422 #endif
20423 }
20424
20425 STATIC regnode_offset
20426 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20427 {
20428     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20429      * equivalents space.  It aligns and increments RExC_size
20430      *
20431      * It returns the regnode's offset into the regex engine program */
20432
20433     const regnode_offset ret = RExC_emit;
20434
20435     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20436
20437     PERL_ARGS_ASSERT_REGNODE_GUTS;
20438
20439     SIZE_ALIGN(RExC_size);
20440     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20441     NODE_ALIGN_FILL(REGNODE_p(ret));
20442 #ifndef RE_TRACK_PATTERN_OFFSETS
20443     PERL_UNUSED_ARG(name);
20444     PERL_UNUSED_ARG(op);
20445 #else
20446     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20447
20448     if (RExC_offsets) {         /* MJD */
20449         MJD_OFFSET_DEBUG(
20450               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20451               name, __LINE__,
20452               PL_reg_name[op],
20453               (UV)(RExC_emit) > RExC_offsets[0]
20454                 ? "Overwriting end of array!\n" : "OK",
20455               (UV)(RExC_emit),
20456               (UV)(RExC_parse - RExC_start),
20457               (UV)RExC_offsets[0]));
20458         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20459     }
20460 #endif
20461     return(ret);
20462 }
20463
20464 /*
20465 - reg_node - emit a node
20466 */
20467 STATIC regnode_offset /* Location. */
20468 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20469 {
20470     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20471     regnode_offset ptr = ret;
20472
20473     PERL_ARGS_ASSERT_REG_NODE;
20474
20475     assert(regarglen[op] == 0);
20476
20477     FILL_ADVANCE_NODE(ptr, op);
20478     RExC_emit = ptr;
20479     return(ret);
20480 }
20481
20482 /*
20483 - reganode - emit a node with an argument
20484 */
20485 STATIC regnode_offset /* Location. */
20486 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20487 {
20488     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20489     regnode_offset ptr = ret;
20490
20491     PERL_ARGS_ASSERT_REGANODE;
20492
20493     /* ANYOF are special cased to allow non-length 1 args */
20494     assert(regarglen[op] == 1);
20495
20496     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20497     RExC_emit = ptr;
20498     return(ret);
20499 }
20500
20501 /*
20502 - regpnode - emit a temporary node with a SV* argument
20503 */
20504 STATIC regnode_offset /* Location. */
20505 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20506 {
20507     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20508     regnode_offset ptr = ret;
20509
20510     PERL_ARGS_ASSERT_REGPNODE;
20511
20512     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20513     RExC_emit = ptr;
20514     return(ret);
20515 }
20516
20517 STATIC regnode_offset
20518 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20519 {
20520     /* emit a node with U32 and I32 arguments */
20521
20522     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20523     regnode_offset ptr = ret;
20524
20525     PERL_ARGS_ASSERT_REG2LANODE;
20526
20527     assert(regarglen[op] == 2);
20528
20529     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20530     RExC_emit = ptr;
20531     return(ret);
20532 }
20533
20534 /*
20535 - reginsert - insert an operator in front of already-emitted operand
20536 *
20537 * That means that on exit 'operand' is the offset of the newly inserted
20538 * operator, and the original operand has been relocated.
20539 *
20540 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20541 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20542 *
20543 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20544 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20545 *
20546 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20547 */
20548 STATIC void
20549 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20550                   const regnode_offset operand, const U32 depth)
20551 {
20552     regnode *src;
20553     regnode *dst;
20554     regnode *place;
20555     const int offset = regarglen[(U8)op];
20556     const int size = NODE_STEP_REGNODE + offset;
20557     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20558
20559     PERL_ARGS_ASSERT_REGINSERT;
20560     PERL_UNUSED_CONTEXT;
20561     PERL_UNUSED_ARG(depth);
20562 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20563     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20564     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20565                                     studying. If this is wrong then we need to adjust RExC_recurse
20566                                     below like we do with RExC_open_parens/RExC_close_parens. */
20567     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20568     src = REGNODE_p(RExC_emit);
20569     RExC_emit += size;
20570     dst = REGNODE_p(RExC_emit);
20571
20572     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20573      * and [perl #133871] shows this can lead to problems, so skip this
20574      * realignment of parens until a later pass when they are reliable */
20575     if (! IN_PARENS_PASS && RExC_open_parens) {
20576         int paren;
20577         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20578         /* remember that RExC_npar is rex->nparens + 1,
20579          * iow it is 1 more than the number of parens seen in
20580          * the pattern so far. */
20581         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20582             /* note, RExC_open_parens[0] is the start of the
20583              * regex, it can't move. RExC_close_parens[0] is the end
20584              * of the regex, it *can* move. */
20585             if ( paren && RExC_open_parens[paren] >= operand ) {
20586                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20587                 RExC_open_parens[paren] += size;
20588             } else {
20589                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20590             }
20591             if ( RExC_close_parens[paren] >= operand ) {
20592                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20593                 RExC_close_parens[paren] += size;
20594             } else {
20595                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20596             }
20597         }
20598     }
20599     if (RExC_end_op)
20600         RExC_end_op += size;
20601
20602     while (src > REGNODE_p(operand)) {
20603         StructCopy(--src, --dst, regnode);
20604 #ifdef RE_TRACK_PATTERN_OFFSETS
20605         if (RExC_offsets) {     /* MJD 20010112 */
20606             MJD_OFFSET_DEBUG(
20607                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20608                   "reginsert",
20609                   __LINE__,
20610                   PL_reg_name[op],
20611                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20612                     ? "Overwriting end of array!\n" : "OK",
20613                   (UV)REGNODE_OFFSET(src),
20614                   (UV)REGNODE_OFFSET(dst),
20615                   (UV)RExC_offsets[0]));
20616             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20617             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20618         }
20619 #endif
20620     }
20621
20622     place = REGNODE_p(operand); /* Op node, where operand used to be. */
20623 #ifdef RE_TRACK_PATTERN_OFFSETS
20624     if (RExC_offsets) {         /* MJD */
20625         MJD_OFFSET_DEBUG(
20626               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20627               "reginsert",
20628               __LINE__,
20629               PL_reg_name[op],
20630               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20631               ? "Overwriting end of array!\n" : "OK",
20632               (UV)REGNODE_OFFSET(place),
20633               (UV)(RExC_parse - RExC_start),
20634               (UV)RExC_offsets[0]));
20635         Set_Node_Offset(place, RExC_parse);
20636         Set_Node_Length(place, 1);
20637     }
20638 #endif
20639     src = NEXTOPER(place);
20640     FLAGS(place) = 0;
20641     FILL_NODE(operand, op);
20642
20643     /* Zero out any arguments in the new node */
20644     Zero(src, offset, regnode);
20645 }
20646
20647 /*
20648 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20649             that value won't fit in the space available, instead returns FALSE.
20650             (Except asserts if we can't fit in the largest space the regex
20651             engine is designed for.)
20652 - SEE ALSO: regtail_study
20653 */
20654 STATIC bool
20655 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20656                 const regnode_offset p,
20657                 const regnode_offset val,
20658                 const U32 depth)
20659 {
20660     regnode_offset scan;
20661     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20662
20663     PERL_ARGS_ASSERT_REGTAIL;
20664 #ifndef DEBUGGING
20665     PERL_UNUSED_ARG(depth);
20666 #endif
20667
20668     /* The final node in the chain is the first one with a nonzero next pointer
20669      * */
20670     scan = (regnode_offset) p;
20671     for (;;) {
20672         regnode * const temp = regnext(REGNODE_p(scan));
20673         DEBUG_PARSE_r({
20674             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20675             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20676             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
20677                 SvPV_nolen_const(RExC_mysv), scan,
20678                     (temp == NULL ? "->" : ""),
20679                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20680             );
20681         });
20682         if (temp == NULL)
20683             break;
20684         scan = REGNODE_OFFSET(temp);
20685     }
20686
20687     /* Populate this node's next pointer */
20688     assert(val >= scan);
20689     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20690         assert((UV) (val - scan) <= U32_MAX);
20691         ARG_SET(REGNODE_p(scan), val - scan);
20692     }
20693     else {
20694         if (val - scan > U16_MAX) {
20695             /* Populate this with something that won't loop and will likely
20696              * lead to a crash if the caller ignores the failure return, and
20697              * execution continues */
20698             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20699             return FALSE;
20700         }
20701         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20702     }
20703
20704     return TRUE;
20705 }
20706
20707 #ifdef DEBUGGING
20708 /*
20709 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20710 - Look for optimizable sequences at the same time.
20711 - currently only looks for EXACT chains.
20712
20713 This is experimental code. The idea is to use this routine to perform
20714 in place optimizations on branches and groups as they are constructed,
20715 with the long term intention of removing optimization from study_chunk so
20716 that it is purely analytical.
20717
20718 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20719 to control which is which.
20720
20721 This used to return a value that was ignored.  It was a problem that it is
20722 #ifdef'd to be another function that didn't return a value.  khw has changed it
20723 so both currently return a pass/fail return.
20724
20725 */
20726 /* TODO: All four parms should be const */
20727
20728 STATIC bool
20729 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20730                       const regnode_offset val, U32 depth)
20731 {
20732     regnode_offset scan;
20733     U8 exact = PSEUDO;
20734 #ifdef EXPERIMENTAL_INPLACESCAN
20735     I32 min = 0;
20736 #endif
20737     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20738
20739     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20740
20741
20742     /* Find last node. */
20743
20744     scan = p;
20745     for (;;) {
20746         regnode * const temp = regnext(REGNODE_p(scan));
20747 #ifdef EXPERIMENTAL_INPLACESCAN
20748         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20749             bool unfolded_multi_char;   /* Unexamined in this routine */
20750             if (join_exact(pRExC_state, scan, &min,
20751                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20752                 return TRUE; /* Was return EXACT */
20753         }
20754 #endif
20755         if ( exact ) {
20756             switch (OP(REGNODE_p(scan))) {
20757                 case LEXACT:
20758                 case EXACT:
20759                 case LEXACT_REQ8:
20760                 case EXACT_REQ8:
20761                 case EXACTL:
20762                 case EXACTF:
20763                 case EXACTFU_S_EDGE:
20764                 case EXACTFAA_NO_TRIE:
20765                 case EXACTFAA:
20766                 case EXACTFU:
20767                 case EXACTFU_REQ8:
20768                 case EXACTFLU8:
20769                 case EXACTFUP:
20770                 case EXACTFL:
20771                         if( exact == PSEUDO )
20772                             exact= OP(REGNODE_p(scan));
20773                         else if ( exact != OP(REGNODE_p(scan)) )
20774                             exact= 0;
20775                 case NOTHING:
20776                     break;
20777                 default:
20778                     exact= 0;
20779             }
20780         }
20781         DEBUG_PARSE_r({
20782             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20783             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20784             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
20785                 SvPV_nolen_const(RExC_mysv),
20786                 scan,
20787                 PL_reg_name[exact]);
20788         });
20789         if (temp == NULL)
20790             break;
20791         scan = REGNODE_OFFSET(temp);
20792     }
20793     DEBUG_PARSE_r({
20794         DEBUG_PARSE_MSG("");
20795         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20796         Perl_re_printf( aTHX_
20797                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20798                       SvPV_nolen_const(RExC_mysv),
20799                       (IV)val,
20800                       (IV)(val - scan)
20801         );
20802     });
20803     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20804         assert((UV) (val - scan) <= U32_MAX);
20805         ARG_SET(REGNODE_p(scan), val - scan);
20806     }
20807     else {
20808         if (val - scan > U16_MAX) {
20809             /* Populate this with something that won't loop and will likely
20810              * lead to a crash if the caller ignores the failure return, and
20811              * execution continues */
20812             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20813             return FALSE;
20814         }
20815         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20816     }
20817
20818     return TRUE; /* Was 'return exact' */
20819 }
20820 #endif
20821
20822 STATIC SV*
20823 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20824
20825     /* Returns an inversion list of all the code points matched by the
20826      * ANYOFM/NANYOFM node 'n' */
20827
20828     SV * cp_list = _new_invlist(-1);
20829     const U8 lowest = (U8) ARG(n);
20830     unsigned int i;
20831     U8 count = 0;
20832     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20833
20834     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20835
20836     /* Starting with the lowest code point, any code point that ANDed with the
20837      * mask yields the lowest code point is in the set */
20838     for (i = lowest; i <= 0xFF; i++) {
20839         if ((i & FLAGS(n)) == ARG(n)) {
20840             cp_list = add_cp_to_invlist(cp_list, i);
20841             count++;
20842
20843             /* We know how many code points (a power of two) that are in the
20844              * set.  No use looking once we've got that number */
20845             if (count >= needed) break;
20846         }
20847     }
20848
20849     if (OP(n) == NANYOFM) {
20850         _invlist_invert(cp_list);
20851     }
20852     return cp_list;
20853 }
20854
20855 /*
20856  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20857  */
20858 #ifdef DEBUGGING
20859
20860 static void
20861 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20862 {
20863     int bit;
20864     int set=0;
20865
20866     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20867
20868     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20869         if (flags & (1<<bit)) {
20870             if (!set++ && lead)
20871                 Perl_re_printf( aTHX_  "%s", lead);
20872             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20873         }
20874     }
20875     if (lead)  {
20876         if (set)
20877             Perl_re_printf( aTHX_  "\n");
20878         else
20879             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20880     }
20881 }
20882
20883 static void
20884 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20885 {
20886     int bit;
20887     int set=0;
20888     regex_charset cs;
20889
20890     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20891
20892     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20893         if (flags & (1<<bit)) {
20894             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20895                 continue;
20896             }
20897             if (!set++ && lead)
20898                 Perl_re_printf( aTHX_  "%s", lead);
20899             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20900         }
20901     }
20902     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20903             if (!set++ && lead) {
20904                 Perl_re_printf( aTHX_  "%s", lead);
20905             }
20906             switch (cs) {
20907                 case REGEX_UNICODE_CHARSET:
20908                     Perl_re_printf( aTHX_  "UNICODE");
20909                     break;
20910                 case REGEX_LOCALE_CHARSET:
20911                     Perl_re_printf( aTHX_  "LOCALE");
20912                     break;
20913                 case REGEX_ASCII_RESTRICTED_CHARSET:
20914                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20915                     break;
20916                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20917                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20918                     break;
20919                 default:
20920                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20921                     break;
20922             }
20923     }
20924     if (lead)  {
20925         if (set)
20926             Perl_re_printf( aTHX_  "\n");
20927         else
20928             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20929     }
20930 }
20931 #endif
20932
20933 void
20934 Perl_regdump(pTHX_ const regexp *r)
20935 {
20936 #ifdef DEBUGGING
20937     int i;
20938     SV * const sv = sv_newmortal();
20939     SV *dsv= sv_newmortal();
20940     RXi_GET_DECL(r, ri);
20941     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20942
20943     PERL_ARGS_ASSERT_REGDUMP;
20944
20945     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20946
20947     /* Header fields of interest. */
20948     for (i = 0; i < 2; i++) {
20949         if (r->substrs->data[i].substr) {
20950             RE_PV_QUOTED_DECL(s, 0, dsv,
20951                             SvPVX_const(r->substrs->data[i].substr),
20952                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20953                             PL_dump_re_max_len);
20954             Perl_re_printf( aTHX_
20955                           "%s %s%s at %" IVdf "..%" UVuf " ",
20956                           i ? "floating" : "anchored",
20957                           s,
20958                           RE_SV_TAIL(r->substrs->data[i].substr),
20959                           (IV)r->substrs->data[i].min_offset,
20960                           (UV)r->substrs->data[i].max_offset);
20961         }
20962         else if (r->substrs->data[i].utf8_substr) {
20963             RE_PV_QUOTED_DECL(s, 1, dsv,
20964                             SvPVX_const(r->substrs->data[i].utf8_substr),
20965                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20966                             30);
20967             Perl_re_printf( aTHX_
20968                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20969                           i ? "floating" : "anchored",
20970                           s,
20971                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20972                           (IV)r->substrs->data[i].min_offset,
20973                           (UV)r->substrs->data[i].max_offset);
20974         }
20975     }
20976
20977     if (r->check_substr || r->check_utf8)
20978         Perl_re_printf( aTHX_
20979                       (const char *)
20980                       (   r->check_substr == r->substrs->data[1].substr
20981                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20982                        ? "(checking floating" : "(checking anchored"));
20983     if (r->intflags & PREGf_NOSCAN)
20984         Perl_re_printf( aTHX_  " noscan");
20985     if (r->extflags & RXf_CHECK_ALL)
20986         Perl_re_printf( aTHX_  " isall");
20987     if (r->check_substr || r->check_utf8)
20988         Perl_re_printf( aTHX_  ") ");
20989
20990     if (ri->regstclass) {
20991         regprop(r, sv, ri->regstclass, NULL, NULL);
20992         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20993     }
20994     if (r->intflags & PREGf_ANCH) {
20995         Perl_re_printf( aTHX_  "anchored");
20996         if (r->intflags & PREGf_ANCH_MBOL)
20997             Perl_re_printf( aTHX_  "(MBOL)");
20998         if (r->intflags & PREGf_ANCH_SBOL)
20999             Perl_re_printf( aTHX_  "(SBOL)");
21000         if (r->intflags & PREGf_ANCH_GPOS)
21001             Perl_re_printf( aTHX_  "(GPOS)");
21002         Perl_re_printf( aTHX_ " ");
21003     }
21004     if (r->intflags & PREGf_GPOS_SEEN)
21005         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
21006     if (r->intflags & PREGf_SKIP)
21007         Perl_re_printf( aTHX_  "plus ");
21008     if (r->intflags & PREGf_IMPLICIT)
21009         Perl_re_printf( aTHX_  "implicit ");
21010     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
21011     if (r->extflags & RXf_EVAL_SEEN)
21012         Perl_re_printf( aTHX_  "with eval ");
21013     Perl_re_printf( aTHX_  "\n");
21014     DEBUG_FLAGS_r({
21015         regdump_extflags("r->extflags: ", r->extflags);
21016         regdump_intflags("r->intflags: ", r->intflags);
21017     });
21018 #else
21019     PERL_ARGS_ASSERT_REGDUMP;
21020     PERL_UNUSED_CONTEXT;
21021     PERL_UNUSED_ARG(r);
21022 #endif  /* DEBUGGING */
21023 }
21024
21025 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21026 #ifdef DEBUGGING
21027
21028 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
21029      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
21030      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
21031      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
21032      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
21033      || _CC_VERTSPACE != 15
21034 #   error Need to adjust order of anyofs[]
21035 #  endif
21036 static const char * const anyofs[] = {
21037     "\\w",
21038     "\\W",
21039     "\\d",
21040     "\\D",
21041     "[:alpha:]",
21042     "[:^alpha:]",
21043     "[:lower:]",
21044     "[:^lower:]",
21045     "[:upper:]",
21046     "[:^upper:]",
21047     "[:punct:]",
21048     "[:^punct:]",
21049     "[:print:]",
21050     "[:^print:]",
21051     "[:alnum:]",
21052     "[:^alnum:]",
21053     "[:graph:]",
21054     "[:^graph:]",
21055     "[:cased:]",
21056     "[:^cased:]",
21057     "\\s",
21058     "\\S",
21059     "[:blank:]",
21060     "[:^blank:]",
21061     "[:xdigit:]",
21062     "[:^xdigit:]",
21063     "[:cntrl:]",
21064     "[:^cntrl:]",
21065     "[:ascii:]",
21066     "[:^ascii:]",
21067     "\\v",
21068     "\\V"
21069 };
21070 #endif
21071
21072 /*
21073 - regprop - printable representation of opcode, with run time support
21074 */
21075
21076 void
21077 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21078 {
21079 #ifdef DEBUGGING
21080     int k;
21081     RXi_GET_DECL(prog, progi);
21082     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21083
21084     PERL_ARGS_ASSERT_REGPROP;
21085
21086     SvPVCLEAR(sv);
21087
21088     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
21089         if (pRExC_state) {  /* This gives more info, if we have it */
21090             FAIL3("panic: corrupted regexp opcode %d > %d",
21091                   (int)OP(o), (int)REGNODE_MAX);
21092         }
21093         else {
21094             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21095                              (int)OP(o), (int)REGNODE_MAX);
21096         }
21097     }
21098     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21099
21100     k = PL_regkind[OP(o)];
21101
21102     if (k == EXACT) {
21103         sv_catpvs(sv, " ");
21104         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21105          * is a crude hack but it may be the best for now since
21106          * we have no flag "this EXACTish node was UTF-8"
21107          * --jhi */
21108         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21109                   PL_colors[0], PL_colors[1],
21110                   PERL_PV_ESCAPE_UNI_DETECT |
21111                   PERL_PV_ESCAPE_NONASCII   |
21112                   PERL_PV_PRETTY_ELLIPSES   |
21113                   PERL_PV_PRETTY_LTGT       |
21114                   PERL_PV_PRETTY_NOCLEAR
21115                   );
21116     } else if (k == TRIE) {
21117         /* print the details of the trie in dumpuntil instead, as
21118          * progi->data isn't available here */
21119         const char op = OP(o);
21120         const U32 n = ARG(o);
21121         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21122                (reg_ac_data *)progi->data->data[n] :
21123                NULL;
21124         const reg_trie_data * const trie
21125             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21126
21127         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21128         DEBUG_TRIE_COMPILE_r({
21129           if (trie->jump)
21130             sv_catpvs(sv, "(JUMP)");
21131           Perl_sv_catpvf(aTHX_ sv,
21132             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21133             (UV)trie->startstate,
21134             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21135             (UV)trie->wordcount,
21136             (UV)trie->minlen,
21137             (UV)trie->maxlen,
21138             (UV)TRIE_CHARCOUNT(trie),
21139             (UV)trie->uniquecharcount
21140           );
21141         });
21142         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21143             sv_catpvs(sv, "[");
21144             (void) put_charclass_bitmap_innards(sv,
21145                                                 ((IS_ANYOF_TRIE(op))
21146                                                  ? ANYOF_BITMAP(o)
21147                                                  : TRIE_BITMAP(trie)),
21148                                                 NULL,
21149                                                 NULL,
21150                                                 NULL,
21151                                                 0,
21152                                                 FALSE
21153                                                );
21154             sv_catpvs(sv, "]");
21155         }
21156     } else if (k == CURLY) {
21157         U32 lo = ARG1(o), hi = ARG2(o);
21158         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21159             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21160         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21161         if (hi == REG_INFTY)
21162             sv_catpvs(sv, "INFTY");
21163         else
21164             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21165         sv_catpvs(sv, "}");
21166     }
21167     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
21168         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21169     else if (k == REF || k == OPEN || k == CLOSE
21170              || k == GROUPP || OP(o)==ACCEPT)
21171     {
21172         AV *name_list= NULL;
21173         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21174         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
21175         if ( RXp_PAREN_NAMES(prog) ) {
21176             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21177         } else if ( pRExC_state ) {
21178             name_list= RExC_paren_name_list;
21179         }
21180         if (name_list) {
21181             if ( k != REF || (OP(o) < REFN)) {
21182                 SV **name= av_fetch(name_list, parno, 0 );
21183                 if (name)
21184                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21185             }
21186             else {
21187                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21188                 I32 *nums=(I32*)SvPVX(sv_dat);
21189                 SV **name= av_fetch(name_list, nums[0], 0 );
21190                 I32 n;
21191                 if (name) {
21192                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
21193                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21194                                     (n ? "," : ""), (IV)nums[n]);
21195                     }
21196                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21197                 }
21198             }
21199         }
21200         if ( k == REF && reginfo) {
21201             U32 n = ARG(o);  /* which paren pair */
21202             I32 ln = prog->offs[n].start;
21203             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21204                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21205             else if (ln == prog->offs[n].end)
21206                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21207             else {
21208                 const char *s = reginfo->strbeg + ln;
21209                 Perl_sv_catpvf(aTHX_ sv, ": ");
21210                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21211                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21212             }
21213         }
21214     } else if (k == GOSUB) {
21215         AV *name_list= NULL;
21216         if ( RXp_PAREN_NAMES(prog) ) {
21217             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21218         } else if ( pRExC_state ) {
21219             name_list= RExC_paren_name_list;
21220         }
21221
21222         /* Paren and offset */
21223         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21224                 (int)((o + (int)ARG2L(o)) - progi->program) );
21225         if (name_list) {
21226             SV **name= av_fetch(name_list, ARG(o), 0 );
21227             if (name)
21228                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21229         }
21230     }
21231     else if (k == LOGICAL)
21232         /* 2: embedded, otherwise 1 */
21233         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21234     else if (k == ANYOF || k == ANYOFR) {
21235         U8 flags;
21236         char * bitmap;
21237         U32 arg;
21238         bool do_sep = FALSE;    /* Do we need to separate various components of
21239                                    the output? */
21240         /* Set if there is still an unresolved user-defined property */
21241         SV *unresolved                = NULL;
21242
21243         /* Things that are ignored except when the runtime locale is UTF-8 */
21244         SV *only_utf8_locale_invlist = NULL;
21245
21246         /* Code points that don't fit in the bitmap */
21247         SV *nonbitmap_invlist = NULL;
21248
21249         /* And things that aren't in the bitmap, but are small enough to be */
21250         SV* bitmap_range_not_in_bitmap = NULL;
21251
21252         bool inverted;
21253
21254         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21255             flags = 0;
21256             bitmap = NULL;
21257             arg = 0;
21258         }
21259         else {
21260             flags = ANYOF_FLAGS(o);
21261             bitmap = ANYOF_BITMAP(o);
21262             arg = ARG(o);
21263         }
21264
21265         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21266             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21267                 sv_catpvs(sv, "{utf8-locale-reqd}");
21268             }
21269             if (flags & ANYOFL_FOLD) {
21270                 sv_catpvs(sv, "{i}");
21271             }
21272         }
21273
21274         inverted = flags & ANYOF_INVERT;
21275
21276         /* If there is stuff outside the bitmap, get it */
21277         if (arg != ANYOF_ONLY_HAS_BITMAP) {
21278             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21279                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21280                                             ANYOFRbase(o),
21281                                             ANYOFRbase(o) + ANYOFRdelta(o));
21282             }
21283             else {
21284 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21285                 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21286                                                 &unresolved,
21287                                                 &only_utf8_locale_invlist,
21288                                                 &nonbitmap_invlist);
21289 #else
21290                 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21291                                                 &unresolved,
21292                                                 &only_utf8_locale_invlist,
21293                                                 &nonbitmap_invlist);
21294 #endif
21295             }
21296
21297             /* The non-bitmap data may contain stuff that could fit in the
21298              * bitmap.  This could come from a user-defined property being
21299              * finally resolved when this call was done; or much more likely
21300              * because there are matches that require UTF-8 to be valid, and so
21301              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
21302             _invlist_intersection(nonbitmap_invlist,
21303                                   PL_InBitmap,
21304                                   &bitmap_range_not_in_bitmap);
21305             /* Leave just the things that don't fit into the bitmap */
21306             _invlist_subtract(nonbitmap_invlist,
21307                               PL_InBitmap,
21308                               &nonbitmap_invlist);
21309         }
21310
21311         /* Obey this flag to add all above-the-bitmap code points */
21312         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21313             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21314                                                       NUM_ANYOF_CODE_POINTS,
21315                                                       UV_MAX);
21316         }
21317
21318         /* Ready to start outputting.  First, the initial left bracket */
21319         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21320
21321         /* ANYOFH by definition doesn't have anything that will fit inside the
21322          * bitmap;  ANYOFR may or may not. */
21323         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21324             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21325                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21326         {
21327             /* Then all the things that could fit in the bitmap */
21328             do_sep = put_charclass_bitmap_innards(sv,
21329                                                   bitmap,
21330                                                   bitmap_range_not_in_bitmap,
21331                                                   only_utf8_locale_invlist,
21332                                                   o,
21333                                                   flags,
21334
21335                                                   /* Can't try inverting for a
21336                                                    * better display if there
21337                                                    * are things that haven't
21338                                                    * been resolved */
21339                                                   unresolved != NULL
21340                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
21341             SvREFCNT_dec(bitmap_range_not_in_bitmap);
21342
21343             /* If there are user-defined properties which haven't been defined
21344              * yet, output them.  If the result is not to be inverted, it is
21345              * clearest to output them in a separate [] from the bitmap range
21346              * stuff.  If the result is to be complemented, we have to show
21347              * everything in one [], as the inversion applies to the whole
21348              * thing.  Use {braces} to separate them from anything in the
21349              * bitmap and anything above the bitmap. */
21350             if (unresolved) {
21351                 if (inverted) {
21352                     if (! do_sep) { /* If didn't output anything in the bitmap
21353                                      */
21354                         sv_catpvs(sv, "^");
21355                     }
21356                     sv_catpvs(sv, "{");
21357                 }
21358                 else if (do_sep) {
21359                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21360                                                       PL_colors[0]);
21361                 }
21362                 sv_catsv(sv, unresolved);
21363                 if (inverted) {
21364                     sv_catpvs(sv, "}");
21365                 }
21366                 do_sep = ! inverted;
21367             }
21368         }
21369
21370         /* And, finally, add the above-the-bitmap stuff */
21371         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21372             SV* contents;
21373
21374             /* See if truncation size is overridden */
21375             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21376                                     ? PL_dump_re_max_len
21377                                     : 256;
21378
21379             /* This is output in a separate [] */
21380             if (do_sep) {
21381                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21382             }
21383
21384             /* And, for easy of understanding, it is shown in the
21385              * uncomplemented form if possible.  The one exception being if
21386              * there are unresolved items, where the inversion has to be
21387              * delayed until runtime */
21388             if (inverted && ! unresolved) {
21389                 _invlist_invert(nonbitmap_invlist);
21390                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21391             }
21392
21393             contents = invlist_contents(nonbitmap_invlist,
21394                                         FALSE /* output suitable for catsv */
21395                                        );
21396
21397             /* If the output is shorter than the permissible maximum, just do it. */
21398             if (SvCUR(contents) <= dump_len) {
21399                 sv_catsv(sv, contents);
21400             }
21401             else {
21402                 const char * contents_string = SvPVX(contents);
21403                 STRLEN i = dump_len;
21404
21405                 /* Otherwise, start at the permissible max and work back to the
21406                  * first break possibility */
21407                 while (i > 0 && contents_string[i] != ' ') {
21408                     i--;
21409                 }
21410                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21411                                        find a legal break */
21412                     i = dump_len;
21413                 }
21414
21415                 sv_catpvn(sv, contents_string, i);
21416                 sv_catpvs(sv, "...");
21417             }
21418
21419             SvREFCNT_dec_NN(contents);
21420             SvREFCNT_dec_NN(nonbitmap_invlist);
21421         }
21422
21423         /* And finally the matching, closing ']' */
21424         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21425
21426         if (OP(o) == ANYOFHs) {
21427             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21428         }
21429         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21430             U8 lowest = (OP(o) != ANYOFHr)
21431                          ? FLAGS(o)
21432                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21433             U8 highest = (OP(o) == ANYOFHr)
21434                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21435                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21436                            ? 0xFF
21437                            : lowest;
21438 #ifndef EBCDIC
21439             if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21440 #endif
21441             {
21442                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21443                 if (lowest != highest) {
21444                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21445                 }
21446                 Perl_sv_catpvf(aTHX_ sv, ")");
21447             }
21448         }
21449
21450         SvREFCNT_dec(unresolved);
21451     }
21452     else if (k == ANYOFM) {
21453         SV * cp_list = get_ANYOFM_contents(o);
21454
21455         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21456         if (OP(o) == NANYOFM) {
21457             _invlist_invert(cp_list);
21458         }
21459
21460         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21461         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21462
21463         SvREFCNT_dec(cp_list);
21464     }
21465     else if (k == POSIXD || k == NPOSIXD) {
21466         U8 index = FLAGS(o) * 2;
21467         if (index < C_ARRAY_LENGTH(anyofs)) {
21468             if (*anyofs[index] != '[')  {
21469                 sv_catpvs(sv, "[");
21470             }
21471             sv_catpv(sv, anyofs[index]);
21472             if (*anyofs[index] != '[')  {
21473                 sv_catpvs(sv, "]");
21474             }
21475         }
21476         else {
21477             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21478         }
21479     }
21480     else if (k == BOUND || k == NBOUND) {
21481         /* Must be synced with order of 'bound_type' in regcomp.h */
21482         const char * const bounds[] = {
21483             "",      /* Traditional */
21484             "{gcb}",
21485             "{lb}",
21486             "{sb}",
21487             "{wb}"
21488         };
21489         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21490         sv_catpv(sv, bounds[FLAGS(o)]);
21491     }
21492     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21493         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21494         if (o->next_off) {
21495             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21496         }
21497         Perl_sv_catpvf(aTHX_ sv, "]");
21498     }
21499     else if (OP(o) == SBOL)
21500         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21501
21502     /* add on the verb argument if there is one */
21503     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21504         if ( ARG(o) )
21505             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21506                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21507         else
21508             sv_catpvs(sv, ":NULL");
21509     }
21510 #else
21511     PERL_UNUSED_CONTEXT;
21512     PERL_UNUSED_ARG(sv);
21513     PERL_UNUSED_ARG(o);
21514     PERL_UNUSED_ARG(prog);
21515     PERL_UNUSED_ARG(reginfo);
21516     PERL_UNUSED_ARG(pRExC_state);
21517 #endif  /* DEBUGGING */
21518 }
21519
21520
21521
21522 SV *
21523 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21524 {                               /* Assume that RE_INTUIT is set */
21525     /* Returns an SV containing a string that must appear in the target for it
21526      * to match, or NULL if nothing is known that must match.
21527      *
21528      * CAUTION: the SV can be freed during execution of the regex engine */
21529
21530     struct regexp *const prog = ReANY(r);
21531     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21532
21533     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21534     PERL_UNUSED_CONTEXT;
21535
21536     DEBUG_COMPILE_r(
21537         {
21538             if (prog->maxlen > 0) {
21539                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21540                       ? prog->check_utf8 : prog->check_substr);
21541
21542                 if (!PL_colorset) reginitcolors();
21543                 Perl_re_printf( aTHX_
21544                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21545                       PL_colors[4],
21546                       RX_UTF8(r) ? "utf8 " : "",
21547                       PL_colors[5], PL_colors[0],
21548                       s,
21549                       PL_colors[1],
21550                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21551             }
21552         } );
21553
21554     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21555     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21556 }
21557
21558 /*
21559    pregfree()
21560
21561    handles refcounting and freeing the perl core regexp structure. When
21562    it is necessary to actually free the structure the first thing it
21563    does is call the 'free' method of the regexp_engine associated to
21564    the regexp, allowing the handling of the void *pprivate; member
21565    first. (This routine is not overridable by extensions, which is why
21566    the extensions free is called first.)
21567
21568    See regdupe and regdupe_internal if you change anything here.
21569 */
21570 #ifndef PERL_IN_XSUB_RE
21571 void
21572 Perl_pregfree(pTHX_ REGEXP *r)
21573 {
21574     SvREFCNT_dec(r);
21575 }
21576
21577 void
21578 Perl_pregfree2(pTHX_ REGEXP *rx)
21579 {
21580     struct regexp *const r = ReANY(rx);
21581     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21582
21583     PERL_ARGS_ASSERT_PREGFREE2;
21584
21585     if (! r)
21586         return;
21587
21588     if (r->mother_re) {
21589         ReREFCNT_dec(r->mother_re);
21590     } else {
21591         CALLREGFREE_PVT(rx); /* free the private data */
21592         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21593     }
21594     if (r->substrs) {
21595         int i;
21596         for (i = 0; i < 2; i++) {
21597             SvREFCNT_dec(r->substrs->data[i].substr);
21598             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21599         }
21600         Safefree(r->substrs);
21601     }
21602     RX_MATCH_COPY_FREE(rx);
21603 #ifdef PERL_ANY_COW
21604     SvREFCNT_dec(r->saved_copy);
21605 #endif
21606     Safefree(r->offs);
21607     SvREFCNT_dec(r->qr_anoncv);
21608     if (r->recurse_locinput)
21609         Safefree(r->recurse_locinput);
21610 }
21611
21612
21613 /*  reg_temp_copy()
21614
21615     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21616     except that dsv will be created if NULL.
21617
21618     This function is used in two main ways. First to implement
21619         $r = qr/....; $s = $$r;
21620
21621     Secondly, it is used as a hacky workaround to the structural issue of
21622     match results
21623     being stored in the regexp structure which is in turn stored in
21624     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21625     could be PL_curpm in multiple contexts, and could require multiple
21626     result sets being associated with the pattern simultaneously, such
21627     as when doing a recursive match with (??{$qr})
21628
21629     The solution is to make a lightweight copy of the regexp structure
21630     when a qr// is returned from the code executed by (??{$qr}) this
21631     lightweight copy doesn't actually own any of its data except for
21632     the starp/end and the actual regexp structure itself.
21633
21634 */
21635
21636
21637 REGEXP *
21638 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21639 {
21640     struct regexp *drx;
21641     struct regexp *const srx = ReANY(ssv);
21642     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21643
21644     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21645
21646     if (!dsv)
21647         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21648     else {
21649         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21650
21651         /* our only valid caller, sv_setsv_flags(), should have done
21652          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21653         assert(!SvOOK(dsv));
21654         assert(!SvIsCOW(dsv));
21655         assert(!SvROK(dsv));
21656
21657         if (SvPVX_const(dsv)) {
21658             if (SvLEN(dsv))
21659                 Safefree(SvPVX(dsv));
21660             SvPVX(dsv) = NULL;
21661         }
21662         SvLEN_set(dsv, 0);
21663         SvCUR_set(dsv, 0);
21664         SvOK_off((SV *)dsv);
21665
21666         if (islv) {
21667             /* For PVLVs, the head (sv_any) points to an XPVLV, while
21668              * the LV's xpvlenu_rx will point to a regexp body, which
21669              * we allocate here */
21670             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21671             assert(!SvPVX(dsv));
21672             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21673             temp->sv_any = NULL;
21674             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21675             SvREFCNT_dec_NN(temp);
21676             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21677                ing below will not set it. */
21678             SvCUR_set(dsv, SvCUR(ssv));
21679         }
21680     }
21681     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21682        sv_force_normal(sv) is called.  */
21683     SvFAKE_on(dsv);
21684     drx = ReANY(dsv);
21685
21686     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21687     SvPV_set(dsv, RX_WRAPPED(ssv));
21688     /* We share the same string buffer as the original regexp, on which we
21689        hold a reference count, incremented when mother_re is set below.
21690        The string pointer is copied here, being part of the regexp struct.
21691      */
21692     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21693            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21694     if (!islv)
21695         SvLEN_set(dsv, 0);
21696     if (srx->offs) {
21697         const I32 npar = srx->nparens+1;
21698         Newx(drx->offs, npar, regexp_paren_pair);
21699         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21700     }
21701     if (srx->substrs) {
21702         int i;
21703         Newx(drx->substrs, 1, struct reg_substr_data);
21704         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21705
21706         for (i = 0; i < 2; i++) {
21707             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21708             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21709         }
21710
21711         /* check_substr and check_utf8, if non-NULL, point to either their
21712            anchored or float namesakes, and don't hold a second reference.  */
21713     }
21714     RX_MATCH_COPIED_off(dsv);
21715 #ifdef PERL_ANY_COW
21716     drx->saved_copy = NULL;
21717 #endif
21718     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21719     SvREFCNT_inc_void(drx->qr_anoncv);
21720     if (srx->recurse_locinput)
21721         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21722
21723     return dsv;
21724 }
21725 #endif
21726
21727
21728 /* regfree_internal()
21729
21730    Free the private data in a regexp. This is overloadable by
21731    extensions. Perl takes care of the regexp structure in pregfree(),
21732    this covers the *pprivate pointer which technically perl doesn't
21733    know about, however of course we have to handle the
21734    regexp_internal structure when no extension is in use.
21735
21736    Note this is called before freeing anything in the regexp
21737    structure.
21738  */
21739
21740 void
21741 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21742 {
21743     struct regexp *const r = ReANY(rx);
21744     RXi_GET_DECL(r, ri);
21745     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21746
21747     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21748
21749     if (! ri) {
21750         return;
21751     }
21752
21753     DEBUG_COMPILE_r({
21754         if (!PL_colorset)
21755             reginitcolors();
21756         {
21757             SV *dsv= sv_newmortal();
21758             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21759                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21760             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21761                 PL_colors[4], PL_colors[5], s);
21762         }
21763     });
21764
21765 #ifdef RE_TRACK_PATTERN_OFFSETS
21766     if (ri->u.offsets)
21767         Safefree(ri->u.offsets);             /* 20010421 MJD */
21768 #endif
21769     if (ri->code_blocks)
21770         S_free_codeblocks(aTHX_ ri->code_blocks);
21771
21772     if (ri->data) {
21773         int n = ri->data->count;
21774
21775         while (--n >= 0) {
21776           /* If you add a ->what type here, update the comment in regcomp.h */
21777             switch (ri->data->what[n]) {
21778             case 'a':
21779             case 'r':
21780             case 's':
21781             case 'S':
21782             case 'u':
21783                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21784                 break;
21785             case 'f':
21786                 Safefree(ri->data->data[n]);
21787                 break;
21788             case 'l':
21789             case 'L':
21790                 break;
21791             case 'T':
21792                 { /* Aho Corasick add-on structure for a trie node.
21793                      Used in stclass optimization only */
21794                     U32 refcount;
21795                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21796 #ifdef USE_ITHREADS
21797 #endif
21798                     OP_REFCNT_LOCK;
21799                     refcount = --aho->refcount;
21800                     OP_REFCNT_UNLOCK;
21801                     if ( !refcount ) {
21802                         PerlMemShared_free(aho->states);
21803                         PerlMemShared_free(aho->fail);
21804                          /* do this last!!!! */
21805                         PerlMemShared_free(ri->data->data[n]);
21806                         /* we should only ever get called once, so
21807                          * assert as much, and also guard the free
21808                          * which /might/ happen twice. At the least
21809                          * it will make code anlyzers happy and it
21810                          * doesn't cost much. - Yves */
21811                         assert(ri->regstclass);
21812                         if (ri->regstclass) {
21813                             PerlMemShared_free(ri->regstclass);
21814                             ri->regstclass = 0;
21815                         }
21816                     }
21817                 }
21818                 break;
21819             case 't':
21820                 {
21821                     /* trie structure. */
21822                     U32 refcount;
21823                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21824 #ifdef USE_ITHREADS
21825 #endif
21826                     OP_REFCNT_LOCK;
21827                     refcount = --trie->refcount;
21828                     OP_REFCNT_UNLOCK;
21829                     if ( !refcount ) {
21830                         PerlMemShared_free(trie->charmap);
21831                         PerlMemShared_free(trie->states);
21832                         PerlMemShared_free(trie->trans);
21833                         if (trie->bitmap)
21834                             PerlMemShared_free(trie->bitmap);
21835                         if (trie->jump)
21836                             PerlMemShared_free(trie->jump);
21837                         PerlMemShared_free(trie->wordinfo);
21838                         /* do this last!!!! */
21839                         PerlMemShared_free(ri->data->data[n]);
21840                     }
21841                 }
21842                 break;
21843             default:
21844                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21845                                                     ri->data->what[n]);
21846             }
21847         }
21848         Safefree(ri->data->what);
21849         Safefree(ri->data);
21850     }
21851
21852     Safefree(ri);
21853 }
21854
21855 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21856 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21857 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
21858
21859 /*
21860 =for apidoc_section REGEXP Functions
21861 =for apidoc re_dup_guts
21862 Duplicate a regexp.
21863
21864 This routine is expected to clone a given regexp structure. It is only
21865 compiled under USE_ITHREADS.
21866
21867 After all of the core data stored in struct regexp is duplicated
21868 the regexp_engine.dupe method is used to copy any private data
21869 stored in the *pprivate pointer. This allows extensions to handle
21870 any duplication they need to do.
21871
21872 =cut
21873
21874    See pregfree() and regfree_internal() if you change anything here.
21875 */
21876 #if defined(USE_ITHREADS)
21877 #ifndef PERL_IN_XSUB_RE
21878 void
21879 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21880 {
21881     I32 npar;
21882     const struct regexp *r = ReANY(sstr);
21883     struct regexp *ret = ReANY(dstr);
21884
21885     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21886
21887     npar = r->nparens+1;
21888     Newx(ret->offs, npar, regexp_paren_pair);
21889     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21890
21891     if (ret->substrs) {
21892         /* Do it this way to avoid reading from *r after the StructCopy().
21893            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21894            cache, it doesn't matter.  */
21895         int i;
21896         const bool anchored = r->check_substr
21897             ? r->check_substr == r->substrs->data[0].substr
21898             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21899         Newx(ret->substrs, 1, struct reg_substr_data);
21900         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21901
21902         for (i = 0; i < 2; i++) {
21903             ret->substrs->data[i].substr =
21904                         sv_dup_inc(ret->substrs->data[i].substr, param);
21905             ret->substrs->data[i].utf8_substr =
21906                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21907         }
21908
21909         /* check_substr and check_utf8, if non-NULL, point to either their
21910            anchored or float namesakes, and don't hold a second reference.  */
21911
21912         if (ret->check_substr) {
21913             if (anchored) {
21914                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21915
21916                 ret->check_substr = ret->substrs->data[0].substr;
21917                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21918             } else {
21919                 assert(r->check_substr == r->substrs->data[1].substr);
21920                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21921
21922                 ret->check_substr = ret->substrs->data[1].substr;
21923                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21924             }
21925         } else if (ret->check_utf8) {
21926             if (anchored) {
21927                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21928             } else {
21929                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21930             }
21931         }
21932     }
21933
21934     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21935     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21936     if (r->recurse_locinput)
21937         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21938
21939     if (ret->pprivate)
21940         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21941
21942     if (RX_MATCH_COPIED(dstr))
21943         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21944     else
21945         ret->subbeg = NULL;
21946 #ifdef PERL_ANY_COW
21947     ret->saved_copy = NULL;
21948 #endif
21949
21950     /* Whether mother_re be set or no, we need to copy the string.  We
21951        cannot refrain from copying it when the storage points directly to
21952        our mother regexp, because that's
21953                1: a buffer in a different thread
21954                2: something we no longer hold a reference on
21955                so we need to copy it locally.  */
21956     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21957     /* set malloced length to a non-zero value so it will be freed
21958      * (otherwise in combination with SVf_FAKE it looks like an alien
21959      * buffer). It doesn't have to be the actual malloced size, since it
21960      * should never be grown */
21961     SvLEN_set(dstr, SvCUR(sstr)+1);
21962     ret->mother_re   = NULL;
21963 }
21964 #endif /* PERL_IN_XSUB_RE */
21965
21966 /*
21967    regdupe_internal()
21968
21969    This is the internal complement to regdupe() which is used to copy
21970    the structure pointed to by the *pprivate pointer in the regexp.
21971    This is the core version of the extension overridable cloning hook.
21972    The regexp structure being duplicated will be copied by perl prior
21973    to this and will be provided as the regexp *r argument, however
21974    with the /old/ structures pprivate pointer value. Thus this routine
21975    may override any copying normally done by perl.
21976
21977    It returns a pointer to the new regexp_internal structure.
21978 */
21979
21980 void *
21981 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21982 {
21983     struct regexp *const r = ReANY(rx);
21984     regexp_internal *reti;
21985     int len;
21986     RXi_GET_DECL(r, ri);
21987
21988     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21989
21990     len = ProgLen(ri);
21991
21992     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21993           char, regexp_internal);
21994     Copy(ri->program, reti->program, len+1, regnode);
21995
21996
21997     if (ri->code_blocks) {
21998         int n;
21999         Newx(reti->code_blocks, 1, struct reg_code_blocks);
22000         Newx(reti->code_blocks->cb, ri->code_blocks->count,
22001                     struct reg_code_block);
22002         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22003              ri->code_blocks->count, struct reg_code_block);
22004         for (n = 0; n < ri->code_blocks->count; n++)
22005              reti->code_blocks->cb[n].src_regex = (REGEXP*)
22006                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22007         reti->code_blocks->count = ri->code_blocks->count;
22008         reti->code_blocks->refcnt = 1;
22009     }
22010     else
22011         reti->code_blocks = NULL;
22012
22013     reti->regstclass = NULL;
22014
22015     if (ri->data) {
22016         struct reg_data *d;
22017         const int count = ri->data->count;
22018         int i;
22019
22020         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22021                 char, struct reg_data);
22022         Newx(d->what, count, U8);
22023
22024         d->count = count;
22025         for (i = 0; i < count; i++) {
22026             d->what[i] = ri->data->what[i];
22027             switch (d->what[i]) {
22028                 /* see also regcomp.h and regfree_internal() */
22029             case 'a': /* actually an AV, but the dup function is identical.
22030                          values seem to be "plain sv's" generally. */
22031             case 'r': /* a compiled regex (but still just another SV) */
22032             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22033                          this use case should go away, the code could have used
22034                          'a' instead - see S_set_ANYOF_arg() for array contents. */
22035             case 'S': /* actually an SV, but the dup function is identical.  */
22036             case 'u': /* actually an HV, but the dup function is identical.
22037                          values are "plain sv's" */
22038                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22039                 break;
22040             case 'f':
22041                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22042                  * patterns which could start with several different things. Pre-TRIE
22043                  * this was more important than it is now, however this still helps
22044                  * in some places, for instance /x?a+/ might produce a SSC equivalent
22045                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22046                  * in regexec.c
22047                  */
22048                 /* This is cheating. */
22049                 Newx(d->data[i], 1, regnode_ssc);
22050                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22051                 reti->regstclass = (regnode*)d->data[i];
22052                 break;
22053             case 'T':
22054                 /* AHO-CORASICK fail table */
22055                 /* Trie stclasses are readonly and can thus be shared
22056                  * without duplication. We free the stclass in pregfree
22057                  * when the corresponding reg_ac_data struct is freed.
22058                  */
22059                 reti->regstclass= ri->regstclass;
22060                 /* FALLTHROUGH */
22061             case 't':
22062                 /* TRIE transition table */
22063                 OP_REFCNT_LOCK;
22064                 ((reg_trie_data*)ri->data->data[i])->refcount++;
22065                 OP_REFCNT_UNLOCK;
22066                 /* FALLTHROUGH */
22067             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22068             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22069                          is not from another regexp */
22070                 d->data[i] = ri->data->data[i];
22071                 break;
22072             default:
22073                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22074                                                            ri->data->what[i]);
22075             }
22076         }
22077
22078         reti->data = d;
22079     }
22080     else
22081         reti->data = NULL;
22082
22083     reti->name_list_idx = ri->name_list_idx;
22084
22085 #ifdef RE_TRACK_PATTERN_OFFSETS
22086     if (ri->u.offsets) {
22087         Newx(reti->u.offsets, 2*len+1, U32);
22088         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22089     }
22090 #else
22091     SetProgLen(reti, len);
22092 #endif
22093
22094     return (void*)reti;
22095 }
22096
22097 #endif    /* USE_ITHREADS */
22098
22099 #ifndef PERL_IN_XSUB_RE
22100
22101 /*
22102  - regnext - dig the "next" pointer out of a node
22103  */
22104 regnode *
22105 Perl_regnext(pTHX_ regnode *p)
22106 {
22107     I32 offset;
22108
22109     if (!p)
22110         return(NULL);
22111
22112     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
22113         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22114                                                 (int)OP(p), (int)REGNODE_MAX);
22115     }
22116
22117     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22118     if (offset == 0)
22119         return(NULL);
22120
22121     return(p+offset);
22122 }
22123
22124 #endif
22125
22126 STATIC void
22127 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22128 {
22129     va_list args;
22130     STRLEN len = strlen(pat);
22131     char buf[512];
22132     SV *msv;
22133     const char *message;
22134
22135     PERL_ARGS_ASSERT_RE_CROAK;
22136
22137     if (len > 510)
22138         len = 510;
22139     Copy(pat, buf, len , char);
22140     buf[len] = '\n';
22141     buf[len + 1] = '\0';
22142     va_start(args, pat);
22143     msv = vmess(buf, &args);
22144     va_end(args);
22145     message = SvPV_const(msv, len);
22146     if (len > 512)
22147         len = 512;
22148     Copy(message, buf, len , char);
22149     /* len-1 to avoid \n */
22150     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22151 }
22152
22153 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
22154
22155 #ifndef PERL_IN_XSUB_RE
22156 void
22157 Perl_save_re_context(pTHX)
22158 {
22159     I32 nparens = -1;
22160     I32 i;
22161
22162     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22163
22164     if (PL_curpm) {
22165         const REGEXP * const rx = PM_GETRE(PL_curpm);
22166         if (rx)
22167             nparens = RX_NPARENS(rx);
22168     }
22169
22170     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22171      * that PL_curpm will be null, but that utf8.pm and the modules it
22172      * loads will only use $1..$3.
22173      * The t/porting/re_context.t test file checks this assumption.
22174      */
22175     if (nparens == -1)
22176         nparens = 3;
22177
22178     for (i = 1; i <= nparens; i++) {
22179         char digits[TYPE_CHARS(long)];
22180         const STRLEN len = my_snprintf(digits, sizeof(digits),
22181                                        "%lu", (long)i);
22182         GV *const *const gvp
22183             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22184
22185         if (gvp) {
22186             GV * const gv = *gvp;
22187             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22188                 save_scalar(gv);
22189         }
22190     }
22191 }
22192 #endif
22193
22194 #ifdef DEBUGGING
22195
22196 STATIC void
22197 S_put_code_point(pTHX_ SV *sv, UV c)
22198 {
22199     PERL_ARGS_ASSERT_PUT_CODE_POINT;
22200
22201     if (c > 255) {
22202         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22203     }
22204     else if (isPRINT(c)) {
22205         const char string = (char) c;
22206
22207         /* We use {phrase} as metanotation in the class, so also escape literal
22208          * braces */
22209         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22210             sv_catpvs(sv, "\\");
22211         sv_catpvn(sv, &string, 1);
22212     }
22213     else if (isMNEMONIC_CNTRL(c)) {
22214         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22215     }
22216     else {
22217         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22218     }
22219 }
22220
22221 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22222
22223 STATIC void
22224 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22225 {
22226     /* Appends to 'sv' a displayable version of the range of code points from
22227      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
22228      * that have them, when they occur at the beginning or end of the range.
22229      * It uses hex to output the remaining code points, unless 'allow_literals'
22230      * is true, in which case the printable ASCII ones are output as-is (though
22231      * some of these will be escaped by put_code_point()).
22232      *
22233      * NOTE:  This is designed only for printing ranges of code points that fit
22234      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
22235      */
22236
22237     const unsigned int min_range_count = 3;
22238
22239     assert(start <= end);
22240
22241     PERL_ARGS_ASSERT_PUT_RANGE;
22242
22243     while (start <= end) {
22244         UV this_end;
22245         const char * format;
22246
22247         if (    end - start < min_range_count
22248             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22249         {
22250             /* Output a range of 1 or 2 chars individually, or longer ranges
22251              * when printable */
22252             for (; start <= end; start++) {
22253                 put_code_point(sv, start);
22254             }
22255             break;
22256         }
22257
22258         /* If permitted by the input options, and there is a possibility that
22259          * this range contains a printable literal, look to see if there is
22260          * one. */
22261         if (allow_literals && start <= MAX_PRINT_A) {
22262
22263             /* If the character at the beginning of the range isn't an ASCII
22264              * printable, effectively split the range into two parts:
22265              *  1) the portion before the first such printable,
22266              *  2) the rest
22267              * and output them separately. */
22268             if (! isPRINT_A(start)) {
22269                 UV temp_end = start + 1;
22270
22271                 /* There is no point looking beyond the final possible
22272                  * printable, in MAX_PRINT_A */
22273                 UV max = MIN(end, MAX_PRINT_A);
22274
22275                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22276                     temp_end++;
22277                 }
22278
22279                 /* Here, temp_end points to one beyond the first printable if
22280                  * found, or to one beyond 'max' if not.  If none found, make
22281                  * sure that we use the entire range */
22282                 if (temp_end > MAX_PRINT_A) {
22283                     temp_end = end + 1;
22284                 }
22285
22286                 /* Output the first part of the split range: the part that
22287                  * doesn't have printables, with the parameter set to not look
22288                  * for literals (otherwise we would infinitely recurse) */
22289                 put_range(sv, start, temp_end - 1, FALSE);
22290
22291                 /* The 2nd part of the range (if any) starts here. */
22292                 start = temp_end;
22293
22294                 /* We do a continue, instead of dropping down, because even if
22295                  * the 2nd part is non-empty, it could be so short that we want
22296                  * to output it as individual characters, as tested for at the
22297                  * top of this loop.  */
22298                 continue;
22299             }
22300
22301             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
22302              * output a sub-range of just the digits or letters, then process
22303              * the remaining portion as usual. */
22304             if (isALPHANUMERIC_A(start)) {
22305                 UV mask = (isDIGIT_A(start))
22306                            ? _CC_DIGIT
22307                              : isUPPER_A(start)
22308                                ? _CC_UPPER
22309                                : _CC_LOWER;
22310                 UV temp_end = start + 1;
22311
22312                 /* Find the end of the sub-range that includes just the
22313                  * characters in the same class as the first character in it */
22314                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22315                     temp_end++;
22316                 }
22317                 temp_end--;
22318
22319                 /* For short ranges, don't duplicate the code above to output
22320                  * them; just call recursively */
22321                 if (temp_end - start < min_range_count) {
22322                     put_range(sv, start, temp_end, FALSE);
22323                 }
22324                 else {  /* Output as a range */
22325                     put_code_point(sv, start);
22326                     sv_catpvs(sv, "-");
22327                     put_code_point(sv, temp_end);
22328                 }
22329                 start = temp_end + 1;
22330                 continue;
22331             }
22332
22333             /* We output any other printables as individual characters */
22334             if (isPUNCT_A(start) || isSPACE_A(start)) {
22335                 while (start <= end && (isPUNCT_A(start)
22336                                         || isSPACE_A(start)))
22337                 {
22338                     put_code_point(sv, start);
22339                     start++;
22340                 }
22341                 continue;
22342             }
22343         } /* End of looking for literals */
22344
22345         /* Here is not to output as a literal.  Some control characters have
22346          * mnemonic names.  Split off any of those at the beginning and end of
22347          * the range to print mnemonically.  It isn't possible for many of
22348          * these to be in a row, so this won't overwhelm with output */
22349         if (   start <= end
22350             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22351         {
22352             while (isMNEMONIC_CNTRL(start) && start <= end) {
22353                 put_code_point(sv, start);
22354                 start++;
22355             }
22356
22357             /* If this didn't take care of the whole range ... */
22358             if (start <= end) {
22359
22360                 /* Look backwards from the end to find the final non-mnemonic
22361                  * */
22362                 UV temp_end = end;
22363                 while (isMNEMONIC_CNTRL(temp_end)) {
22364                     temp_end--;
22365                 }
22366
22367                 /* And separately output the interior range that doesn't start
22368                  * or end with mnemonics */
22369                 put_range(sv, start, temp_end, FALSE);
22370
22371                 /* Then output the mnemonic trailing controls */
22372                 start = temp_end + 1;
22373                 while (start <= end) {
22374                     put_code_point(sv, start);
22375                     start++;
22376                 }
22377                 break;
22378             }
22379         }
22380
22381         /* As a final resort, output the range or subrange as hex. */
22382
22383         if (start >= NUM_ANYOF_CODE_POINTS) {
22384             this_end = end;
22385         }
22386         else {  /* Have to split range at the bitmap boundary */
22387             this_end = (end < NUM_ANYOF_CODE_POINTS)
22388                         ? end
22389                         : NUM_ANYOF_CODE_POINTS - 1;
22390         }
22391 #if NUM_ANYOF_CODE_POINTS > 256
22392         format = (this_end < 256)
22393                  ? "\\x%02" UVXf "-\\x%02" UVXf
22394                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22395 #else
22396         format = "\\x%02" UVXf "-\\x%02" UVXf;
22397 #endif
22398         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22399         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22400         GCC_DIAG_RESTORE_STMT;
22401         break;
22402     }
22403 }
22404
22405 STATIC void
22406 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22407 {
22408     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22409      * 'invlist' */
22410
22411     UV start, end;
22412     bool allow_literals = TRUE;
22413
22414     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22415
22416     /* Generally, it is more readable if printable characters are output as
22417      * literals, but if a range (nearly) spans all of them, it's best to output
22418      * it as a single range.  This code will use a single range if all but 2
22419      * ASCII printables are in it */
22420     invlist_iterinit(invlist);
22421     while (invlist_iternext(invlist, &start, &end)) {
22422
22423         /* If the range starts beyond the final printable, it doesn't have any
22424          * in it */
22425         if (start > MAX_PRINT_A) {
22426             break;
22427         }
22428
22429         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22430          * all but two, the range must start and end no later than 2 from
22431          * either end */
22432         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22433             if (end > MAX_PRINT_A) {
22434                 end = MAX_PRINT_A;
22435             }
22436             if (start < ' ') {
22437                 start = ' ';
22438             }
22439             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22440                 allow_literals = FALSE;
22441             }
22442             break;
22443         }
22444     }
22445     invlist_iterfinish(invlist);
22446
22447     /* Here we have figured things out.  Output each range */
22448     invlist_iterinit(invlist);
22449     while (invlist_iternext(invlist, &start, &end)) {
22450         if (start >= NUM_ANYOF_CODE_POINTS) {
22451             break;
22452         }
22453         put_range(sv, start, end, allow_literals);
22454     }
22455     invlist_iterfinish(invlist);
22456
22457     return;
22458 }
22459
22460 STATIC SV*
22461 S_put_charclass_bitmap_innards_common(pTHX_
22462         SV* invlist,            /* The bitmap */
22463         SV* posixes,            /* Under /l, things like [:word:], \S */
22464         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22465         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22466         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22467         const bool invert       /* Is the result to be inverted? */
22468 )
22469 {
22470     /* Create and return an SV containing a displayable version of the bitmap
22471      * and associated information determined by the input parameters.  If the
22472      * output would have been only the inversion indicator '^', NULL is instead
22473      * returned. */
22474
22475     SV * output;
22476
22477     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22478
22479     if (invert) {
22480         output = newSVpvs("^");
22481     }
22482     else {
22483         output = newSVpvs("");
22484     }
22485
22486     /* First, the code points in the bitmap that are unconditionally there */
22487     put_charclass_bitmap_innards_invlist(output, invlist);
22488
22489     /* Traditionally, these have been placed after the main code points */
22490     if (posixes) {
22491         sv_catsv(output, posixes);
22492     }
22493
22494     if (only_utf8 && _invlist_len(only_utf8)) {
22495         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22496         put_charclass_bitmap_innards_invlist(output, only_utf8);
22497     }
22498
22499     if (not_utf8 && _invlist_len(not_utf8)) {
22500         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22501         put_charclass_bitmap_innards_invlist(output, not_utf8);
22502     }
22503
22504     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22505         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22506         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22507
22508         /* This is the only list in this routine that can legally contain code
22509          * points outside the bitmap range.  The call just above to
22510          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22511          * output them here.  There's about a half-dozen possible, and none in
22512          * contiguous ranges longer than 2 */
22513         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22514             UV start, end;
22515             SV* above_bitmap = NULL;
22516
22517             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22518
22519             invlist_iterinit(above_bitmap);
22520             while (invlist_iternext(above_bitmap, &start, &end)) {
22521                 UV i;
22522
22523                 for (i = start; i <= end; i++) {
22524                     put_code_point(output, i);
22525                 }
22526             }
22527             invlist_iterfinish(above_bitmap);
22528             SvREFCNT_dec_NN(above_bitmap);
22529         }
22530     }
22531
22532     if (invert && SvCUR(output) == 1) {
22533         return NULL;
22534     }
22535
22536     return output;
22537 }
22538
22539 STATIC bool
22540 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22541                                      char *bitmap,
22542                                      SV *nonbitmap_invlist,
22543                                      SV *only_utf8_locale_invlist,
22544                                      const regnode * const node,
22545                                      const U8 flags,
22546                                      const bool force_as_is_display)
22547 {
22548     /* Appends to 'sv' a displayable version of the innards of the bracketed
22549      * character class defined by the other arguments:
22550      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22551      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22552      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22553      *      none.  The reasons for this could be that they require some
22554      *      condition such as the target string being or not being in UTF-8
22555      *      (under /d), or because they came from a user-defined property that
22556      *      was not resolved at the time of the regex compilation (under /u)
22557      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22558      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22559      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22560      *      above two parameters are not null, and is passed so that this
22561      *      routine can tease apart the various reasons for them.
22562      *  'flags' is the flags field of 'node'
22563      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22564      *      to invert things to see if that leads to a cleaner display.  If
22565      *      FALSE, this routine is free to use its judgment about doing this.
22566      *
22567      * It returns TRUE if there was actually something output.  (It may be that
22568      * the bitmap, etc is empty.)
22569      *
22570      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22571      * bitmap, with the succeeding parameters set to NULL, and the final one to
22572      * FALSE.
22573      */
22574
22575     /* In general, it tries to display the 'cleanest' representation of the
22576      * innards, choosing whether to display them inverted or not, regardless of
22577      * whether the class itself is to be inverted.  However,  there are some
22578      * cases where it can't try inverting, as what actually matches isn't known
22579      * until runtime, and hence the inversion isn't either. */
22580
22581     bool inverting_allowed = ! force_as_is_display;
22582
22583     int i;
22584     STRLEN orig_sv_cur = SvCUR(sv);
22585
22586     SV* invlist;            /* Inversion list we accumulate of code points that
22587                                are unconditionally matched */
22588     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22589                                UTF-8 */
22590     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22591                              */
22592     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22593     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22594                                        is UTF-8 */
22595
22596     SV* as_is_display;      /* The output string when we take the inputs
22597                                literally */
22598     SV* inverted_display;   /* The output string when we invert the inputs */
22599
22600     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22601                                                    to match? */
22602     /* We are biased in favor of displaying things without them being inverted,
22603      * as that is generally easier to understand */
22604     const int bias = 5;
22605
22606     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22607
22608     /* Start off with whatever code points are passed in.  (We clone, so we
22609      * don't change the caller's list) */
22610     if (nonbitmap_invlist) {
22611         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22612         invlist = invlist_clone(nonbitmap_invlist, NULL);
22613     }
22614     else {  /* Worst case size is every other code point is matched */
22615         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22616     }
22617
22618     if (flags) {
22619         if (OP(node) == ANYOFD) {
22620
22621             /* This flag indicates that the code points below 0x100 in the
22622              * nonbitmap list are precisely the ones that match only when the
22623              * target is UTF-8 (they should all be non-ASCII). */
22624             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22625             {
22626                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22627                 _invlist_subtract(invlist, only_utf8, &invlist);
22628             }
22629
22630             /* And this flag for matching all non-ASCII 0xFF and below */
22631             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22632             {
22633                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22634             }
22635         }
22636         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22637
22638             /* If either of these flags are set, what matches isn't
22639              * determinable except during execution, so don't know enough here
22640              * to invert */
22641             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22642                 inverting_allowed = FALSE;
22643             }
22644
22645             /* What the posix classes match also varies at runtime, so these
22646              * will be output symbolically. */
22647             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22648                 int i;
22649
22650                 posixes = newSVpvs("");
22651                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22652                     if (ANYOF_POSIXL_TEST(node, i)) {
22653                         sv_catpv(posixes, anyofs[i]);
22654                     }
22655                 }
22656             }
22657         }
22658     }
22659
22660     /* Accumulate the bit map into the unconditional match list */
22661     if (bitmap) {
22662         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22663             if (BITMAP_TEST(bitmap, i)) {
22664                 int start = i++;
22665                 for (;
22666                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22667                      i++)
22668                 { /* empty */ }
22669                 invlist = _add_range_to_invlist(invlist, start, i-1);
22670             }
22671         }
22672     }
22673
22674     /* Make sure that the conditional match lists don't have anything in them
22675      * that match unconditionally; otherwise the output is quite confusing.
22676      * This could happen if the code that populates these misses some
22677      * duplication. */
22678     if (only_utf8) {
22679         _invlist_subtract(only_utf8, invlist, &only_utf8);
22680     }
22681     if (not_utf8) {
22682         _invlist_subtract(not_utf8, invlist, &not_utf8);
22683     }
22684
22685     if (only_utf8_locale_invlist) {
22686
22687         /* Since this list is passed in, we have to make a copy before
22688          * modifying it */
22689         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22690
22691         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22692
22693         /* And, it can get really weird for us to try outputting an inverted
22694          * form of this list when it has things above the bitmap, so don't even
22695          * try */
22696         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22697             inverting_allowed = FALSE;
22698         }
22699     }
22700
22701     /* Calculate what the output would be if we take the input as-is */
22702     as_is_display = put_charclass_bitmap_innards_common(invlist,
22703                                                     posixes,
22704                                                     only_utf8,
22705                                                     not_utf8,
22706                                                     only_utf8_locale,
22707                                                     invert);
22708
22709     /* If have to take the output as-is, just do that */
22710     if (! inverting_allowed) {
22711         if (as_is_display) {
22712             sv_catsv(sv, as_is_display);
22713             SvREFCNT_dec_NN(as_is_display);
22714         }
22715     }
22716     else { /* But otherwise, create the output again on the inverted input, and
22717               use whichever version is shorter */
22718
22719         int inverted_bias, as_is_bias;
22720
22721         /* We will apply our bias to whichever of the results doesn't have
22722          * the '^' */
22723         if (invert) {
22724             invert = FALSE;
22725             as_is_bias = bias;
22726             inverted_bias = 0;
22727         }
22728         else {
22729             invert = TRUE;
22730             as_is_bias = 0;
22731             inverted_bias = bias;
22732         }
22733
22734         /* Now invert each of the lists that contribute to the output,
22735          * excluding from the result things outside the possible range */
22736
22737         /* For the unconditional inversion list, we have to add in all the
22738          * conditional code points, so that when inverted, they will be gone
22739          * from it */
22740         _invlist_union(only_utf8, invlist, &invlist);
22741         _invlist_union(not_utf8, invlist, &invlist);
22742         _invlist_union(only_utf8_locale, invlist, &invlist);
22743         _invlist_invert(invlist);
22744         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22745
22746         if (only_utf8) {
22747             _invlist_invert(only_utf8);
22748             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22749         }
22750         else if (not_utf8) {
22751
22752             /* If a code point matches iff the target string is not in UTF-8,
22753              * then complementing the result has it not match iff not in UTF-8,
22754              * which is the same thing as matching iff it is UTF-8. */
22755             only_utf8 = not_utf8;
22756             not_utf8 = NULL;
22757         }
22758
22759         if (only_utf8_locale) {
22760             _invlist_invert(only_utf8_locale);
22761             _invlist_intersection(only_utf8_locale,
22762                                   PL_InBitmap,
22763                                   &only_utf8_locale);
22764         }
22765
22766         inverted_display = put_charclass_bitmap_innards_common(
22767                                             invlist,
22768                                             posixes,
22769                                             only_utf8,
22770                                             not_utf8,
22771                                             only_utf8_locale, invert);
22772
22773         /* Use the shortest representation, taking into account our bias
22774          * against showing it inverted */
22775         if (   inverted_display
22776             && (   ! as_is_display
22777                 || (  SvCUR(inverted_display) + inverted_bias
22778                     < SvCUR(as_is_display)    + as_is_bias)))
22779         {
22780             sv_catsv(sv, inverted_display);
22781         }
22782         else if (as_is_display) {
22783             sv_catsv(sv, as_is_display);
22784         }
22785
22786         SvREFCNT_dec(as_is_display);
22787         SvREFCNT_dec(inverted_display);
22788     }
22789
22790     SvREFCNT_dec_NN(invlist);
22791     SvREFCNT_dec(only_utf8);
22792     SvREFCNT_dec(not_utf8);
22793     SvREFCNT_dec(posixes);
22794     SvREFCNT_dec(only_utf8_locale);
22795
22796     return SvCUR(sv) > orig_sv_cur;
22797 }
22798
22799 #define CLEAR_OPTSTART                                                       \
22800     if (optstart) STMT_START {                                               \
22801         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22802                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22803         optstart=NULL;                                                       \
22804     } STMT_END
22805
22806 #define DUMPUNTIL(b,e)                                                       \
22807                     CLEAR_OPTSTART;                                          \
22808                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22809
22810 STATIC const regnode *
22811 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22812             const regnode *last, const regnode *plast,
22813             SV* sv, I32 indent, U32 depth)
22814 {
22815     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22816     const regnode *next;
22817     const regnode *optstart= NULL;
22818
22819     RXi_GET_DECL(r, ri);
22820     DECLARE_AND_GET_RE_DEBUG_FLAGS;
22821
22822     PERL_ARGS_ASSERT_DUMPUNTIL;
22823
22824 #ifdef DEBUG_DUMPUNTIL
22825     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22826         last ? last-start : 0, plast ? plast-start : 0);
22827 #endif
22828
22829     if (plast && plast < last)
22830         last= plast;
22831
22832     while (PL_regkind[op] != END && (!last || node < last)) {
22833         assert(node);
22834         /* While that wasn't END last time... */
22835         NODE_ALIGN(node);
22836         op = OP(node);
22837         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22838             indent--;
22839         next = regnext((regnode *)node);
22840
22841         /* Where, what. */
22842         if (OP(node) == OPTIMIZED) {
22843             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22844                 optstart = node;
22845             else
22846                 goto after_print;
22847         } else
22848             CLEAR_OPTSTART;
22849
22850         regprop(r, sv, node, NULL, NULL);
22851         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22852                       (int)(2*indent + 1), "", SvPVX_const(sv));
22853
22854         if (OP(node) != OPTIMIZED) {
22855             if (next == NULL)           /* Next ptr. */
22856                 Perl_re_printf( aTHX_  " (0)");
22857             else if (PL_regkind[(U8)op] == BRANCH
22858                      && PL_regkind[OP(next)] != BRANCH )
22859                 Perl_re_printf( aTHX_  " (FAIL)");
22860             else
22861                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22862             Perl_re_printf( aTHX_ "\n");
22863         }
22864
22865       after_print:
22866         if (PL_regkind[(U8)op] == BRANCHJ) {
22867             assert(next);
22868             {
22869                 const regnode *nnode = (OP(next) == LONGJMP
22870                                        ? regnext((regnode *)next)
22871                                        : next);
22872                 if (last && nnode > last)
22873                     nnode = last;
22874                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22875             }
22876         }
22877         else if (PL_regkind[(U8)op] == BRANCH) {
22878             assert(next);
22879             DUMPUNTIL(NEXTOPER(node), next);
22880         }
22881         else if ( PL_regkind[(U8)op]  == TRIE ) {
22882             const regnode *this_trie = node;
22883             const char op = OP(node);
22884             const U32 n = ARG(node);
22885             const reg_ac_data * const ac = op>=AHOCORASICK ?
22886                (reg_ac_data *)ri->data->data[n] :
22887                NULL;
22888             const reg_trie_data * const trie =
22889                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22890 #ifdef DEBUGGING
22891             AV *const trie_words
22892                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22893 #endif
22894             const regnode *nextbranch= NULL;
22895             I32 word_idx;
22896             SvPVCLEAR(sv);
22897             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22898                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22899
22900                 Perl_re_indentf( aTHX_  "%s ",
22901                     indent+3,
22902                     elem_ptr
22903                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22904                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22905                                 PL_colors[0], PL_colors[1],
22906                                 (SvUTF8(*elem_ptr)
22907                                  ? PERL_PV_ESCAPE_UNI
22908                                  : 0)
22909                                 | PERL_PV_PRETTY_ELLIPSES
22910                                 | PERL_PV_PRETTY_LTGT
22911                             )
22912                     : "???"
22913                 );
22914                 if (trie->jump) {
22915                     U16 dist= trie->jump[word_idx+1];
22916                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22917                                (UV)((dist ? this_trie + dist : next) - start));
22918                     if (dist) {
22919                         if (!nextbranch)
22920                             nextbranch= this_trie + trie->jump[0];
22921                         DUMPUNTIL(this_trie + dist, nextbranch);
22922                     }
22923                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22924                         nextbranch= regnext((regnode *)nextbranch);
22925                 } else {
22926                     Perl_re_printf( aTHX_  "\n");
22927                 }
22928             }
22929             if (last && next > last)
22930                 node= last;
22931             else
22932                 node= next;
22933         }
22934         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22935             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22936                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22937         }
22938         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22939             assert(next);
22940             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22941         }
22942         else if ( op == PLUS || op == STAR) {
22943             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22944         }
22945         else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22946             /* Literal string, where present. */
22947             node += NODE_SZ_STR(node) - 1;
22948             node = NEXTOPER(node);
22949         }
22950         else {
22951             node = NEXTOPER(node);
22952             node += regarglen[(U8)op];
22953         }
22954         if (op == CURLYX || op == OPEN || op == SROPEN)
22955             indent++;
22956     }
22957     CLEAR_OPTSTART;
22958 #ifdef DEBUG_DUMPUNTIL
22959     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22960 #endif
22961     return node;
22962 }
22963
22964 #endif  /* DEBUGGING */
22965
22966 #ifndef PERL_IN_XSUB_RE
22967
22968 #  include "uni_keywords.h"
22969
22970 void
22971 Perl_init_uniprops(pTHX)
22972 {
22973
22974 #  ifdef DEBUGGING
22975     char * dump_len_string;
22976
22977     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22978     if (   ! dump_len_string
22979         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22980     {
22981         PL_dump_re_max_len = 60;    /* A reasonable default */
22982     }
22983 #  endif
22984
22985     PL_user_def_props = newHV();
22986
22987 #  ifdef USE_ITHREADS
22988
22989     HvSHAREKEYS_off(PL_user_def_props);
22990     PL_user_def_props_aTHX = aTHX;
22991
22992 #  endif
22993
22994     /* Set up the inversion list interpreter-level variables */
22995
22996     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22997     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22998     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22999     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23000     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23001     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23002     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23003     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23004     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23005     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23006     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23007     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23008     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23009     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23010     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23011     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23012
23013     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23014     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23015     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23016     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23017     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23018     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23019     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23020     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23021     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23022     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23023     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23024     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23025     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23026     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23027     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23028     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23029
23030     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23031     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23032     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23033     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23034     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23035
23036     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23037     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23038     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23039     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23040
23041     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23042
23043     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23044     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23045
23046     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23047     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23048
23049     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23050     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23051                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23052     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23053                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23054     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23055     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23056     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23057     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23058     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23059     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23060     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23061     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23062     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23063
23064 #  ifdef UNI_XIDC
23065     /* The below are used only by deprecated functions.  They could be removed */
23066     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23067     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23068     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23069 #  endif
23070 }
23071
23072 /* These four functions are compiled only in regcomp.c, where they have access
23073  * to the data they return.  They are a way for re_comp.c to get access to that
23074  * data without having to compile the whole data structures. */
23075
23076 I16
23077 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23078 {
23079     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23080
23081     return match_uniprop((U8 *) key, key_len);
23082 }
23083
23084 SV *
23085 Perl_get_prop_definition(pTHX_ const int table_index)
23086 {
23087     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23088
23089     /* Create and return the inversion list */
23090     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23091 }
23092
23093 const char * const *
23094 Perl_get_prop_values(const int table_index)
23095 {
23096     PERL_ARGS_ASSERT_GET_PROP_VALUES;
23097
23098     return UNI_prop_value_ptrs[table_index];
23099 }
23100
23101 const char *
23102 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23103 {
23104     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23105
23106     return deprecated_property_msgs[warning_offset];
23107 }
23108
23109 #  if 0
23110
23111 This code was mainly added for backcompat to give a warning for non-portable
23112 code points in user-defined properties.  But experiments showed that the
23113 warning in earlier perls were only omitted on overflow, which should be an
23114 error, so there really isnt a backcompat issue, and actually adding the
23115 warning when none was present before might cause breakage, for little gain.  So
23116 khw left this code in, but not enabled.  Tests were never added.
23117
23118 embed.fnc entry:
23119 Ei      |const char *|get_extended_utf8_msg|const UV cp
23120
23121 PERL_STATIC_INLINE const char *
23122 S_get_extended_utf8_msg(pTHX_ const UV cp)
23123 {
23124     U8 dummy[UTF8_MAXBYTES + 1];
23125     HV *msgs;
23126     SV **msg;
23127
23128     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23129                              &msgs);
23130
23131     msg = hv_fetchs(msgs, "text", 0);
23132     assert(msg);
23133
23134     (void) sv_2mortal((SV *) msgs);
23135
23136     return SvPVX(*msg);
23137 }
23138
23139 #  endif
23140 #endif /* end of ! PERL_IN_XSUB_RE */
23141
23142 STATIC REGEXP *
23143 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23144                          const bool ignore_case)
23145 {
23146     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23147      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
23148      * because nothing outside of ASCII will match.  Use /m because the input
23149      * string may be a bunch of lines strung together.
23150      *
23151      * Also sets up the debugging info */
23152
23153     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23154     U32 rx_flags;
23155     SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23156     REGEXP * subpattern_re;
23157     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23158
23159     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23160
23161     if (ignore_case) {
23162         flags |= PMf_FOLD;
23163     }
23164     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23165
23166     /* Like in op.c, we copy the compile time pm flags to the rx ones */
23167     rx_flags = flags & RXf_PMf_COMPILETIME;
23168
23169 #ifndef PERL_IN_XSUB_RE
23170     /* Use the core engine if this file is regcomp.c.  That means no
23171      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23172     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23173                                              &PL_core_reg_engine,
23174                                              NULL, NULL,
23175                                              rx_flags, flags);
23176 #else
23177     if (isDEBUG_WILDCARD) {
23178         /* Use the special debugging engine if this file is re_comp.c and wants
23179          * to output the wildcard matching.  This uses whatever
23180          * 'use re "Debug ..." is in effect */
23181         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23182                                                  &my_reg_engine,
23183                                                  NULL, NULL,
23184                                                  rx_flags, flags);
23185     }
23186     else {
23187         /* Use the special wildcard engine if this file is re_comp.c and
23188          * doesn't want to output the wildcard matching.  This uses whatever
23189          * 'use re "Debug ..." is in effect for compilation, but this engine
23190          * structure has been set up so that it uses the core engine for
23191          * execution, so no execution debugging as a result of re.pm will be
23192          * displayed. */
23193         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23194                                                  &wild_reg_engine,
23195                                                  NULL, NULL,
23196                                                  rx_flags, flags);
23197         /* XXX The above has the effect that any user-supplied regex engine
23198          * won't be called for matching wildcards.  That might be good, or bad.
23199          * It could be changed in several ways.  The reason it is done the
23200          * current way is to avoid having to save and restore
23201          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
23202          * could be used.  Another suggestion is to keep the authoritative
23203          * value of the debug flags in a thread-local variable and add set/get
23204          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23205          * Still another is to pass a flag, say in the engine's intflags that
23206          * would be checked each time before doing the debug output */
23207     }
23208 #endif
23209
23210     assert(subpattern_re);  /* Should have died if didn't compile successfully */
23211     return subpattern_re;
23212 }
23213
23214 STATIC I32
23215 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23216          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23217 {
23218     I32 result;
23219     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23220
23221     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23222
23223     ENTER;
23224
23225     /* The compilation has set things up so that if the program doesn't want to
23226      * see the wildcard matching procedure, it will get the core execution
23227      * engine, which is subject only to -Dr.  So we have to turn that off
23228      * around this procedure */
23229     if (! isDEBUG_WILDCARD) {
23230         /* Note! Casts away 'volatile' */
23231         SAVEI32(PL_debug);
23232         PL_debug &= ~ DEBUG_r_FLAG;
23233     }
23234
23235     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23236                          NULL, nosave);
23237     LEAVE;
23238
23239     return result;
23240 }
23241
23242 SV *
23243 S_handle_user_defined_property(pTHX_
23244
23245     /* Parses the contents of a user-defined property definition; returning the
23246      * expanded definition if possible.  If so, the return is an inversion
23247      * list.
23248      *
23249      * If there are subroutines that are part of the expansion and which aren't
23250      * known at the time of the call to this function, this returns what
23251      * parse_uniprop_string() returned for the first one encountered.
23252      *
23253      * If an error was found, NULL is returned, and 'msg' gets a suitable
23254      * message appended to it.  (Appending allows the back trace of how we got
23255      * to the faulty definition to be displayed through nested calls of
23256      * user-defined subs.)
23257      *
23258      * The caller IS responsible for freeing any returned SV.
23259      *
23260      * The syntax of the contents is pretty much described in perlunicode.pod,
23261      * but we also allow comments on each line */
23262
23263     const char * name,          /* Name of property */
23264     const STRLEN name_len,      /* The name's length in bytes */
23265     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23266     const bool to_fold,         /* ? Is this under /i */
23267     const bool runtime,         /* ? Are we in compile- or run-time */
23268     const bool deferrable,      /* Is it ok for this property's full definition
23269                                    to be deferred until later? */
23270     SV* contents,               /* The property's definition */
23271     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
23272                                    getting called unless this is thought to be
23273                                    a user-defined property */
23274     SV * msg,                   /* Any error or warning msg(s) are appended to
23275                                    this */
23276     const STRLEN level)         /* Recursion level of this call */
23277 {
23278     STRLEN len;
23279     const char * string         = SvPV_const(contents, len);
23280     const char * const e        = string + len;
23281     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23282     const STRLEN msgs_length_on_entry = SvCUR(msg);
23283
23284     const char * s0 = string;   /* Points to first byte in the current line
23285                                    being parsed in 'string' */
23286     const char overflow_msg[] = "Code point too large in \"";
23287     SV* running_definition = NULL;
23288
23289     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23290
23291     *user_defined_ptr = TRUE;
23292
23293     /* Look at each line */
23294     while (s0 < e) {
23295         const char * s;     /* Current byte */
23296         char op = '+';      /* Default operation is 'union' */
23297         IV   min = 0;       /* range begin code point */
23298         IV   max = -1;      /* and range end */
23299         SV* this_definition;
23300
23301         /* Skip comment lines */
23302         if (*s0 == '#') {
23303             s0 = strchr(s0, '\n');
23304             if (s0 == NULL) {
23305                 break;
23306             }
23307             s0++;
23308             continue;
23309         }
23310
23311         /* For backcompat, allow an empty first line */
23312         if (*s0 == '\n') {
23313             s0++;
23314             continue;
23315         }
23316
23317         /* First character in the line may optionally be the operation */
23318         if (   *s0 == '+'
23319             || *s0 == '!'
23320             || *s0 == '-'
23321             || *s0 == '&')
23322         {
23323             op = *s0++;
23324         }
23325
23326         /* If the line is one or two hex digits separated by blank space, its
23327          * a range; otherwise it is either another user-defined property or an
23328          * error */
23329
23330         s = s0;
23331
23332         if (! isXDIGIT(*s)) {
23333             goto check_if_property;
23334         }
23335
23336         do { /* Each new hex digit will add 4 bits. */
23337             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23338                 s = strchr(s, '\n');
23339                 if (s == NULL) {
23340                     s = e;
23341                 }
23342                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23343                 sv_catpv(msg, overflow_msg);
23344                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23345                                      UTF8fARG(is_contents_utf8, s - s0, s0));
23346                 sv_catpvs(msg, "\"");
23347                 goto return_failure;
23348             }
23349
23350             /* Accumulate this digit into the value */
23351             min = (min << 4) + READ_XDIGIT(s);
23352         } while (isXDIGIT(*s));
23353
23354         while (isBLANK(*s)) { s++; }
23355
23356         /* We allow comments at the end of the line */
23357         if (*s == '#') {
23358             s = strchr(s, '\n');
23359             if (s == NULL) {
23360                 s = e;
23361             }
23362             s++;
23363         }
23364         else if (s < e && *s != '\n') {
23365             if (! isXDIGIT(*s)) {
23366                 goto check_if_property;
23367             }
23368
23369             /* Look for the high point of the range */
23370             max = 0;
23371             do {
23372                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23373                     s = strchr(s, '\n');
23374                     if (s == NULL) {
23375                         s = e;
23376                     }
23377                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23378                     sv_catpv(msg, overflow_msg);
23379                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23380                                       UTF8fARG(is_contents_utf8, s - s0, s0));
23381                     sv_catpvs(msg, "\"");
23382                     goto return_failure;
23383                 }
23384
23385                 max = (max << 4) + READ_XDIGIT(s);
23386             } while (isXDIGIT(*s));
23387
23388             while (isBLANK(*s)) { s++; }
23389
23390             if (*s == '#') {
23391                 s = strchr(s, '\n');
23392                 if (s == NULL) {
23393                     s = e;
23394                 }
23395             }
23396             else if (s < e && *s != '\n') {
23397                 goto check_if_property;
23398             }
23399         }
23400
23401         if (max == -1) {    /* The line only had one entry */
23402             max = min;
23403         }
23404         else if (max < min) {
23405             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23406             sv_catpvs(msg, "Illegal range in \"");
23407             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23408                                 UTF8fARG(is_contents_utf8, s - s0, s0));
23409             sv_catpvs(msg, "\"");
23410             goto return_failure;
23411         }
23412
23413 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
23414
23415         if (   UNICODE_IS_PERL_EXTENDED(min)
23416             || UNICODE_IS_PERL_EXTENDED(max))
23417         {
23418             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23419
23420             /* If both code points are non-portable, warn only on the lower
23421              * one. */
23422             sv_catpv(msg, get_extended_utf8_msg(
23423                                             (UNICODE_IS_PERL_EXTENDED(min))
23424                                             ? min : max));
23425             sv_catpvs(msg, " in \"");
23426             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23427                                  UTF8fARG(is_contents_utf8, s - s0, s0));
23428             sv_catpvs(msg, "\"");
23429         }
23430
23431 #  endif
23432
23433         /* Here, this line contains a legal range */
23434         this_definition = sv_2mortal(_new_invlist(2));
23435         this_definition = _add_range_to_invlist(this_definition, min, max);
23436         goto calculate;
23437
23438       check_if_property:
23439
23440         /* Here it isn't a legal range line.  See if it is a legal property
23441          * line.  First find the end of the meat of the line */
23442         s = strpbrk(s, "#\n");
23443         if (s == NULL) {
23444             s = e;
23445         }
23446
23447         /* Ignore trailing blanks in keeping with the requirements of
23448          * parse_uniprop_string() */
23449         s--;
23450         while (s > s0 && isBLANK_A(*s)) {
23451             s--;
23452         }
23453         s++;
23454
23455         this_definition = parse_uniprop_string(s0, s - s0,
23456                                                is_utf8, to_fold, runtime,
23457                                                deferrable,
23458                                                NULL,
23459                                                user_defined_ptr, msg,
23460                                                (name_len == 0)
23461                                                 ? level /* Don't increase level
23462                                                            if input is empty */
23463                                                 : level + 1
23464                                               );
23465         if (this_definition == NULL) {
23466             goto return_failure;    /* 'msg' should have had the reason
23467                                        appended to it by the above call */
23468         }
23469
23470         if (! is_invlist(this_definition)) {    /* Unknown at this time */
23471             return newSVsv(this_definition);
23472         }
23473
23474         if (*s != '\n') {
23475             s = strchr(s, '\n');
23476             if (s == NULL) {
23477                 s = e;
23478             }
23479         }
23480
23481       calculate:
23482
23483         switch (op) {
23484             case '+':
23485                 _invlist_union(running_definition, this_definition,
23486                                                         &running_definition);
23487                 break;
23488             case '-':
23489                 _invlist_subtract(running_definition, this_definition,
23490                                                         &running_definition);
23491                 break;
23492             case '&':
23493                 _invlist_intersection(running_definition, this_definition,
23494                                                         &running_definition);
23495                 break;
23496             case '!':
23497                 _invlist_union_complement_2nd(running_definition,
23498                                         this_definition, &running_definition);
23499                 break;
23500             default:
23501                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23502                                  __FILE__, __LINE__, op);
23503                 break;
23504         }
23505
23506         /* Position past the '\n' */
23507         s0 = s + 1;
23508     }   /* End of loop through the lines of 'contents' */
23509
23510     /* Here, we processed all the lines in 'contents' without error.  If we
23511      * didn't add any warnings, simply return success */
23512     if (msgs_length_on_entry == SvCUR(msg)) {
23513
23514         /* If the expansion was empty, the answer isn't nothing: its an empty
23515          * inversion list */
23516         if (running_definition == NULL) {
23517             running_definition = _new_invlist(1);
23518         }
23519
23520         return running_definition;
23521     }
23522
23523     /* Otherwise, add some explanatory text, but we will return success */
23524     goto return_msg;
23525
23526   return_failure:
23527     running_definition = NULL;
23528
23529   return_msg:
23530
23531     if (name_len > 0) {
23532         sv_catpvs(msg, " in expansion of ");
23533         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23534     }
23535
23536     return running_definition;
23537 }
23538
23539 /* As explained below, certain operations need to take place in the first
23540  * thread created.  These macros switch contexts */
23541 #  ifdef USE_ITHREADS
23542 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
23543                                         PerlInterpreter * save_aTHX = aTHX;
23544 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
23545                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23546 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23547 #    define CUR_CONTEXT      aTHX
23548 #    define ORIGINAL_CONTEXT save_aTHX
23549 #  else
23550 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
23551 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23552 #    define RESTORE_CONTEXT                   NOOP
23553 #    define CUR_CONTEXT                       NULL
23554 #    define ORIGINAL_CONTEXT                  NULL
23555 #  endif
23556
23557 STATIC void
23558 S_delete_recursion_entry(pTHX_ void *key)
23559 {
23560     /* Deletes the entry used to detect recursion when expanding user-defined
23561      * properties.  This is a function so it can be set up to be called even if
23562      * the program unexpectedly quits */
23563
23564     SV ** current_entry;
23565     const STRLEN key_len = strlen((const char *) key);
23566     DECLARATION_FOR_GLOBAL_CONTEXT;
23567
23568     SWITCH_TO_GLOBAL_CONTEXT;
23569
23570     /* If the entry is one of these types, it is a permanent entry, and not the
23571      * one used to detect recursions.  This function should delete only the
23572      * recursion entry */
23573     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23574     if (     current_entry
23575         && ! is_invlist(*current_entry)
23576         && ! SvPOK(*current_entry))
23577     {
23578         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23579                                                                     G_DISCARD);
23580     }
23581
23582     RESTORE_CONTEXT;
23583 }
23584
23585 STATIC SV *
23586 S_get_fq_name(pTHX_
23587               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23588               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23589               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23590               const bool has_colon_colon
23591              )
23592 {
23593     /* Returns a mortal SV containing the fully qualified version of the input
23594      * name */
23595
23596     SV * fq_name;
23597
23598     fq_name = newSVpvs_flags("", SVs_TEMP);
23599
23600     /* Use the current package if it wasn't included in our input */
23601     if (! has_colon_colon) {
23602         const HV * pkg = (IN_PERL_COMPILETIME)
23603                          ? PL_curstash
23604                          : CopSTASH(PL_curcop);
23605         const char* pkgname = HvNAME(pkg);
23606
23607         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23608                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23609         sv_catpvs(fq_name, "::");
23610     }
23611
23612     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23613                          UTF8fARG(is_utf8, name_len, name));
23614     return fq_name;
23615 }
23616
23617 STATIC SV *
23618 S_parse_uniprop_string(pTHX_
23619
23620     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23621      * now.  If so, the return is an inversion list.
23622      *
23623      * If the property is user-defined, it is a subroutine, which in turn
23624      * may call other subroutines.  This function will call the whole nest of
23625      * them to get the definition they return; if some aren't known at the time
23626      * of the call to this function, the fully qualified name of the highest
23627      * level sub is returned.  It is an error to call this function at runtime
23628      * without every sub defined.
23629      *
23630      * If an error was found, NULL is returned, and 'msg' gets a suitable
23631      * message appended to it.  (Appending allows the back trace of how we got
23632      * to the faulty definition to be displayed through nested calls of
23633      * user-defined subs.)
23634      *
23635      * The caller should NOT try to free any returned inversion list.
23636      *
23637      * Other parameters will be set on return as described below */
23638
23639     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23640     Size_t name_len,            /* Its length in bytes, not including any
23641                                    trailing space */
23642     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23643     const bool to_fold,         /* ? Is this under /i */
23644     const bool runtime,         /* TRUE if this is being called at run time */
23645     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23646                                    known at this call */
23647     AV ** strings,              /* To return string property values, like named
23648                                    sequences */
23649     bool *user_defined_ptr,     /* Upon return from this function it will be
23650                                    set to TRUE if any component is a
23651                                    user-defined property */
23652     SV * msg,                   /* Any error or warning msg(s) are appended to
23653                                    this */
23654     const STRLEN level)         /* Recursion level of this call */
23655 {
23656     char* lookup_name;          /* normalized name for lookup in our tables */
23657     unsigned lookup_len;        /* Its length */
23658     enum { Not_Strict = 0,      /* Some properties have stricter name */
23659            Strict,              /* normalization rules, which we decide */
23660            As_Is                /* upon based on parsing */
23661          } stricter = Not_Strict;
23662
23663     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23664      * (though it requires extra effort to download them from Unicode and
23665      * compile perl to know about them) */
23666     bool is_nv_type = FALSE;
23667
23668     unsigned int i, j = 0;
23669     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23670     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23671     int table_index = 0;    /* The entry number for this property in the table
23672                                of all Unicode property names */
23673     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23674     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23675                                    the normalized name in certain situations */
23676     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23677                                    part of a package name */
23678     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
23679     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23680                                              property rather than a Unicode
23681                                              one. */
23682     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23683                                      if an error.  If it is an inversion list,
23684                                      it is the definition.  Otherwise it is a
23685                                      string containing the fully qualified sub
23686                                      name of 'name' */
23687     SV * fq_name = NULL;        /* For user-defined properties, the fully
23688                                    qualified name */
23689     bool invert_return = FALSE; /* ? Do we need to complement the result before
23690                                      returning it */
23691     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23692                                        explicit utf8:: package that we strip
23693                                        off  */
23694     /* The expansion of properties that could be either user-defined or
23695      * official unicode ones is deferred until runtime, including a marker for
23696      * those that might be in the latter category.  This boolean indicates if
23697      * we've seen that marker.  If not, what we're parsing can't be such an
23698      * official Unicode property whose expansion was deferred */
23699     bool could_be_deferred_official = FALSE;
23700
23701     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23702
23703     /* The input will be normalized into 'lookup_name' */
23704     Newx(lookup_name, name_len, char);
23705     SAVEFREEPV(lookup_name);
23706
23707     /* Parse the input. */
23708     for (i = 0; i < name_len; i++) {
23709         char cur = name[i];
23710
23711         /* Most of the characters in the input will be of this ilk, being parts
23712          * of a name */
23713         if (isIDCONT_A(cur)) {
23714
23715             /* Case differences are ignored.  Our lookup routine assumes
23716              * everything is lowercase, so normalize to that */
23717             if (isUPPER_A(cur)) {
23718                 lookup_name[j++] = toLOWER_A(cur);
23719                 continue;
23720             }
23721
23722             if (cur == '_') { /* Don't include these in the normalized name */
23723                 continue;
23724             }
23725
23726             lookup_name[j++] = cur;
23727
23728             /* The first character in a user-defined name must be of this type.
23729              * */
23730             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23731                 could_be_user_defined = FALSE;
23732             }
23733
23734             continue;
23735         }
23736
23737         /* Here, the character is not something typically in a name,  But these
23738          * two types of characters (and the '_' above) can be freely ignored in
23739          * most situations.  Later it may turn out we shouldn't have ignored
23740          * them, and we have to reparse, but we don't have enough information
23741          * yet to make that decision */
23742         if (cur == '-' || isSPACE_A(cur)) {
23743             could_be_user_defined = FALSE;
23744             continue;
23745         }
23746
23747         /* An equals sign or single colon mark the end of the first part of
23748          * the property name */
23749         if (    cur == '='
23750             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23751         {
23752             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23753             equals_pos = j; /* Note where it occurred in the input */
23754             could_be_user_defined = FALSE;
23755             break;
23756         }
23757
23758         /* If this looks like it is a marker we inserted at compile time,
23759          * set a flag and otherwise ignore it.  If it isn't in the final
23760          * position, keep it as it would have been user input. */
23761         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23762             && ! deferrable
23763             &&   could_be_user_defined
23764             &&   i == name_len - 1)
23765         {
23766             name_len--;
23767             could_be_deferred_official = TRUE;
23768             continue;
23769         }
23770
23771         /* Otherwise, this character is part of the name. */
23772         lookup_name[j++] = cur;
23773
23774         /* Here it isn't a single colon, so if it is a colon, it must be a
23775          * double colon */
23776         if (cur == ':') {
23777
23778             /* A double colon should be a package qualifier.  We note its
23779              * position and continue.  Note that one could have
23780              *      pkg1::pkg2::...::foo
23781              * so that the position at the end of the loop will be just after
23782              * the final qualifier */
23783
23784             i++;
23785             non_pkg_begin = i + 1;
23786             lookup_name[j++] = ':';
23787             lun_non_pkg_begin = j;
23788         }
23789         else { /* Only word chars (and '::') can be in a user-defined name */
23790             could_be_user_defined = FALSE;
23791         }
23792     } /* End of parsing through the lhs of the property name (or all of it if
23793          no rhs) */
23794
23795 #  define STRLENs(s)  (sizeof("" s "") - 1)
23796
23797     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23798      * be for a user-defined property, or it could be a Unicode property, as
23799      * all of them are considered to be for that package.  For the purposes of
23800      * parsing the rest of the property, strip it off */
23801     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23802         lookup_name +=  STRLENs("utf8::");
23803         j -=  STRLENs("utf8::");
23804         equals_pos -=  STRLENs("utf8::");
23805         stripped_utf8_pkg = TRUE;
23806     }
23807
23808     /* Here, we are either done with the whole property name, if it was simple;
23809      * or are positioned just after the '=' if it is compound. */
23810
23811     if (equals_pos >= 0) {
23812         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23813
23814         /* Space immediately after the '=' is ignored */
23815         i++;
23816         for (; i < name_len; i++) {
23817             if (! isSPACE_A(name[i])) {
23818                 break;
23819             }
23820         }
23821
23822         /* Most punctuation after the equals indicates a subpattern, like
23823          * \p{foo=/bar/} */
23824         if (   isPUNCT_A(name[i])
23825             &&  name[i] != '-'
23826             &&  name[i] != '+'
23827             &&  name[i] != '_'
23828             &&  name[i] != '{'
23829                 /* A backslash means the real delimitter is the next character,
23830                  * but it must be punctuation */
23831             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23832         {
23833             bool special_property = memEQs(lookup_name, j - 1, "name")
23834                                  || memEQs(lookup_name, j - 1, "na");
23835             if (! special_property) {
23836                 /* Find the property.  The table includes the equals sign, so
23837                  * we use 'j' as-is */
23838                 table_index = do_uniprop_match(lookup_name, j);
23839             }
23840             if (special_property || table_index) {
23841                 REGEXP * subpattern_re;
23842                 char open = name[i++];
23843                 char close;
23844                 const char * pos_in_brackets;
23845                 const char * const * prop_values;
23846                 bool escaped = 0;
23847
23848                 /* Backslash => delimitter is the character following.  We
23849                  * already checked that it is punctuation */
23850                 if (open == '\\') {
23851                     open = name[i++];
23852                     escaped = 1;
23853                 }
23854
23855                 /* This data structure is constructed so that the matching
23856                  * closing bracket is 3 past its matching opening.  The second
23857                  * set of closing is so that if the opening is something like
23858                  * ']', the closing will be that as well.  Something similar is
23859                  * done in toke.c */
23860                 pos_in_brackets = memCHRs("([<)]>)]>", open);
23861                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23862
23863                 if (    i >= name_len
23864                     ||  name[name_len-1] != close
23865                     || (escaped && name[name_len-2] != '\\')
23866                         /* Also make sure that there are enough characters.
23867                          * e.g., '\\\' would show up incorrectly as legal even
23868                          * though it is too short */
23869                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
23870                 {
23871                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23872                     goto append_name_to_msg;
23873                 }
23874
23875                 Perl_ck_warner_d(aTHX_
23876                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23877                     "The Unicode property wildcards feature is experimental");
23878
23879                 if (special_property) {
23880                     const char * error_msg;
23881                     const char * revised_name = name + i;
23882                     Size_t revised_name_len = name_len - (i + 1 + escaped);
23883
23884                     /* Currently, the only 'special_property' is name, which we
23885                      * lookup in _charnames.pm */
23886
23887                     if (! load_charnames(newSVpvs("placeholder"),
23888                                          revised_name, revised_name_len,
23889                                          &error_msg))
23890                     {
23891                         sv_catpv(msg, error_msg);
23892                         goto append_name_to_msg;
23893                     }
23894
23895                     /* Farm this out to a function just to make the current
23896                      * function less unwieldy */
23897                     if (handle_names_wildcard(revised_name, revised_name_len,
23898                                               &prop_definition,
23899                                               strings))
23900                     {
23901                         return prop_definition;
23902                     }
23903
23904                     goto failed;
23905                 }
23906
23907                 prop_values = get_prop_values(table_index);
23908
23909                 /* Now create and compile the wildcard subpattern.  Use /i
23910                  * because the property values are supposed to match with case
23911                  * ignored. */
23912                 subpattern_re = compile_wildcard(name + i,
23913                                                  name_len - i - 1 - escaped,
23914                                                  TRUE /* /i */
23915                                                 );
23916
23917                 /* For each legal property value, see if the supplied pattern
23918                  * matches it. */
23919                 while (*prop_values) {
23920                     const char * const entry = *prop_values;
23921                     const Size_t len = strlen(entry);
23922                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23923
23924                     if (execute_wildcard(subpattern_re,
23925                                  (char *) entry,
23926                                  (char *) entry + len,
23927                                  (char *) entry, 0,
23928                                  entry_sv,
23929                                  0))
23930                     { /* Here, matched.  Add to the returned list */
23931                         Size_t total_len = j + len;
23932                         SV * sub_invlist = NULL;
23933                         char * this_string;
23934
23935                         /* We know this is a legal \p{property=value}.  Call
23936                          * the function to return the list of code points that
23937                          * match it */
23938                         Newxz(this_string, total_len + 1, char);
23939                         Copy(lookup_name, this_string, j, char);
23940                         my_strlcat(this_string, entry, total_len + 1);
23941                         SAVEFREEPV(this_string);
23942                         sub_invlist = parse_uniprop_string(this_string,
23943                                                            total_len,
23944                                                            is_utf8,
23945                                                            to_fold,
23946                                                            runtime,
23947                                                            deferrable,
23948                                                            NULL,
23949                                                            user_defined_ptr,
23950                                                            msg,
23951                                                            level + 1);
23952                         _invlist_union(prop_definition, sub_invlist,
23953                                        &prop_definition);
23954                     }
23955
23956                     prop_values++;  /* Next iteration, look at next propvalue */
23957                 } /* End of looking through property values; (the data
23958                      structure is terminated by a NULL ptr) */
23959
23960                 SvREFCNT_dec_NN(subpattern_re);
23961
23962                 if (prop_definition) {
23963                     return prop_definition;
23964                 }
23965
23966                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23967                 goto append_name_to_msg;
23968             }
23969
23970             /* Here's how khw thinks we should proceed to handle the properties
23971              * not yet done:    Bidi Mirroring Glyph        can map to ""
23972                                 Bidi Paired Bracket         can map to ""
23973                                 Case Folding  (both full and simple)
23974                                             Shouldn't /i be good enough for Full
23975                                 Decomposition Mapping
23976                                 Equivalent Unified Ideograph    can map to ""
23977                                 Lowercase Mapping  (both full and simple)
23978                                 NFKC Case Fold                  can map to ""
23979                                 Titlecase Mapping  (both full and simple)
23980                                 Uppercase Mapping  (both full and simple)
23981              * Handle these the same way Name is done, using say, _wild.pm, but
23982              * having both loose and full, like in charclass_invlists.h.
23983              * Perhaps move block and script to that as they are somewhat large
23984              * in charclass_invlists.h.
23985              * For properties where the default is the code point itself, such
23986              * as any of the case changing mappings, the string would otherwise
23987              * consist of all Unicode code points in UTF-8 strung together.
23988              * This would be impractical.  So instead, examine their compiled
23989              * pattern, looking at the ssc.  If none, reject the pattern as an
23990              * error.  Otherwise run the pattern against every code point in
23991              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
23992              * And it might be good to create an API to return the ssc.
23993              * Or handle them like the algorithmic names are done
23994              */
23995         } /* End of is a wildcard subppattern */
23996
23997         /* \p{name=...} is handled specially.  Instead of using the normal
23998          * mechanism involving charclass_invlists.h, it uses _charnames.pm
23999          * which has the necessary (huge) data accessible to it, and which
24000          * doesn't get loaded unless necessary.  The legal syntax for names is
24001          * somewhat different than other properties due both to the vagaries of
24002          * a few outlier official names, and the fact that only a few ASCII
24003          * characters are permitted in them */
24004         if (   memEQs(lookup_name, j - 1, "name")
24005             || memEQs(lookup_name, j - 1, "na"))
24006         {
24007             dSP;
24008             HV * table;
24009             SV * character;
24010             const char * error_msg;
24011             CV* lookup_loose;
24012             SV * character_name;
24013             STRLEN character_len;
24014             UV cp;
24015
24016             stricter = As_Is;
24017
24018             /* Since the RHS (after skipping initial space) is passed unchanged
24019              * to charnames, and there are different criteria for what are
24020              * legal characters in the name, just parse it here.  A character
24021              * name must begin with an ASCII alphabetic */
24022             if (! isALPHA(name[i])) {
24023                 goto failed;
24024             }
24025             lookup_name[j++] = name[i];
24026
24027             for (++i; i < name_len; i++) {
24028                 /* Official names can only be in the ASCII range, and only
24029                  * certain characters */
24030                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24031                     goto failed;
24032                 }
24033                 lookup_name[j++] = name[i];
24034             }
24035
24036             /* Finished parsing, save the name into an SV */
24037             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24038
24039             /* Make sure _charnames is loaded.  (The parameters give context
24040              * for any errors generated */
24041             table = load_charnames(character_name, name, name_len, &error_msg);
24042             if (table == NULL) {
24043                 sv_catpv(msg, error_msg);
24044                 goto append_name_to_msg;
24045             }
24046
24047             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24048             if (! lookup_loose) {
24049                 Perl_croak(aTHX_
24050                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
24051             }
24052
24053             PUSHSTACKi(PERLSI_REGCOMP);
24054             ENTER ;
24055             SAVETMPS;
24056             save_re_context();
24057
24058             PUSHMARK(SP) ;
24059             XPUSHs(character_name);
24060             PUTBACK;
24061             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24062
24063             SPAGAIN ;
24064
24065             character = POPs;
24066             SvREFCNT_inc_simple_void_NN(character);
24067
24068             PUTBACK ;
24069             FREETMPS ;
24070             LEAVE ;
24071             POPSTACK;
24072
24073             if (! SvOK(character)) {
24074                 goto failed;
24075             }
24076
24077             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24078             if (character_len == SvCUR(character)) {
24079                 prop_definition = add_cp_to_invlist(NULL, cp);
24080             }
24081             else {
24082                 AV * this_string;
24083
24084                 /* First of the remaining characters in the string. */
24085                 char * remaining = SvPVX(character) + character_len;
24086
24087                 if (strings == NULL) {
24088                     goto failed;    /* XXX Perhaps a specific msg instead, like
24089                                        'not available here' */
24090                 }
24091
24092                 if (*strings == NULL) {
24093                     *strings = newAV();
24094                 }
24095
24096                 this_string = newAV();
24097                 av_push(this_string, newSVuv(cp));
24098
24099                 do {
24100                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24101                     av_push(this_string, newSVuv(cp));
24102                     remaining += character_len;
24103                 } while (remaining < SvEND(character));
24104
24105                 av_push(*strings, (SV *) this_string);
24106             }
24107
24108             return prop_definition;
24109         }
24110
24111         /* Certain properties whose values are numeric need special handling.
24112          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
24113          * purposes of checking if this is one of those properties */
24114         if (memBEGINPs(lookup_name, j, "is")) {
24115             lookup_offset = 2;
24116         }
24117
24118         /* Then check if it is one of these specially-handled properties.  The
24119          * possibilities are hard-coded because easier this way, and the list
24120          * is unlikely to change.
24121          *
24122          * All numeric value type properties are of this ilk, and are also
24123          * special in a different way later on.  So find those first.  There
24124          * are several numeric value type properties in the Unihan DB (which is
24125          * unlikely to be compiled with perl, but we handle it here in case it
24126          * does get compiled).  They all end with 'numeric'.  The interiors
24127          * aren't checked for the precise property.  This would stop working if
24128          * a cjk property were to be created that ended with 'numeric' and
24129          * wasn't a numeric type */
24130         is_nv_type = memEQs(lookup_name + lookup_offset,
24131                        j - 1 - lookup_offset, "numericvalue")
24132                   || memEQs(lookup_name + lookup_offset,
24133                       j - 1 - lookup_offset, "nv")
24134                   || (   memENDPs(lookup_name + lookup_offset,
24135                             j - 1 - lookup_offset, "numeric")
24136                       && (   memBEGINPs(lookup_name + lookup_offset,
24137                                       j - 1 - lookup_offset, "cjk")
24138                           || memBEGINPs(lookup_name + lookup_offset,
24139                                       j - 1 - lookup_offset, "k")));
24140         if (   is_nv_type
24141             || memEQs(lookup_name + lookup_offset,
24142                       j - 1 - lookup_offset, "canonicalcombiningclass")
24143             || memEQs(lookup_name + lookup_offset,
24144                       j - 1 - lookup_offset, "ccc")
24145             || memEQs(lookup_name + lookup_offset,
24146                       j - 1 - lookup_offset, "age")
24147             || memEQs(lookup_name + lookup_offset,
24148                       j - 1 - lookup_offset, "in")
24149             || memEQs(lookup_name + lookup_offset,
24150                       j - 1 - lookup_offset, "presentin"))
24151         {
24152             unsigned int k;
24153
24154             /* Since the stuff after the '=' is a number, we can't throw away
24155              * '-' willy-nilly, as those could be a minus sign.  Other stricter
24156              * rules also apply.  However, these properties all can have the
24157              * rhs not be a number, in which case they contain at least one
24158              * alphabetic.  In those cases, the stricter rules don't apply.
24159              * But the numeric type properties can have the alphas [Ee] to
24160              * signify an exponent, and it is still a number with stricter
24161              * rules.  So look for an alpha that signifies not-strict */
24162             stricter = Strict;
24163             for (k = i; k < name_len; k++) {
24164                 if (   isALPHA_A(name[k])
24165                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24166                 {
24167                     stricter = Not_Strict;
24168                     break;
24169                 }
24170             }
24171         }
24172
24173         if (stricter) {
24174
24175             /* A number may have a leading '+' or '-'.  The latter is retained
24176              * */
24177             if (name[i] == '+') {
24178                 i++;
24179             }
24180             else if (name[i] == '-') {
24181                 lookup_name[j++] = '-';
24182                 i++;
24183             }
24184
24185             /* Skip leading zeros including single underscores separating the
24186              * zeros, or between the final leading zero and the first other
24187              * digit */
24188             for (; i < name_len - 1; i++) {
24189                 if (    name[i] != '0'
24190                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24191                 {
24192                     break;
24193                 }
24194             }
24195         }
24196     }
24197     else {  /* No '=' */
24198
24199        /* Only a few properties without an '=' should be parsed with stricter
24200         * rules.  The list is unlikely to change. */
24201         if (   memBEGINPs(lookup_name, j, "perl")
24202             && memNEs(lookup_name + 4, j - 4, "space")
24203             && memNEs(lookup_name + 4, j - 4, "word"))
24204         {
24205             stricter = Strict;
24206
24207             /* We set the inputs back to 0 and the code below will reparse,
24208              * using strict */
24209             i = j = 0;
24210         }
24211     }
24212
24213     /* Here, we have either finished the property, or are positioned to parse
24214      * the remainder, and we know if stricter rules apply.  Finish out, if not
24215      * already done */
24216     for (; i < name_len; i++) {
24217         char cur = name[i];
24218
24219         /* In all instances, case differences are ignored, and we normalize to
24220          * lowercase */
24221         if (isUPPER_A(cur)) {
24222             lookup_name[j++] = toLOWER(cur);
24223             continue;
24224         }
24225
24226         /* An underscore is skipped, but not under strict rules unless it
24227          * separates two digits */
24228         if (cur == '_') {
24229             if (    stricter
24230                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
24231                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24232             {
24233                 lookup_name[j++] = '_';
24234             }
24235             continue;
24236         }
24237
24238         /* Hyphens are skipped except under strict */
24239         if (cur == '-' && ! stricter) {
24240             continue;
24241         }
24242
24243         /* XXX Bug in documentation.  It says white space skipped adjacent to
24244          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
24245          * in a number */
24246         if (isSPACE_A(cur) && ! stricter) {
24247             continue;
24248         }
24249
24250         lookup_name[j++] = cur;
24251
24252         /* Unless this is a non-trailing slash, we are done with it */
24253         if (i >= name_len - 1 || cur != '/') {
24254             continue;
24255         }
24256
24257         slash_pos = j;
24258
24259         /* A slash in the 'numeric value' property indicates that what follows
24260          * is a denominator.  It can have a leading '+' and '0's that should be
24261          * skipped.  But we have never allowed a negative denominator, so treat
24262          * a minus like every other character.  (No need to rule out a second
24263          * '/', as that won't match anything anyway */
24264         if (is_nv_type) {
24265             i++;
24266             if (i < name_len && name[i] == '+') {
24267                 i++;
24268             }
24269
24270             /* Skip leading zeros including underscores separating digits */
24271             for (; i < name_len - 1; i++) {
24272                 if (   name[i] != '0'
24273                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24274                 {
24275                     break;
24276                 }
24277             }
24278
24279             /* Store the first real character in the denominator */
24280             if (i < name_len) {
24281                 lookup_name[j++] = name[i];
24282             }
24283         }
24284     }
24285
24286     /* Here are completely done parsing the input 'name', and 'lookup_name'
24287      * contains a copy, normalized.
24288      *
24289      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24290      * different from without the underscores.  */
24291     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
24292            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24293         && UNLIKELY(name[name_len-1] == '_'))
24294     {
24295         lookup_name[j++] = '&';
24296     }
24297
24298     /* If the original input began with 'In' or 'Is', it could be a subroutine
24299      * call to a user-defined property instead of a Unicode property name. */
24300     if (    name_len - non_pkg_begin > 2
24301         &&  name[non_pkg_begin+0] == 'I'
24302         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24303     {
24304         /* Names that start with In have different characterstics than those
24305          * that start with Is */
24306         if (name[non_pkg_begin+1] == 's') {
24307             starts_with_Is = TRUE;
24308         }
24309     }
24310     else {
24311         could_be_user_defined = FALSE;
24312     }
24313
24314     if (could_be_user_defined) {
24315         CV* user_sub;
24316
24317         /* If the user defined property returns the empty string, it could
24318          * easily be because the pattern is being compiled before the data it
24319          * actually needs to compile is available.  This could be argued to be
24320          * a bug in the perl code, but this is a change of behavior for Perl,
24321          * so we handle it.  This means that intentionally returning nothing
24322          * will not be resolved until runtime */
24323         bool empty_return = FALSE;
24324
24325         /* Here, the name could be for a user defined property, which are
24326          * implemented as subs. */
24327         user_sub = get_cvn_flags(name, name_len, 0);
24328         if (! user_sub) {
24329
24330             /* Here, the property name could be a user-defined one, but there
24331              * is no subroutine to handle it (as of now).   Defer handling it
24332              * until runtime.  Otherwise, a block defined by Unicode in a later
24333              * release would get the synonym InFoo added for it, and existing
24334              * code that used that name would suddenly break if it referred to
24335              * the property before the sub was declared.  See [perl #134146] */
24336             if (deferrable) {
24337                 goto definition_deferred;
24338             }
24339
24340             /* Here, we are at runtime, and didn't find the user property.  It
24341              * could be an official property, but only if no package was
24342              * specified, or just the utf8:: package. */
24343             if (could_be_deferred_official) {
24344                 lookup_name += lun_non_pkg_begin;
24345                 j -= lun_non_pkg_begin;
24346             }
24347             else if (! stripped_utf8_pkg) {
24348                 goto unknown_user_defined;
24349             }
24350
24351             /* Drop down to look up in the official properties */
24352         }
24353         else {
24354             const char insecure[] = "Insecure user-defined property";
24355
24356             /* Here, there is a sub by the correct name.  Normally we call it
24357              * to get the property definition */
24358             dSP;
24359             SV * user_sub_sv = MUTABLE_SV(user_sub);
24360             SV * error;     /* Any error returned by calling 'user_sub' */
24361             SV * key;       /* The key into the hash of user defined sub names
24362                              */
24363             SV * placeholder;
24364             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
24365
24366             /* How many times to retry when another thread is in the middle of
24367              * expanding the same definition we want */
24368             PERL_INT_FAST8_T retry_countdown = 10;
24369
24370             DECLARATION_FOR_GLOBAL_CONTEXT;
24371
24372             /* If we get here, we know this property is user-defined */
24373             *user_defined_ptr = TRUE;
24374
24375             /* We refuse to call a potentially tainted subroutine; returning an
24376              * error instead */
24377             if (TAINT_get) {
24378                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24379                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24380                 goto append_name_to_msg;
24381             }
24382
24383             /* In principal, we only call each subroutine property definition
24384              * once during the life of the program.  This guarantees that the
24385              * property definition never changes.  The results of the single
24386              * sub call are stored in a hash, which is used instead for future
24387              * references to this property.  The property definition is thus
24388              * immutable.  But, to allow the user to have a /i-dependent
24389              * definition, we call the sub once for non-/i, and once for /i,
24390              * should the need arise, passing the /i status as a parameter.
24391              *
24392              * We start by constructing the hash key name, consisting of the
24393              * fully qualified subroutine name, preceded by the /i status, so
24394              * that there is a key for /i and a different key for non-/i */
24395             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24396             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24397                                           non_pkg_begin != 0);
24398             sv_catsv(key, fq_name);
24399             sv_2mortal(key);
24400
24401             /* We only call the sub once throughout the life of the program
24402              * (with the /i, non-/i exception noted above).  That means the
24403              * hash must be global and accessible to all threads.  It is
24404              * created at program start-up, before any threads are created, so
24405              * is accessible to all children.  But this creates some
24406              * complications.
24407              *
24408              * 1) The keys can't be shared, or else problems arise; sharing is
24409              *    turned off at hash creation time
24410              * 2) All SVs in it are there for the remainder of the life of the
24411              *    program, and must be created in the same interpreter context
24412              *    as the hash, or else they will be freed from the wrong pool
24413              *    at global destruction time.  This is handled by switching to
24414              *    the hash's context to create each SV going into it, and then
24415              *    immediately switching back
24416              * 3) All accesses to the hash must be controlled by a mutex, to
24417              *    prevent two threads from getting an unstable state should
24418              *    they simultaneously be accessing it.  The code below is
24419              *    crafted so that the mutex is locked whenever there is an
24420              *    access and unlocked only when the next stable state is
24421              *    achieved.
24422              *
24423              * The hash stores either the definition of the property if it was
24424              * valid, or, if invalid, the error message that was raised.  We
24425              * use the type of SV to distinguish.
24426              *
24427              * There's also the need to guard against the definition expansion
24428              * from infinitely recursing.  This is handled by storing the aTHX
24429              * of the expanding thread during the expansion.  Again the SV type
24430              * is used to distinguish this from the other two cases.  If we
24431              * come to here and the hash entry for this property is our aTHX,
24432              * it means we have recursed, and the code assumes that we would
24433              * infinitely recurse, so instead stops and raises an error.
24434              * (Any recursion has always been treated as infinite recursion in
24435              * this feature.)
24436              *
24437              * If instead, the entry is for a different aTHX, it means that
24438              * that thread has gotten here first, and hasn't finished expanding
24439              * the definition yet.  We just have to wait until it is done.  We
24440              * sleep and retry a few times, returning an error if the other
24441              * thread doesn't complete. */
24442
24443           re_fetch:
24444             USER_PROP_MUTEX_LOCK;
24445
24446             /* If we have an entry for this key, the subroutine has already
24447              * been called once with this /i status. */
24448             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24449                                                    SvPVX(key), SvCUR(key), 0);
24450             if (saved_user_prop_ptr) {
24451
24452                 /* If the saved result is an inversion list, it is the valid
24453                  * definition of this property */
24454                 if (is_invlist(*saved_user_prop_ptr)) {
24455                     prop_definition = *saved_user_prop_ptr;
24456
24457                     /* The SV in the hash won't be removed until global
24458                      * destruction, so it is stable and we can unlock */
24459                     USER_PROP_MUTEX_UNLOCK;
24460
24461                     /* The caller shouldn't try to free this SV */
24462                     return prop_definition;
24463                 }
24464
24465                 /* Otherwise, if it is a string, it is the error message
24466                  * that was returned when we first tried to evaluate this
24467                  * property.  Fail, and append the message */
24468                 if (SvPOK(*saved_user_prop_ptr)) {
24469                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24470                     sv_catsv(msg, *saved_user_prop_ptr);
24471
24472                     /* The SV in the hash won't be removed until global
24473                      * destruction, so it is stable and we can unlock */
24474                     USER_PROP_MUTEX_UNLOCK;
24475
24476                     return NULL;
24477                 }
24478
24479                 assert(SvIOK(*saved_user_prop_ptr));
24480
24481                 /* Here, we have an unstable entry in the hash.  Either another
24482                  * thread is in the middle of expanding the property's
24483                  * definition, or we are ourselves recursing.  We use the aTHX
24484                  * in it to distinguish */
24485                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24486
24487                     /* Here, it's another thread doing the expanding.  We've
24488                      * looked as much as we are going to at the contents of the
24489                      * hash entry.  It's safe to unlock. */
24490                     USER_PROP_MUTEX_UNLOCK;
24491
24492                     /* Retry a few times */
24493                     if (retry_countdown-- > 0) {
24494                         PerlProc_sleep(1);
24495                         goto re_fetch;
24496                     }
24497
24498                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24499                     sv_catpvs(msg, "Timeout waiting for another thread to "
24500                                    "define");
24501                     goto append_name_to_msg;
24502                 }
24503
24504                 /* Here, we are recursing; don't dig any deeper */
24505                 USER_PROP_MUTEX_UNLOCK;
24506
24507                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24508                 sv_catpvs(msg,
24509                           "Infinite recursion in user-defined property");
24510                 goto append_name_to_msg;
24511             }
24512
24513             /* Here, this thread has exclusive control, and there is no entry
24514              * for this property in the hash.  So we have the go ahead to
24515              * expand the definition ourselves. */
24516
24517             PUSHSTACKi(PERLSI_REGCOMP);
24518             ENTER;
24519
24520             /* Create a temporary placeholder in the hash to detect recursion
24521              * */
24522             SWITCH_TO_GLOBAL_CONTEXT;
24523             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24524             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24525             RESTORE_CONTEXT;
24526
24527             /* Now that we have a placeholder, we can let other threads
24528              * continue */
24529             USER_PROP_MUTEX_UNLOCK;
24530
24531             /* Make sure the placeholder always gets destroyed */
24532             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24533
24534             PUSHMARK(SP);
24535             SAVETMPS;
24536
24537             /* Call the user's function, with the /i status as a parameter.
24538              * Note that we have gone to a lot of trouble to keep this call
24539              * from being within the locked mutex region. */
24540             XPUSHs(boolSV(to_fold));
24541             PUTBACK;
24542
24543             /* The following block was taken from swash_init().  Presumably
24544              * they apply to here as well, though we no longer use a swash --
24545              * khw */
24546             SAVEHINTS();
24547             save_re_context();
24548             /* We might get here via a subroutine signature which uses a utf8
24549              * parameter name, at which point PL_subname will have been set
24550              * but not yet used. */
24551             save_item(PL_subname);
24552
24553             /* G_SCALAR guarantees a single return value */
24554             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24555
24556             SPAGAIN;
24557
24558             error = ERRSV;
24559             if (TAINT_get || SvTRUE(error)) {
24560                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24561                 if (SvTRUE(error)) {
24562                     sv_catpvs(msg, "Error \"");
24563                     sv_catsv(msg, error);
24564                     sv_catpvs(msg, "\"");
24565                 }
24566                 if (TAINT_get) {
24567                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
24568                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24569                 }
24570
24571                 if (name_len > 0) {
24572                     sv_catpvs(msg, " in expansion of ");
24573                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24574                                                                   name_len,
24575                                                                   name));
24576                 }
24577
24578                 (void) POPs;
24579                 prop_definition = NULL;
24580             }
24581             else {
24582                 SV * contents = POPs;
24583
24584                 /* The contents is supposed to be the expansion of the property
24585                  * definition.  If the definition is deferrable, and we got an
24586                  * empty string back, set a flag to later defer it (after clean
24587                  * up below). */
24588                 if (      deferrable
24589                     && (! SvPOK(contents) || SvCUR(contents) == 0))
24590                 {
24591                         empty_return = TRUE;
24592                 }
24593                 else { /* Otherwise, call a function to check for valid syntax,
24594                           and handle it */
24595
24596                     prop_definition = handle_user_defined_property(
24597                                                     name, name_len,
24598                                                     is_utf8, to_fold, runtime,
24599                                                     deferrable,
24600                                                     contents, user_defined_ptr,
24601                                                     msg,
24602                                                     level);
24603                 }
24604             }
24605
24606             /* Here, we have the results of the expansion.  Delete the
24607              * placeholder, and if the definition is now known, replace it with
24608              * that definition.  We need exclusive access to the hash, and we
24609              * can't let anyone else in, between when we delete the placeholder
24610              * and add the permanent entry */
24611             USER_PROP_MUTEX_LOCK;
24612
24613             S_delete_recursion_entry(aTHX_ SvPVX(key));
24614
24615             if (    ! empty_return
24616                 && (! prop_definition || is_invlist(prop_definition)))
24617             {
24618                 /* If we got success we use the inversion list defining the
24619                  * property; otherwise use the error message */
24620                 SWITCH_TO_GLOBAL_CONTEXT;
24621                 (void) hv_store_ent(PL_user_def_props,
24622                                     key,
24623                                     ((prop_definition)
24624                                      ? newSVsv(prop_definition)
24625                                      : newSVsv(msg)),
24626                                     0);
24627                 RESTORE_CONTEXT;
24628             }
24629
24630             /* All done, and the hash now has a permanent entry for this
24631              * property.  Give up exclusive control */
24632             USER_PROP_MUTEX_UNLOCK;
24633
24634             FREETMPS;
24635             LEAVE;
24636             POPSTACK;
24637
24638             if (empty_return) {
24639                 goto definition_deferred;
24640             }
24641
24642             if (prop_definition) {
24643
24644                 /* If the definition is for something not known at this time,
24645                  * we toss it, and go return the main property name, as that's
24646                  * the one the user will be aware of */
24647                 if (! is_invlist(prop_definition)) {
24648                     SvREFCNT_dec_NN(prop_definition);
24649                     goto definition_deferred;
24650                 }
24651
24652                 sv_2mortal(prop_definition);
24653             }
24654
24655             /* And return */
24656             return prop_definition;
24657
24658         }   /* End of calling the subroutine for the user-defined property */
24659     }       /* End of it could be a user-defined property */
24660
24661     /* Here it wasn't a user-defined property that is known at this time.  See
24662      * if it is a Unicode property */
24663
24664     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24665
24666     /* Get the index into our pointer table of the inversion list corresponding
24667      * to the property */
24668     table_index = do_uniprop_match(lookup_name, lookup_len);
24669
24670     /* If it didn't find the property ... */
24671     if (table_index == 0) {
24672
24673         /* Try again stripping off any initial 'Is'.  This is because we
24674          * promise that an initial Is is optional.  The same isn't true of
24675          * names that start with 'In'.  Those can match only blocks, and the
24676          * lookup table already has those accounted for.  The lookup table also
24677          * has already accounted for Perl extensions (without and = sign)
24678          * starting with 'i's'. */
24679         if (starts_with_Is && equals_pos >= 0) {
24680             lookup_name += 2;
24681             lookup_len -= 2;
24682             equals_pos -= 2;
24683             slash_pos -= 2;
24684
24685             table_index = do_uniprop_match(lookup_name, lookup_len);
24686         }
24687
24688         if (table_index == 0) {
24689             char * canonical;
24690
24691             /* Here, we didn't find it.  If not a numeric type property, and
24692              * can't be a user-defined one, it isn't a legal property */
24693             if (! is_nv_type) {
24694                 if (! could_be_user_defined) {
24695                     goto failed;
24696                 }
24697
24698                 /* Here, the property name is legal as a user-defined one.   At
24699                  * compile time, it might just be that the subroutine for that
24700                  * property hasn't been encountered yet, but at runtime, it's
24701                  * an error to try to use an undefined one */
24702                 if (! deferrable) {
24703                     goto unknown_user_defined;;
24704                 }
24705
24706                 goto definition_deferred;
24707             } /* End of isn't a numeric type property */
24708
24709             /* The numeric type properties need more work to decide.  What we
24710              * do is make sure we have the number in canonical form and look
24711              * that up. */
24712
24713             if (slash_pos < 0) {    /* No slash */
24714
24715                 /* When it isn't a rational, take the input, convert it to a
24716                  * NV, then create a canonical string representation of that
24717                  * NV. */
24718
24719                 NV value;
24720                 SSize_t value_len = lookup_len - equals_pos;
24721
24722                 /* Get the value */
24723                 if (   value_len <= 0
24724                     || my_atof3(lookup_name + equals_pos, &value,
24725                                 value_len)
24726                           != lookup_name + lookup_len)
24727                 {
24728                     goto failed;
24729                 }
24730
24731                 /* If the value is an integer, the canonical value is integral
24732                  * */
24733                 if (Perl_ceil(value) == value) {
24734                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24735                                             equals_pos, lookup_name, value);
24736                 }
24737                 else {  /* Otherwise, it is %e with a known precision */
24738                     char * exp_ptr;
24739
24740                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24741                                                 equals_pos, lookup_name,
24742                                                 PL_E_FORMAT_PRECISION, value);
24743
24744                     /* The exponent generated is expecting two digits, whereas
24745                      * %e on some systems will generate three.  Remove leading
24746                      * zeros in excess of 2 from the exponent.  We start
24747                      * looking for them after the '=' */
24748                     exp_ptr = strchr(canonical + equals_pos, 'e');
24749                     if (exp_ptr) {
24750                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24751                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24752
24753                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24754
24755                         if (excess_exponent_len > 0) {
24756                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24757                             SSize_t excess_leading_zeros
24758                                     = MIN(leading_zeros, excess_exponent_len);
24759                             if (excess_leading_zeros > 0) {
24760                                 Move(cur_ptr + excess_leading_zeros,
24761                                      cur_ptr,
24762                                      strlen(cur_ptr) - excess_leading_zeros
24763                                        + 1,  /* Copy the NUL as well */
24764                                      char);
24765                             }
24766                         }
24767                     }
24768                 }
24769             }
24770             else {  /* Has a slash.  Create a rational in canonical form  */
24771                 UV numerator, denominator, gcd, trial;
24772                 const char * end_ptr;
24773                 const char * sign = "";
24774
24775                 /* We can't just find the numerator, denominator, and do the
24776                  * division, then use the method above, because that is
24777                  * inexact.  And the input could be a rational that is within
24778                  * epsilon (given our precision) of a valid rational, and would
24779                  * then incorrectly compare valid.
24780                  *
24781                  * We're only interested in the part after the '=' */
24782                 const char * this_lookup_name = lookup_name + equals_pos;
24783                 lookup_len -= equals_pos;
24784                 slash_pos -= equals_pos;
24785
24786                 /* Handle any leading minus */
24787                 if (this_lookup_name[0] == '-') {
24788                     sign = "-";
24789                     this_lookup_name++;
24790                     lookup_len--;
24791                     slash_pos--;
24792                 }
24793
24794                 /* Convert the numerator to numeric */
24795                 end_ptr = this_lookup_name + slash_pos;
24796                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24797                     goto failed;
24798                 }
24799
24800                 /* It better have included all characters before the slash */
24801                 if (*end_ptr != '/') {
24802                     goto failed;
24803                 }
24804
24805                 /* Set to look at just the denominator */
24806                 this_lookup_name += slash_pos;
24807                 lookup_len -= slash_pos;
24808                 end_ptr = this_lookup_name + lookup_len;
24809
24810                 /* Convert the denominator to numeric */
24811                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24812                     goto failed;
24813                 }
24814
24815                 /* It better be the rest of the characters, and don't divide by
24816                  * 0 */
24817                 if (   end_ptr != this_lookup_name + lookup_len
24818                     || denominator == 0)
24819                 {
24820                     goto failed;
24821                 }
24822
24823                 /* Get the greatest common denominator using
24824                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24825                 gcd = numerator;
24826                 trial = denominator;
24827                 while (trial != 0) {
24828                     UV temp = trial;
24829                     trial = gcd % trial;
24830                     gcd = temp;
24831                 }
24832
24833                 /* If already in lowest possible terms, we have already tried
24834                  * looking this up */
24835                 if (gcd == 1) {
24836                     goto failed;
24837                 }
24838
24839                 /* Reduce the rational, which should put it in canonical form
24840                  * */
24841                 numerator /= gcd;
24842                 denominator /= gcd;
24843
24844                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24845                         equals_pos, lookup_name, sign, numerator, denominator);
24846             }
24847
24848             /* Here, we have the number in canonical form.  Try that */
24849             table_index = do_uniprop_match(canonical, strlen(canonical));
24850             if (table_index == 0) {
24851                 goto failed;
24852             }
24853         }   /* End of still didn't find the property in our table */
24854     }       /* End of       didn't find the property in our table */
24855
24856     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24857      * A negative return signifies that the real index is the absolute value,
24858      * but the result needs to be inverted */
24859     if (table_index < 0) {
24860         invert_return = TRUE;
24861         table_index = -table_index;
24862     }
24863
24864     /* Out-of band indices indicate a deprecated property.  The proper index is
24865      * modulo it with the table size.  And dividing by the table size yields
24866      * an offset into a table constructed by regen/mk_invlists.pl to contain
24867      * the corresponding warning message */
24868     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24869         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24870         table_index %= MAX_UNI_KEYWORD_INDEX;
24871         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24872                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24873                 (int) name_len, name,
24874                 get_deprecated_property_msg(warning_offset));
24875     }
24876
24877     /* In a few properties, a different property is used under /i.  These are
24878      * unlikely to change, so are hard-coded here. */
24879     if (to_fold) {
24880         if (   table_index == UNI_XPOSIXUPPER
24881             || table_index == UNI_XPOSIXLOWER
24882             || table_index == UNI_TITLE)
24883         {
24884             table_index = UNI_CASED;
24885         }
24886         else if (   table_index == UNI_UPPERCASELETTER
24887                  || table_index == UNI_LOWERCASELETTER
24888 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24889                  || table_index == UNI_TITLECASELETTER
24890 #  endif
24891         ) {
24892             table_index = UNI_CASEDLETTER;
24893         }
24894         else if (  table_index == UNI_POSIXUPPER
24895                 || table_index == UNI_POSIXLOWER)
24896         {
24897             table_index = UNI_POSIXALPHA;
24898         }
24899     }
24900
24901     /* Create and return the inversion list */
24902     prop_definition = get_prop_definition(table_index);
24903     sv_2mortal(prop_definition);
24904
24905     /* See if there is a private use override to add to this definition */
24906     {
24907         COPHH * hinthash = (IN_PERL_COMPILETIME)
24908                            ? CopHINTHASH_get(&PL_compiling)
24909                            : CopHINTHASH_get(PL_curcop);
24910         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24911
24912         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24913
24914             /* See if there is an element in the hints hash for this table */
24915             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24916             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24917
24918             if (pos) {
24919                 bool dummy;
24920                 SV * pu_definition;
24921                 SV * pu_invlist;
24922                 SV * expanded_prop_definition =
24923                             sv_2mortal(invlist_clone(prop_definition, NULL));
24924
24925                 /* If so, it's definition is the string from here to the next
24926                  * \a character.  And its format is the same as a user-defined
24927                  * property */
24928                 pos += SvCUR(pu_lookup);
24929                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24930                 pu_invlist = handle_user_defined_property(lookup_name,
24931                                                           lookup_len,
24932                                                           0, /* Not UTF-8 */
24933                                                           0, /* Not folded */
24934                                                           runtime,
24935                                                           deferrable,
24936                                                           pu_definition,
24937                                                           &dummy,
24938                                                           msg,
24939                                                           level);
24940                 if (TAINT_get) {
24941                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24942                     sv_catpvs(msg, "Insecure private-use override");
24943                     goto append_name_to_msg;
24944                 }
24945
24946                 /* For now, as a safety measure, make sure that it doesn't
24947                  * override non-private use code points */
24948                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24949
24950                 /* Add it to the list to be returned */
24951                 _invlist_union(prop_definition, pu_invlist,
24952                                &expanded_prop_definition);
24953                 prop_definition = expanded_prop_definition;
24954                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24955             }
24956         }
24957     }
24958
24959     if (invert_return) {
24960         _invlist_invert(prop_definition);
24961     }
24962     return prop_definition;
24963
24964   unknown_user_defined:
24965     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24966     sv_catpvs(msg, "Unknown user-defined property name");
24967     goto append_name_to_msg;
24968
24969   failed:
24970     if (non_pkg_begin != 0) {
24971         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24972         sv_catpvs(msg, "Illegal user-defined property name");
24973     }
24974     else {
24975         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24976         sv_catpvs(msg, "Can't find Unicode property definition");
24977     }
24978     /* FALLTHROUGH */
24979
24980   append_name_to_msg:
24981     {
24982         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
24983         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
24984
24985         sv_catpv(msg, prefix);
24986         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24987         sv_catpv(msg, suffix);
24988     }
24989
24990     return NULL;
24991
24992   definition_deferred:
24993
24994     {
24995         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
24996
24997         /* Here it could yet to be defined, so defer evaluation of this until
24998          * its needed at runtime.  We need the fully qualified property name to
24999          * avoid ambiguity */
25000         if (! fq_name) {
25001             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25002                                                                 is_qualified);
25003         }
25004
25005         /* If it didn't come with a package, or the package is utf8::, this
25006          * actually could be an official Unicode property whose inclusion we
25007          * are deferring until runtime to make sure that it isn't overridden by
25008          * a user-defined property of the same name (which we haven't
25009          * encountered yet).  Add a marker to indicate this possibility, for
25010          * use at such time when we first need the definition during pattern
25011          * matching execution */
25012         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25013             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25014         }
25015
25016         /* We also need a trailing newline */
25017         sv_catpvs(fq_name, "\n");
25018
25019         *user_defined_ptr = TRUE;
25020         return fq_name;
25021     }
25022 }
25023
25024 STATIC bool
25025 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25026                               const STRLEN wname_len, /* Its length */
25027                               SV ** prop_definition,
25028                               AV ** strings)
25029 {
25030     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25031      * any matches, adding them to prop_definition */
25032
25033     dSP;
25034
25035     CV * get_names_info;        /* entry to charnames.pm to get info we need */
25036     SV * names_string;          /* Contains all character names, except algo */
25037     SV * algorithmic_names;     /* Contains info about algorithmically
25038                                    generated character names */
25039     REGEXP * subpattern_re;     /* The user's pattern to match with */
25040     struct regexp * prog;       /* The compiled pattern */
25041     char * all_names_start;     /* lib/unicore/Name.pl string of every
25042                                    (non-algorithmic) character name */
25043     char * cur_pos;             /* We match, effectively using /gc; this is
25044                                    where we are now */
25045     bool found_matches = FALSE; /* Did any name match so far? */
25046     SV * empty;                 /* For matching zero length names */
25047     SV * must_sv;               /* Contains the substring, if any, that must be
25048                                    in a name for the subpattern to match */
25049     const char * must;          /* The PV of 'must' */
25050     STRLEN must_len;            /* And its length */
25051     SV * syllable_name = NULL;  /* For Hangul syllables */
25052     const char hangul_prefix[] = "HANGUL SYLLABLE ";
25053     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25054
25055     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25056      * syllable name, and these are immutable and guaranteed by the Unicode
25057      * standard to never be extended */
25058     const STRLEN syl_max_len = hangul_prefix_len + 7;
25059
25060     IV i;
25061
25062     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25063
25064     /* Make sure _charnames is loaded.  (The parameters give context
25065      * for any errors generated */
25066     get_names_info = get_cv("_charnames::_get_names_info", 0);
25067     if (! get_names_info) {
25068         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25069     }
25070
25071     /* Get the charnames data */
25072     PUSHSTACKi(PERLSI_REGCOMP);
25073     ENTER ;
25074     SAVETMPS;
25075     save_re_context();
25076
25077     PUSHMARK(SP) ;
25078     PUTBACK;
25079
25080     /* Special _charnames entry point that returns the info this routine
25081      * requires */
25082     call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25083
25084     SPAGAIN ;
25085
25086     /* Data structure for names which end in their very own code points */
25087     algorithmic_names = POPs;
25088     SvREFCNT_inc_simple_void_NN(algorithmic_names);
25089
25090     /* The lib/unicore/Name.pl string */
25091     names_string = POPs;
25092     SvREFCNT_inc_simple_void_NN(names_string);
25093
25094     PUTBACK ;
25095     FREETMPS ;
25096     LEAVE ;
25097     POPSTACK;
25098
25099     if (   ! SvROK(names_string)
25100         || ! SvROK(algorithmic_names))
25101     {   /* Perhaps should panic instead XXX */
25102         SvREFCNT_dec(names_string);
25103         SvREFCNT_dec(algorithmic_names);
25104         return FALSE;
25105     }
25106
25107     names_string = sv_2mortal(SvRV(names_string));
25108     all_names_start = SvPVX(names_string);
25109     cur_pos = all_names_start;
25110
25111     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25112
25113     /* Compile the subpattern consisting of the name being looked for */
25114     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25115
25116     must_sv = re_intuit_string(subpattern_re);
25117     if (must_sv) {
25118         /* regexec.c can free the re_intuit_string() return. GH #17734 */
25119         must_sv = sv_2mortal(newSVsv(must_sv));
25120         must = SvPV(must_sv, must_len);
25121     }
25122     else {
25123         must = "";
25124         must_len = 0;
25125     }
25126
25127     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
25128      * This works because the NUL causes the function to return early, thus
25129      * showing that there are characters in it other than the acceptable ones,
25130      * which is our desired result.) */
25131
25132     prog = ReANY(subpattern_re);
25133
25134     /* If only nothing is matched, skip to where empty names are looked for */
25135     if (prog->maxlen == 0) {
25136         goto check_empty;
25137     }
25138
25139     /* And match against the string of all names /gc.  Don't even try if it
25140      * must match a character not found in any name. */
25141     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25142     {
25143         while (execute_wildcard(subpattern_re,
25144                                 cur_pos,
25145                                 SvEND(names_string),
25146                                 all_names_start, 0,
25147                                 names_string,
25148                                 0))
25149         { /* Here, matched. */
25150
25151             /* Note the string entries look like
25152              *      00001\nSTART OF HEADING\n\n
25153              * so we could match anywhere in that string.  We have to rule out
25154              * matching a code point line */
25155             char * this_name_start = all_names_start
25156                                                 + RX_OFFS(subpattern_re)->start;
25157             char * this_name_end   = all_names_start
25158                                                 + RX_OFFS(subpattern_re)->end;
25159             char * cp_start;
25160             char * cp_end;
25161             UV cp = 0;      /* Silences some compilers */
25162             AV * this_string = NULL;
25163             bool is_multi = FALSE;
25164
25165             /* If matched nothing, advance to next possible match */
25166             if (this_name_start == this_name_end) {
25167                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25168                                           SvEND(names_string) - this_name_end);
25169                 if (cur_pos == NULL) {
25170                     break;
25171                 }
25172             }
25173             else {
25174                 /* Position the next match to start beyond the current returned
25175                  * entry */
25176                 cur_pos = (char *) memchr(this_name_end, '\n',
25177                                           SvEND(names_string) - this_name_end);
25178             }
25179
25180             /* Back up to the \n just before the beginning of the character. */
25181             cp_end = (char *) my_memrchr(all_names_start,
25182                                          '\n',
25183                                          this_name_start - all_names_start);
25184
25185             /* If we didn't find a \n, it means it matched somewhere in the
25186              * initial '00000' in the string, so isn't a real match */
25187             if (cp_end == NULL) {
25188                 continue;
25189             }
25190
25191             this_name_start = cp_end + 1;   /* The name starts just after */
25192             cp_end--;                       /* the \n, and the code point */
25193                                             /* ends just before it */
25194
25195             /* All code points are 5 digits long */
25196             cp_start = cp_end - 4;
25197
25198             /* This shouldn't happen, as we found a \n, and the first \n is
25199              * further along than what we subtracted */
25200             assert(cp_start >= all_names_start);
25201
25202             if (cp_start == all_names_start) {
25203                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25204                 continue;
25205             }
25206
25207             /* If the character is a blank, we either have a named sequence, or
25208              * something is wrong */
25209             if (*(cp_start - 1) == ' ') {
25210                 cp_start = (char *) my_memrchr(all_names_start,
25211                                                '\n',
25212                                                cp_start - all_names_start);
25213                 cp_start++;
25214             }
25215
25216             assert(cp_start != NULL && cp_start >= all_names_start + 2);
25217
25218             /* Except for the first line in the string, the sequence before the
25219              * code point is \n\n.  If that isn't the case here, we didn't
25220              * match the name of a character.  (We could have matched a named
25221              * sequence, not currently handled */
25222             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25223                 continue;
25224             }
25225
25226             /* We matched!  Add this to the list */
25227             found_matches = TRUE;
25228
25229             /* Loop through all the code points in the sequence */
25230             while (cp_start < cp_end) {
25231
25232                 /* Calculate this code point from its 5 digits */
25233                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25234                    + (XDIGIT_VALUE(cp_start[1]) << 12)
25235                    + (XDIGIT_VALUE(cp_start[2]) << 8)
25236                    + (XDIGIT_VALUE(cp_start[3]) << 4)
25237                    +  XDIGIT_VALUE(cp_start[4]);
25238
25239                 cp_start += 6;  /* Go past any blank */
25240
25241                 if (cp_start < cp_end || is_multi) {
25242                     if (this_string == NULL) {
25243                         this_string = newAV();
25244                     }
25245
25246                     is_multi = TRUE;
25247                     av_push(this_string, newSVuv(cp));
25248                 }
25249             }
25250
25251             if (is_multi) { /* Was more than one code point */
25252                 if (*strings == NULL) {
25253                     *strings = newAV();
25254                 }
25255
25256                 av_push(*strings, (SV *) this_string);
25257             }
25258             else {  /* Only a single code point */
25259                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25260             }
25261         } /* End of loop through the non-algorithmic names string */
25262     }
25263
25264     /* There are also character names not in 'names_string'.  These are
25265      * algorithmically generatable.  Try this pattern on each possible one.
25266      * (khw originally planned to leave this out given the large number of
25267      * matches attempted; but the speed turned out to be quite acceptable
25268      *
25269      * There are plenty of opportunities to optimize to skip many of the tests.
25270      * beyond the rudimentary ones already here */
25271
25272     /* First see if the subpattern matches any of the algorithmic generatable
25273      * Hangul syllable names.
25274      *
25275      * We know none of these syllable names will match if the input pattern
25276      * requires more bytes than any syllable has, or if the input pattern only
25277      * matches an empty name, or if the pattern has something it must match and
25278      * one of the characters in that isn't in any Hangul syllable. */
25279     if (    prog->minlen <= (SSize_t) syl_max_len
25280         &&  prog->maxlen > 0
25281         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25282     {
25283         /* These constants, names, values, and algorithm are adapted from the
25284          * Unicode standard, version 5.1, section 3.12, and should never
25285          * change. */
25286         const char * JamoL[] = {
25287             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25288             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25289         };
25290         const int LCount = C_ARRAY_LENGTH(JamoL);
25291
25292         const char * JamoV[] = {
25293             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25294             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25295             "I"
25296         };
25297         const int VCount = C_ARRAY_LENGTH(JamoV);
25298
25299         const char * JamoT[] = {
25300             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25301             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25302             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25303         };
25304         const int TCount = C_ARRAY_LENGTH(JamoT);
25305
25306         int L, V, T;
25307
25308         /* This is the initial Hangul syllable code point; each time through the
25309          * inner loop, it maps to the next higher code point.  For more info,
25310          * see the Hangul syllable section of the Unicode standard. */
25311         int cp = 0xAC00;
25312
25313         syllable_name = sv_2mortal(newSV(syl_max_len));
25314         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25315
25316         for (L = 0; L < LCount; L++) {
25317             for (V = 0; V < VCount; V++) {
25318                 for (T = 0; T < TCount; T++) {
25319
25320                     /* Truncate back to the prefix, which is unvarying */
25321                     SvCUR_set(syllable_name, hangul_prefix_len);
25322
25323                     sv_catpv(syllable_name, JamoL[L]);
25324                     sv_catpv(syllable_name, JamoV[V]);
25325                     sv_catpv(syllable_name, JamoT[T]);
25326
25327                     if (execute_wildcard(subpattern_re,
25328                                 SvPVX(syllable_name),
25329                                 SvEND(syllable_name),
25330                                 SvPVX(syllable_name), 0,
25331                                 syllable_name,
25332                                 0))
25333                     {
25334                         *prop_definition = add_cp_to_invlist(*prop_definition,
25335                                                              cp);
25336                         found_matches = TRUE;
25337                     }
25338
25339                     cp++;
25340                 }
25341             }
25342         }
25343     }
25344
25345     /* The rest of the algorithmically generatable names are of the form
25346      * "PREFIX-code_point".  The prefixes and the code point limits of each
25347      * were returned to us in the array 'algorithmic_names' from data in
25348      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
25349     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25350         IV j;
25351
25352         /* Each element of the array is a hash, giving the details for the
25353          * series of names it covers.  There is the base name of the characters
25354          * in the series, and the low and high code points in the series.  And,
25355          * for optimization purposes a string containing all the legal
25356          * characters that could possibly be in a name in this series. */
25357         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25358         SV * prefix = * hv_fetchs(this_series, "name", 0);
25359         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25360         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25361         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25362
25363         /* Pre-allocate an SV with enough space */
25364         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25365                                                         SvPVX(prefix)));
25366         if (high >= 0x10000) {
25367             sv_catpvs(algo_name, "0");
25368         }
25369
25370         /* This series can be skipped entirely if the pattern requires
25371          * something longer than any name in the series, or can only match an
25372          * empty name, or contains a character not found in any name in the
25373          * series */
25374         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
25375             &&  prog->maxlen > 0
25376             && (strspn(must, legal) == must_len))
25377         {
25378             for (j = low; j <= high; j++) { /* For each code point in the series */
25379
25380                 /* Get its name, and see if it matches the subpattern */
25381                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25382                                      (unsigned) j);
25383
25384                 if (execute_wildcard(subpattern_re,
25385                                     SvPVX(algo_name),
25386                                     SvEND(algo_name),
25387                                     SvPVX(algo_name), 0,
25388                                     algo_name,
25389                                     0))
25390                 {
25391                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
25392                     found_matches = TRUE;
25393                 }
25394             }
25395         }
25396     }
25397
25398   check_empty:
25399     /* Finally, see if the subpattern matches an empty string */
25400     empty = newSVpvs("");
25401     if (execute_wildcard(subpattern_re,
25402                          SvPVX(empty),
25403                          SvEND(empty),
25404                          SvPVX(empty), 0,
25405                          empty,
25406                          0))
25407     {
25408         /* Many code points have empty names.  Currently these are the \p{GC=C}
25409          * ones, minus CC and CF */
25410
25411         SV * empty_names_ref = get_prop_definition(UNI_C);
25412         SV * empty_names = invlist_clone(empty_names_ref, NULL);
25413
25414         SV * subtract = get_prop_definition(UNI_CC);
25415
25416         _invlist_subtract(empty_names, subtract, &empty_names);
25417         SvREFCNT_dec_NN(empty_names_ref);
25418         SvREFCNT_dec_NN(subtract);
25419
25420         subtract = get_prop_definition(UNI_CF);
25421         _invlist_subtract(empty_names, subtract, &empty_names);
25422         SvREFCNT_dec_NN(subtract);
25423
25424         _invlist_union(*prop_definition, empty_names, prop_definition);
25425         found_matches = TRUE;
25426         SvREFCNT_dec_NN(empty_names);
25427     }
25428     SvREFCNT_dec_NN(empty);
25429
25430 #if 0
25431     /* If we ever were to accept aliases for, say private use names, we would
25432      * need to do something fancier to find empty names.  The code below works
25433      * (at the time it was written), and is slower than the above */
25434     const char empties_pat[] = "^.";
25435     if (strNE(name, empties_pat)) {
25436         SV * empty = newSVpvs("");
25437         if (execute_wildcard(subpattern_re,
25438                     SvPVX(empty),
25439                     SvEND(empty),
25440                     SvPVX(empty), 0,
25441                     empty,
25442                     0))
25443         {
25444             SV * empties = NULL;
25445
25446             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25447
25448             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25449             SvREFCNT_dec_NN(empties);
25450
25451             found_matches = TRUE;
25452         }
25453         SvREFCNT_dec_NN(empty);
25454     }
25455 #endif
25456
25457     SvREFCNT_dec_NN(subpattern_re);
25458     return found_matches;
25459 }
25460
25461 /*
25462  * ex: set ts=8 sts=4 sw=4 et:
25463  */