regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #ifdef HAS_ISBLANK
  95 #   define hasISBLANK 1
  96 #else
  97 #   define hasISBLANK 0
  98 #endif
  99
 100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 103
 104 #ifdef op
 105 #undef op
 106 #endif /* op */
 107
 108 #ifdef MSDOS
 109 #  if defined(BUGGY_MSC6)
 110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 111 #    pragma optimize("a",off)
 112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 113 #    pragma optimize("w",on )
 114 #  endif /* BUGGY_MSC6 */
 115 #endif /* MSDOS */
 116
 117 #ifndef STATIC
 118 #define STATIC  static
 119 #endif
 120
 121
 122 typedef struct RExC_state_t {
 123     U32         flags;                  /* RXf_* are we folding, multilining? */
 124     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 125     char        *precomp;               /* uncompiled string. */
 126     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 127     regexp      *rx;                    /* perl core regexp structure */
 128     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     I32         whilem_seen;            /* number of WHILEM in this expr */
 133     regnode     *emit_start;            /* Start of emitted-code area */
 134     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 135     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 136     I32         naughty;                /* How bad is this pattern? */
 137     I32         sawback;                /* Did we see \1, ...? */
 138     U32         seen;
 139     I32         size;                   /* Code size. */
 140     I32         npar;                   /* Capture buffer count, (OPEN). */
 141     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 142     I32         nestroot;               /* root parens we are in - used by accept */
 143     I32         extralen;
 144     I32         seen_zerolen;
 145     regnode     **open_parens;          /* pointers to open parens */
 146     regnode     **close_parens;         /* pointers to close parens */
 147     regnode     *opend;                 /* END node in program */
 148     I32         utf8;           /* whether the pattern is utf8 or not */
 149     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 150                                 /* XXX use this for future optimisation of case
 151                                  * where pattern must be upgraded to utf8. */
 152     I32         uni_semantics;  /* If a d charset modifier should use unicode
 153                                    rules, even if the pattern is not in
 154                                    utf8 */
 155     HV          *paren_names;           /* Paren names */
 156
 157     regnode     **recurse;              /* Recurse regops */
 158     I32         recurse_count;          /* Number of recurse regops */
 159     I32         in_lookbehind;
 160     I32         contains_locale;
 161     I32         override_recoding;
 162     I32         in_multi_char_class;
 163     struct reg_code_block *code_blocks; /* positions of literal (?{})
 164                                             within pattern */
 165     int         num_code_blocks;        /* size of code_blocks[] */
 166     int         code_index;             /* next code_blocks[] slot */
 167 #if ADD_TO_REGEXEC
 168     char        *starttry;              /* -Dr: where regtry was called. */
 169 #define RExC_starttry   (pRExC_state->starttry)
 170 #endif
 171     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 172 #ifdef DEBUGGING
 173     const char  *lastparse;
 174     I32         lastnum;
 175     AV          *paren_name_list;       /* idx -> name */
 176 #define RExC_lastparse  (pRExC_state->lastparse)
 177 #define RExC_lastnum    (pRExC_state->lastnum)
 178 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 179 #endif
 180 } RExC_state_t;
 181
 182 #define RExC_flags      (pRExC_state->flags)
 183 #define RExC_pm_flags   (pRExC_state->pm_flags)
 184 #define RExC_precomp    (pRExC_state->precomp)
 185 #define RExC_rx_sv      (pRExC_state->rx_sv)
 186 #define RExC_rx         (pRExC_state->rx)
 187 #define RExC_rxi        (pRExC_state->rxi)
 188 #define RExC_start      (pRExC_state->start)
 189 #define RExC_end        (pRExC_state->end)
 190 #define RExC_parse      (pRExC_state->parse)
 191 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 192 #ifdef RE_TRACK_PATTERN_OFFSETS
 193 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 194 #endif
 195 #define RExC_emit       (pRExC_state->emit)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_npar       (pRExC_state->npar)
 203 #define RExC_nestroot   (pRExC_state->nestroot)
 204 #define RExC_extralen   (pRExC_state->extralen)
 205 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 206 #define RExC_utf8       (pRExC_state->utf8)
 207 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 208 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 209 #define RExC_open_parens        (pRExC_state->open_parens)
 210 #define RExC_close_parens       (pRExC_state->close_parens)
 211 #define RExC_opend      (pRExC_state->opend)
 212 #define RExC_paren_names        (pRExC_state->paren_names)
 213 #define RExC_recurse    (pRExC_state->recurse)
 214 #define RExC_recurse_count      (pRExC_state->recurse_count)
 215 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 216 #define RExC_contains_locale    (pRExC_state->contains_locale)
 217 #define RExC_override_recoding (pRExC_state->override_recoding)
 218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 219
 220
 221 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 222 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 223         ((*s) == '{' && regcurly(s)))
 224
 225 #ifdef SPSTART
 226 #undef SPSTART          /* dratted cpp namespace... */
 227 #endif
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 241 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 242
 243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 244
 245 /* whether trie related optimizations are enabled */
 246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 247 #define TRIE_STUDY_OPT
 248 #define FULL_TRIE_STUDY
 249 #define TRIE_STCLASS
 250 #endif
 251
 252
 253
 254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 255 #define PBITVAL(paren) (1 << ((paren) & 7))
 256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 259
 260 /* If not already in utf8, do a longjmp back to the beginning */
 261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 262 #define REQUIRE_UTF8    STMT_START {                                       \
 263                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 264                         } STMT_END
 265
 266 /* About scan_data_t.
 267
 268   During optimisation we recurse through the regexp program performing
 269   various inplace (keyhole style) optimisations. In addition study_chunk
 270   and scan_commit populate this data structure with information about
 271   what strings MUST appear in the pattern. We look for the longest
 272   string that must appear at a fixed location, and we look for the
 273   longest string that may appear at a floating location. So for instance
 274   in the pattern:
 275
 276     /FOO[xX]A.*B[xX]BAR/
 277
 278   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 279   strings (because they follow a .* construct). study_chunk will identify
 280   both FOO and BAR as being the longest fixed and floating strings respectively.
 281
 282   The strings can be composites, for instance
 283
 284      /(f)(o)(o)/
 285
 286   will result in a composite fixed substring 'foo'.
 287
 288   For each string some basic information is maintained:
 289
 290   - offset or min_offset
 291     This is the position the string must appear at, or not before.
 292     It also implicitly (when combined with minlenp) tells us how many
 293     characters must match before the string we are searching for.
 294     Likewise when combined with minlenp and the length of the string it
 295     tells us how many characters must appear after the string we have
 296     found.
 297
 298   - max_offset
 299     Only used for floating strings. This is the rightmost point that
 300     the string can appear at. If set to I32 max it indicates that the
 301     string can occur infinitely far to the right.
 302
 303   - minlenp
 304     A pointer to the minimum number of characters of the pattern that the
 305     string was found inside. This is important as in the case of positive
 306     lookahead or positive lookbehind we can have multiple patterns
 307     involved. Consider
 308
 309     /(?=FOO).*F/
 310
 311     The minimum length of the pattern overall is 3, the minimum length
 312     of the lookahead part is 3, but the minimum length of the part that
 313     will actually match is 1. So 'FOO's minimum length is 3, but the
 314     minimum length for the F is 1. This is important as the minimum length
 315     is used to determine offsets in front of and behind the string being
 316     looked for.  Since strings can be composites this is the length of the
 317     pattern at the time it was committed with a scan_commit. Note that
 318     the length is calculated by study_chunk, so that the minimum lengths
 319     are not known until the full pattern has been compiled, thus the
 320     pointer to the value.
 321
 322   - lookbehind
 323
 324     In the case of lookbehind the string being searched for can be
 325     offset past the start point of the final matching string.
 326     If this value was just blithely removed from the min_offset it would
 327     invalidate some of the calculations for how many chars must match
 328     before or after (as they are derived from min_offset and minlen and
 329     the length of the string being searched for).
 330     When the final pattern is compiled and the data is moved from the
 331     scan_data_t structure into the regexp structure the information
 332     about lookbehind is factored in, with the information that would
 333     have been lost precalculated in the end_shift field for the
 334     associated string.
 335
 336   The fields pos_min and pos_delta are used to store the minimum offset
 337   and the delta to the maximum offset at the current point in the pattern.
 338
 339 */
 340
 341 typedef struct scan_data_t {
 342     /*I32 len_min;      unused */
 343     /*I32 len_delta;    unused */
 344     I32 pos_min;
 345     I32 pos_delta;
 346     SV *last_found;
 347     I32 last_end;           /* min value, <0 unless valid. */
 348     I32 last_start_min;
 349     I32 last_start_max;
 350     SV **longest;           /* Either &l_fixed, or &l_float. */
 351     SV *longest_fixed;      /* longest fixed string found in pattern */
 352     I32 offset_fixed;       /* offset where it starts */
 353     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 354     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 355     SV *longest_float;      /* longest floating string found in pattern */
 356     I32 offset_float_min;   /* earliest point in string it can appear */
 357     I32 offset_float_max;   /* latest point in string it can appear */
 358     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 359     I32 lookbehind_float;   /* is the position of the string modified by LB */
 360     I32 flags;
 361     I32 whilem_c;
 362     I32 *last_closep;
 363     struct regnode_charclass_class *start_class;
 364 } scan_data_t;
 365
 366 /*
 367  * Forward declarations for pregcomp()'s friends.
 368  */
 369
 370 static const scan_data_t zero_scan_data =
 371   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 372
 373 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 374 #define SF_BEFORE_SEOL          0x0001
 375 #define SF_BEFORE_MEOL          0x0002
 376 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 377 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 378
 379 #ifdef NO_UNARY_PLUS
 380 #  define SF_FIX_SHIFT_EOL      (0+2)
 381 #  define SF_FL_SHIFT_EOL               (0+4)
 382 #else
 383 #  define SF_FIX_SHIFT_EOL      (+2)
 384 #  define SF_FL_SHIFT_EOL               (+4)
 385 #endif
 386
 387 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 388 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 389
 390 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 391 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 392 #define SF_IS_INF               0x0040
 393 #define SF_HAS_PAR              0x0080
 394 #define SF_IN_PAR               0x0100
 395 #define SF_HAS_EVAL             0x0200
 396 #define SCF_DO_SUBSTR           0x0400
 397 #define SCF_DO_STCLASS_AND      0x0800
 398 #define SCF_DO_STCLASS_OR       0x1000
 399 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 400 #define SCF_WHILEM_VISITED_POS  0x2000
 401
 402 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 403 #define SCF_SEEN_ACCEPT         0x8000
 404
 405 #define UTF cBOOL(RExC_utf8)
 406
 407 /* The enums for all these are ordered so things work out correctly */
 408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 415
 416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 417
 418 #define OOB_NAMEDCLASS          -1
 419
 420 /* There is no code point that is out-of-bounds, so this is problematic.  But
 421  * its only current use is to initialize a variable that is always set before
 422  * looked at. */
 423 #define OOB_UNICODE             0xDEADBEEF
 424
 425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 427
 428
 429 /* length of regex to show in messages that don't mark a position within */
 430 #define RegexLengthToShowInErrorMessages 127
 431
 432 /*
 433  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 434  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 435  * op/pragma/warn/regcomp.
 436  */
 437 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 438 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 439
 440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 441
 442 /*
 443  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 444  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 445  * "...".
 446  */
 447 #define _FAIL(code) STMT_START {                                        \
 448     const char *ellipses = "";                                          \
 449     IV len = RExC_end - RExC_precomp;                                   \
 450                                                                         \
 451     if (!SIZE_ONLY)                                                     \
 452         SAVEFREESV(RExC_rx_sv);                                         \
 453     if (len > RegexLengthToShowInErrorMessages) {                       \
 454         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 455         len = RegexLengthToShowInErrorMessages - 10;                    \
 456         ellipses = "...";                                               \
 457     }                                                                   \
 458     code;                                                               \
 459 } STMT_END
 460
 461 #define FAIL(msg) _FAIL(                            \
 462     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 463             msg, (int)len, RExC_precomp, ellipses))
 464
 465 #define FAIL2(msg,arg) _FAIL(                       \
 466     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 467             arg, (int)len, RExC_precomp, ellipses))
 468
 469 /*
 470  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 471  */
 472 #define Simple_vFAIL(m) STMT_START {                                    \
 473     const IV offset = RExC_parse - RExC_precomp;                        \
 474     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 475             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 476 } STMT_END
 477
 478 /*
 479  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 480  */
 481 #define vFAIL(m) STMT_START {                           \
 482     if (!SIZE_ONLY)                                     \
 483         SAVEFREESV(RExC_rx_sv);                         \
 484     Simple_vFAIL(m);                                    \
 485 } STMT_END
 486
 487 /*
 488  * Like Simple_vFAIL(), but accepts two arguments.
 489  */
 490 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 491     const IV offset = RExC_parse - RExC_precomp;                        \
 492     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 493             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 494 } STMT_END
 495
 496 /*
 497  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 498  */
 499 #define vFAIL2(m,a1) STMT_START {                       \
 500     if (!SIZE_ONLY)                                     \
 501         SAVEFREESV(RExC_rx_sv);                         \
 502     Simple_vFAIL2(m, a1);                               \
 503 } STMT_END
 504
 505
 506 /*
 507  * Like Simple_vFAIL(), but accepts three arguments.
 508  */
 509 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 510     const IV offset = RExC_parse - RExC_precomp;                \
 511     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 512             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 513 } STMT_END
 514
 515 /*
 516  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 517  */
 518 #define vFAIL3(m,a1,a2) STMT_START {                    \
 519     if (!SIZE_ONLY)                                     \
 520         SAVEFREESV(RExC_rx_sv);                         \
 521     Simple_vFAIL3(m, a1, a2);                           \
 522 } STMT_END
 523
 524 /*
 525  * Like Simple_vFAIL(), but accepts four arguments.
 526  */
 527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 528     const IV offset = RExC_parse - RExC_precomp;                \
 529     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 530             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 531 } STMT_END
 532
 533 #define ckWARNreg(loc,m) STMT_START {                                   \
 534     const IV offset = loc - RExC_precomp;                               \
 535     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 536             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 537 } STMT_END
 538
 539 #define ckWARNregdep(loc,m) STMT_START {                                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 542             m REPORT_LOCATION,                                          \
 543             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 544 } STMT_END
 545
 546 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 547     const IV offset = loc - RExC_precomp;                               \
 548     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 549             m REPORT_LOCATION,                                          \
 550             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 551 } STMT_END
 552
 553 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 556             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 557 } STMT_END
 558
 559 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 560     const IV offset = loc - RExC_precomp;                               \
 561     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 562             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 563 } STMT_END
 564
 565 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 566     const IV offset = loc - RExC_precomp;                               \
 567     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 568             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 569 } STMT_END
 570
 571 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 574             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 575 } STMT_END
 576
 577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 578     const IV offset = loc - RExC_precomp;                               \
 579     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 580             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 581 } STMT_END
 582
 583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 584     const IV offset = loc - RExC_precomp;                               \
 585     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 586             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 587 } STMT_END
 588
 589
 590 /* Allow for side effects in s */
 591 #define REGC(c,s) STMT_START {                  \
 592     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 593 } STMT_END
 594
 595 /* Macros for recording node offsets.   20001227 mjd@plover.com
 596  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 597  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 598  * Element 0 holds the number n.
 599  * Position is 1 indexed.
 600  */
 601 #ifndef RE_TRACK_PATTERN_OFFSETS
 602 #define Set_Node_Offset_To_R(node,byte)
 603 #define Set_Node_Offset(node,byte)
 604 #define Set_Cur_Node_Offset
 605 #define Set_Node_Length_To_R(node,len)
 606 #define Set_Node_Length(node,len)
 607 #define Set_Node_Cur_Length(node)
 608 #define Node_Offset(n)
 609 #define Node_Length(n)
 610 #define Set_Node_Offset_Length(node,offset,len)
 611 #define ProgLen(ri) ri->u.proglen
 612 #define SetProgLen(ri,x) ri->u.proglen = x
 613 #else
 614 #define ProgLen(ri) ri->u.offsets[0]
 615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 616 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 617     if (! SIZE_ONLY) {                                                  \
 618         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 619                     __LINE__, (int)(node), (int)(byte)));               \
 620         if((node) < 0) {                                                \
 621             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 622         } else {                                                        \
 623             RExC_offsets[2*(node)-1] = (byte);                          \
 624         }                                                               \
 625     }                                                                   \
 626 } STMT_END
 627
 628 #define Set_Node_Offset(node,byte) \
 629     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 631
 632 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 633     if (! SIZE_ONLY) {                                                  \
 634         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 635                 __LINE__, (int)(node), (int)(len)));                    \
 636         if((node) < 0) {                                                \
 637             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 638         } else {                                                        \
 639             RExC_offsets[2*(node)] = (len);                             \
 640         }                                                               \
 641     }                                                                   \
 642 } STMT_END
 643
 644 #define Set_Node_Length(node,len) \
 645     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 647 #define Set_Node_Cur_Length(node) \
 648     Set_Node_Length(node, RExC_parse - parse_start)
 649
 650 /* Get offsets and lengths */
 651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 653
 654 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 655     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 656     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 657 } STMT_END
 658 #endif
 659
 660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 661 #define EXPERIMENTAL_INPLACESCAN
 662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 663
 664 #define DEBUG_STUDYDATA(str,data,depth)                              \
 665 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 666     PerlIO_printf(Perl_debug_log,                                    \
 667         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 668         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 669         (int)(depth)*2, "",                                          \
 670         (IV)((data)->pos_min),                                       \
 671         (IV)((data)->pos_delta),                                     \
 672         (UV)((data)->flags),                                         \
 673         (IV)((data)->whilem_c),                                      \
 674         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 675         is_inf ? "INF " : ""                                         \
 676     );                                                               \
 677     if ((data)->last_found)                                          \
 678         PerlIO_printf(Perl_debug_log,                                \
 679             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 680             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 681             SvPVX_const((data)->last_found),                         \
 682             (IV)((data)->last_end),                                  \
 683             (IV)((data)->last_start_min),                            \
 684             (IV)((data)->last_start_max),                            \
 685             ((data)->longest &&                                      \
 686              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 687             SvPVX_const((data)->longest_fixed),                      \
 688             (IV)((data)->offset_fixed),                              \
 689             ((data)->longest &&                                      \
 690              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 691             SvPVX_const((data)->longest_float),                      \
 692             (IV)((data)->offset_float_min),                          \
 693             (IV)((data)->offset_float_max)                           \
 694         );                                                           \
 695     PerlIO_printf(Perl_debug_log,"\n");                              \
 696 });
 697
 698 /* Mark that we cannot extend a found fixed substring at this point.
 699    Update the longest found anchored substring and the longest found
 700    floating substrings if needed. */
 701
 702 STATIC void
 703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 704 {
 705     const STRLEN l = CHR_SVLEN(data->last_found);
 706     const STRLEN old_l = CHR_SVLEN(*data->longest);
 707     GET_RE_DEBUG_FLAGS_DECL;
 708
 709     PERL_ARGS_ASSERT_SCAN_COMMIT;
 710
 711     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 712         SvSetMagicSV(*data->longest, data->last_found);
 713         if (*data->longest == data->longest_fixed) {
 714             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 715             if (data->flags & SF_BEFORE_EOL)
 716                 data->flags
 717                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 718             else
 719                 data->flags &= ~SF_FIX_BEFORE_EOL;
 720             data->minlen_fixed=minlenp;
 721             data->lookbehind_fixed=0;
 722         }
 723         else { /* *data->longest == data->longest_float */
 724             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 725             data->offset_float_max = (l
 726                                       ? data->last_start_max
 727                                       : data->pos_min + data->pos_delta);
 728             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 729                 data->offset_float_max = I32_MAX;
 730             if (data->flags & SF_BEFORE_EOL)
 731                 data->flags
 732                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 733             else
 734                 data->flags &= ~SF_FL_BEFORE_EOL;
 735             data->minlen_float=minlenp;
 736             data->lookbehind_float=0;
 737         }
 738     }
 739     SvCUR_set(data->last_found, 0);
 740     {
 741         SV * const sv = data->last_found;
 742         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 743             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 744             if (mg)
 745                 mg->mg_len = 0;
 746         }
 747     }
 748     data->last_end = -1;
 749     data->flags &= ~SF_BEFORE_EOL;
 750     DEBUG_STUDYDATA("commit: ",data,0);
 751 }
 752
 753 /* Can match anything (initialization) */
 754 STATIC void
 755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 756 {
 757     PERL_ARGS_ASSERT_CL_ANYTHING;
 758
 759     ANYOF_BITMAP_SETALL(cl);
 760     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 761                 |ANYOF_NON_UTF8_LATIN1_ALL;
 762
 763     /* If any portion of the regex is to operate under locale rules,
 764      * initialization includes it.  The reason this isn't done for all regexes
 765      * is that the optimizer was written under the assumption that locale was
 766      * all-or-nothing.  Given the complexity and lack of documentation in the
 767      * optimizer, and that there are inadequate test cases for locale, so many
 768      * parts of it may not work properly, it is safest to avoid locale unless
 769      * necessary. */
 770     if (RExC_contains_locale) {
 771         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 772         cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
 773     }
 774     else {
 775         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 776     }
 777 }
 778
 779 /* Can match anything (initialization) */
 780 STATIC int
 781 S_cl_is_anything(const struct regnode_charclass_class *cl)
 782 {
 783     int value;
 784
 785     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 786
 787     for (value = 0; value <= ANYOF_MAX; value += 2)
 788         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 789             return 1;
 790     if (!(cl->flags & ANYOF_UNICODE_ALL))
 791         return 0;
 792     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 793         return 0;
 794     return 1;
 795 }
 796
 797 /* Can match anything (initialization) */
 798 STATIC void
 799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 800 {
 801     PERL_ARGS_ASSERT_CL_INIT;
 802
 803     Zero(cl, 1, struct regnode_charclass_class);
 804     cl->type = ANYOF;
 805     cl_anything(pRExC_state, cl);
 806     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 807 }
 808
 809 /* These two functions currently do the exact same thing */
 810 #define cl_init_zero            S_cl_init
 811
 812 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 813  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 814  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 815 STATIC void
 816 S_cl_and(struct regnode_charclass_class *cl,
 817         const struct regnode_charclass_class *and_with)
 818 {
 819     PERL_ARGS_ASSERT_CL_AND;
 820
 821     assert(and_with->type == ANYOF);
 822
 823     /* I (khw) am not sure all these restrictions are necessary XXX */
 824     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 825         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 826         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 827         && !(and_with->flags & ANYOF_LOC_FOLD)
 828         && !(cl->flags & ANYOF_LOC_FOLD)) {
 829         int i;
 830
 831         if (and_with->flags & ANYOF_INVERT)
 832             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 833                 cl->bitmap[i] &= ~and_with->bitmap[i];
 834         else
 835             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 836                 cl->bitmap[i] &= and_with->bitmap[i];
 837     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 838
 839     if (and_with->flags & ANYOF_INVERT) {
 840
 841         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 842          * aren't affected by the inversion.  Those that are affected are
 843          * handled individually below */
 844         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 845         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 846         cl->flags |= affected_flags;
 847
 848         /* We currently don't know how to deal with things that aren't in the
 849          * bitmap, but we know that the intersection is no greater than what
 850          * is already in cl, so let there be false positives that get sorted
 851          * out after the synthetic start class succeeds, and the node is
 852          * matched for real. */
 853
 854         /* The inversion of these two flags indicate that the resulting
 855          * intersection doesn't have them */
 856         if (and_with->flags & ANYOF_UNICODE_ALL) {
 857             cl->flags &= ~ANYOF_UNICODE_ALL;
 858         }
 859         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 860             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 861         }
 862     }
 863     else {   /* and'd node is not inverted */
 864         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 865
 866         if (! ANYOF_NONBITMAP(and_with)) {
 867
 868             /* Here 'and_with' doesn't match anything outside the bitmap
 869              * (except possibly ANYOF_UNICODE_ALL), which means the
 870              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 871              * which case we don't know what the intersection is, but it's no
 872              * greater than what cl already has, so can just leave it alone,
 873              * with possible false positives */
 874             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 875                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 876                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 877             }
 878         }
 879         else if (! ANYOF_NONBITMAP(cl)) {
 880
 881             /* Here, 'and_with' does match something outside the bitmap, and cl
 882              * doesn't have a list of things to match outside the bitmap.  If
 883              * cl can match all code points above 255, the intersection will
 884              * be those above-255 code points that 'and_with' matches.  If cl
 885              * can't match all Unicode code points, it means that it can't
 886              * match anything outside the bitmap (since the 'if' that got us
 887              * into this block tested for that), so we leave the bitmap empty.
 888              */
 889             if (cl->flags & ANYOF_UNICODE_ALL) {
 890                 ARG_SET(cl, ARG(and_with));
 891
 892                 /* and_with's ARG may match things that don't require UTF8.
 893                  * And now cl's will too, in spite of this being an 'and'.  See
 894                  * the comments below about the kludge */
 895                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 896             }
 897         }
 898         else {
 899             /* Here, both 'and_with' and cl match something outside the
 900              * bitmap.  Currently we do not do the intersection, so just match
 901              * whatever cl had at the beginning.  */
 902         }
 903
 904
 905         /* Take the intersection of the two sets of flags.  However, the
 906          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 907          * kludge around the fact that this flag is not treated like the others
 908          * which are initialized in cl_anything().  The way the optimizer works
 909          * is that the synthetic start class (SSC) is initialized to match
 910          * anything, and then the first time a real node is encountered, its
 911          * values are AND'd with the SSC's with the result being the values of
 912          * the real node.  However, there are paths through the optimizer where
 913          * the AND never gets called, so those initialized bits are set
 914          * inappropriately, which is not usually a big deal, as they just cause
 915          * false positives in the SSC, which will just mean a probably
 916          * imperceptible slow down in execution.  However this bit has a
 917          * higher false positive consequence in that it can cause utf8.pm,
 918          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 919          * bigger slowdown and also causes significant extra memory to be used.
 920          * In order to prevent this, the code now takes a different tack.  The
 921          * bit isn't set unless some part of the regular expression needs it,
 922          * but once set it won't get cleared.  This means that these extra
 923          * modules won't get loaded unless there was some path through the
 924          * pattern that would have required them anyway, and  so any false
 925          * positives that occur by not ANDing them out when they could be
 926          * aren't as severe as they would be if we treated this bit like all
 927          * the others */
 928         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 929                                       & ANYOF_NONBITMAP_NON_UTF8;
 930         cl->flags &= and_with->flags;
 931         cl->flags |= outside_bitmap_but_not_utf8;
 932     }
 933 }
 934
 935 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 936  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 937  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 938 STATIC void
 939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 940 {
 941     PERL_ARGS_ASSERT_CL_OR;
 942
 943     if (or_with->flags & ANYOF_INVERT) {
 944
 945         /* Here, the or'd node is to be inverted.  This means we take the
 946          * complement of everything not in the bitmap, but currently we don't
 947          * know what that is, so give up and match anything */
 948         if (ANYOF_NONBITMAP(or_with)) {
 949             cl_anything(pRExC_state, cl);
 950         }
 951         /* We do not use
 952          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 953          *   <= (B1 | !B2) | (CL1 | !CL2)
 954          * which is wasteful if CL2 is small, but we ignore CL2:
 955          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 956          * XXXX Can we handle case-fold?  Unclear:
 957          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 958          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 959          */
 960         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 961              && !(or_with->flags & ANYOF_LOC_FOLD)
 962              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 963             int i;
 964
 965             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966                 cl->bitmap[i] |= ~or_with->bitmap[i];
 967         } /* XXXX: logic is complicated otherwise */
 968         else {
 969             cl_anything(pRExC_state, cl);
 970         }
 971
 972         /* And, we can just take the union of the flags that aren't affected
 973          * by the inversion */
 974         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 975
 976         /* For the remaining flags:
 977             ANYOF_UNICODE_ALL and inverted means to not match anything above
 978                     255, which means that the union with cl should just be
 979                     what cl has in it, so can ignore this flag
 980             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 981                     is 127-255 to match them, but then invert that, so the
 982                     union with cl should just be what cl has in it, so can
 983                     ignore this flag
 984          */
 985     } else {    /* 'or_with' is not inverted */
 986         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 987         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 988              && (!(or_with->flags & ANYOF_LOC_FOLD)
 989                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
 990             int i;
 991
 992             /* OR char bitmap and class bitmap separately */
 993             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 994                 cl->bitmap[i] |= or_with->bitmap[i];
 995             ANYOF_CLASS_OR(or_with, cl);
 996         }
 997         else { /* XXXX: logic is complicated, leave it along for a moment. */
 998             cl_anything(pRExC_state, cl);
 999         }
1000
1001         if (ANYOF_NONBITMAP(or_with)) {
1002
1003             /* Use the added node's outside-the-bit-map match if there isn't a
1004              * conflict.  If there is a conflict (both nodes match something
1005              * outside the bitmap, but what they match outside is not the same
1006              * pointer, and hence not easily compared until XXX we extend
1007              * inversion lists this far), give up and allow the start class to
1008              * match everything outside the bitmap.  If that stuff is all above
1009              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1010             if (! ANYOF_NONBITMAP(cl)) {
1011                 ARG_SET(cl, ARG(or_with));
1012             }
1013             else if (ARG(cl) != ARG(or_with)) {
1014
1015                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1016                     cl_anything(pRExC_state, cl);
1017                 }
1018                 else {
1019                     cl->flags |= ANYOF_UNICODE_ALL;
1020                 }
1021             }
1022         }
1023
1024         /* Take the union */
1025         cl->flags |= or_with->flags;
1026     }
1027 }
1028
1029 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1030 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1031 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1032 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1033
1034
1035 #ifdef DEBUGGING
1036 /*
1037    dump_trie(trie,widecharmap,revcharmap)
1038    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1039    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1040
1041    These routines dump out a trie in a somewhat readable format.
1042    The _interim_ variants are used for debugging the interim
1043    tables that are used to generate the final compressed
1044    representation which is what dump_trie expects.
1045
1046    Part of the reason for their existence is to provide a form
1047    of documentation as to how the different representations function.
1048
1049 */
1050
1051 /*
1052   Dumps the final compressed table form of the trie to Perl_debug_log.
1053   Used for debugging make_trie().
1054 */
1055
1056 STATIC void
1057 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1058             AV *revcharmap, U32 depth)
1059 {
1060     U32 state;
1061     SV *sv=sv_newmortal();
1062     int colwidth= widecharmap ? 6 : 4;
1063     U16 word;
1064     GET_RE_DEBUG_FLAGS_DECL;
1065
1066     PERL_ARGS_ASSERT_DUMP_TRIE;
1067
1068     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1069         (int)depth * 2 + 2,"",
1070         "Match","Base","Ofs" );
1071
1072     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1073         SV ** const tmp = av_fetch( revcharmap, state, 0);
1074         if ( tmp ) {
1075             PerlIO_printf( Perl_debug_log, "%*s",
1076                 colwidth,
1077                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1078                             PL_colors[0], PL_colors[1],
1079                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1080                             PERL_PV_ESCAPE_FIRSTCHAR
1081                 )
1082             );
1083         }
1084     }
1085     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1086         (int)depth * 2 + 2,"");
1087
1088     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1089         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1090     PerlIO_printf( Perl_debug_log, "\n");
1091
1092     for( state = 1 ; state < trie->statecount ; state++ ) {
1093         const U32 base = trie->states[ state ].trans.base;
1094
1095         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1096
1097         if ( trie->states[ state ].wordnum ) {
1098             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1099         } else {
1100             PerlIO_printf( Perl_debug_log, "%6s", "" );
1101         }
1102
1103         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1104
1105         if ( base ) {
1106             U32 ofs = 0;
1107
1108             while( ( base + ofs  < trie->uniquecharcount ) ||
1109                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1110                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1111                     ofs++;
1112
1113             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1114
1115             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1116                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1117                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1118                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1119                 {
1120                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1121                     colwidth,
1122                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1123                 } else {
1124                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1125                 }
1126             }
1127
1128             PerlIO_printf( Perl_debug_log, "]");
1129
1130         }
1131         PerlIO_printf( Perl_debug_log, "\n" );
1132     }
1133     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1134     for (word=1; word <= trie->wordcount; word++) {
1135         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1136             (int)word, (int)(trie->wordinfo[word].prev),
1137             (int)(trie->wordinfo[word].len));
1138     }
1139     PerlIO_printf(Perl_debug_log, "\n" );
1140 }
1141 /*
1142   Dumps a fully constructed but uncompressed trie in list form.
1143   List tries normally only are used for construction when the number of
1144   possible chars (trie->uniquecharcount) is very high.
1145   Used for debugging make_trie().
1146 */
1147 STATIC void
1148 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1149                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1150                          U32 depth)
1151 {
1152     U32 state;
1153     SV *sv=sv_newmortal();
1154     int colwidth= widecharmap ? 6 : 4;
1155     GET_RE_DEBUG_FLAGS_DECL;
1156
1157     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1158
1159     /* print out the table precompression.  */
1160     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1161         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1162         "------:-----+-----------------\n" );
1163
1164     for( state=1 ; state < next_alloc ; state ++ ) {
1165         U16 charid;
1166
1167         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1168             (int)depth * 2 + 2,"", (UV)state  );
1169         if ( ! trie->states[ state ].wordnum ) {
1170             PerlIO_printf( Perl_debug_log, "%5s| ","");
1171         } else {
1172             PerlIO_printf( Perl_debug_log, "W%4x| ",
1173                 trie->states[ state ].wordnum
1174             );
1175         }
1176         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1177             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1178             if ( tmp ) {
1179                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1180                     colwidth,
1181                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1182                             PL_colors[0], PL_colors[1],
1183                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1184                             PERL_PV_ESCAPE_FIRSTCHAR
1185                     ) ,
1186                     TRIE_LIST_ITEM(state,charid).forid,
1187                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1188                 );
1189                 if (!(charid % 10))
1190                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1191                         (int)((depth * 2) + 14), "");
1192             }
1193         }
1194         PerlIO_printf( Perl_debug_log, "\n");
1195     }
1196 }
1197
1198 /*
1199   Dumps a fully constructed but uncompressed trie in table form.
1200   This is the normal DFA style state transition table, with a few
1201   twists to facilitate compression later.
1202   Used for debugging make_trie().
1203 */
1204 STATIC void
1205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1206                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1207                           U32 depth)
1208 {
1209     U32 state;
1210     U16 charid;
1211     SV *sv=sv_newmortal();
1212     int colwidth= widecharmap ? 6 : 4;
1213     GET_RE_DEBUG_FLAGS_DECL;
1214
1215     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1216
1217     /*
1218        print out the table precompression so that we can do a visual check
1219        that they are identical.
1220      */
1221
1222     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1223
1224     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1225         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1226         if ( tmp ) {
1227             PerlIO_printf( Perl_debug_log, "%*s",
1228                 colwidth,
1229                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1230                             PL_colors[0], PL_colors[1],
1231                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1232                             PERL_PV_ESCAPE_FIRSTCHAR
1233                 )
1234             );
1235         }
1236     }
1237
1238     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1239
1240     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1241         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1242     }
1243
1244     PerlIO_printf( Perl_debug_log, "\n" );
1245
1246     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1247
1248         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1249             (int)depth * 2 + 2,"",
1250             (UV)TRIE_NODENUM( state ) );
1251
1252         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1253             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1254             if (v)
1255                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1256             else
1257                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1258         }
1259         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1260             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1261         } else {
1262             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1263             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1264         }
1265     }
1266 }
1267
1268 #endif
1269
1270
1271 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1272   startbranch: the first branch in the whole branch sequence
1273   first      : start branch of sequence of branch-exact nodes.
1274                May be the same as startbranch
1275   last       : Thing following the last branch.
1276                May be the same as tail.
1277   tail       : item following the branch sequence
1278   count      : words in the sequence
1279   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1280   depth      : indent depth
1281
1282 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1283
1284 A trie is an N'ary tree where the branches are determined by digital
1285 decomposition of the key. IE, at the root node you look up the 1st character and
1286 follow that branch repeat until you find the end of the branches. Nodes can be
1287 marked as "accepting" meaning they represent a complete word. Eg:
1288
1289   /he|she|his|hers/
1290
1291 would convert into the following structure. Numbers represent states, letters
1292 following numbers represent valid transitions on the letter from that state, if
1293 the number is in square brackets it represents an accepting state, otherwise it
1294 will be in parenthesis.
1295
1296       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1297       |    |
1298       |   (2)
1299       |    |
1300      (1)   +-i->(6)-+-s->[7]
1301       |
1302       +-s->(3)-+-h->(4)-+-e->[5]
1303
1304       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1305
1306 This shows that when matching against the string 'hers' we will begin at state 1
1307 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1308 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1309 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1310 single traverse. We store a mapping from accepting to state to which word was
1311 matched, and then when we have multiple possibilities we try to complete the
1312 rest of the regex in the order in which they occured in the alternation.
1313
1314 The only prior NFA like behaviour that would be changed by the TRIE support is
1315 the silent ignoring of duplicate alternations which are of the form:
1316
1317  / (DUPE|DUPE) X? (?{ ... }) Y /x
1318
1319 Thus EVAL blocks following a trie may be called a different number of times with
1320 and without the optimisation. With the optimisations dupes will be silently
1321 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1322 the following demonstrates:
1323
1324  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1325
1326 which prints out 'word' three times, but
1327
1328  'words'=~/(word|word|word)(?{ print $1 })S/
1329
1330 which doesnt print it out at all. This is due to other optimisations kicking in.
1331
1332 Example of what happens on a structural level:
1333
1334 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1335
1336    1: CURLYM[1] {1,32767}(18)
1337    5:   BRANCH(8)
1338    6:     EXACT <ac>(16)
1339    8:   BRANCH(11)
1340    9:     EXACT <ad>(16)
1341   11:   BRANCH(14)
1342   12:     EXACT <ab>(16)
1343   16:   SUCCEED(0)
1344   17:   NOTHING(18)
1345   18: END(0)
1346
1347 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1348 and should turn into:
1349
1350    1: CURLYM[1] {1,32767}(18)
1351    5:   TRIE(16)
1352         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1353           <ac>
1354           <ad>
1355           <ab>
1356   16:   SUCCEED(0)
1357   17:   NOTHING(18)
1358   18: END(0)
1359
1360 Cases where tail != last would be like /(?foo|bar)baz/:
1361
1362    1: BRANCH(4)
1363    2:   EXACT <foo>(8)
1364    4: BRANCH(7)
1365    5:   EXACT <bar>(8)
1366    7: TAIL(8)
1367    8: EXACT <baz>(10)
1368   10: END(0)
1369
1370 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1371 and would end up looking like:
1372
1373     1: TRIE(8)
1374       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1375         <foo>
1376         <bar>
1377    7: TAIL(8)
1378    8: EXACT <baz>(10)
1379   10: END(0)
1380
1381     d = uvuni_to_utf8_flags(d, uv, 0);
1382
1383 is the recommended Unicode-aware way of saying
1384
1385     *(d++) = uv;
1386 */
1387
1388 #define TRIE_STORE_REVCHAR(val)                                            \
1389     STMT_START {                                                           \
1390         if (UTF) {                                                         \
1391             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1392             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1393             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1394             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1395             SvPOK_on(zlopp);                                               \
1396             SvUTF8_on(zlopp);                                              \
1397             av_push(revcharmap, zlopp);                                    \
1398         } else {                                                           \
1399             char ooooff = (char)val;                                           \
1400             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1401         }                                                                  \
1402         } STMT_END
1403
1404 #define TRIE_READ_CHAR STMT_START {                                                     \
1405     wordlen++;                                                                          \
1406     if ( UTF ) {                                                                        \
1407         /* if it is UTF then it is either already folded, or does not need folding */   \
1408         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1409     }                                                                                   \
1410     else if (folder == PL_fold_latin1) {                                                \
1411         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1412         if ( foldlen > 0 ) {                                                            \
1413            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1414            foldlen -= len;                                                              \
1415            scan += len;                                                                 \
1416            len = 0;                                                                     \
1417         } else {                                                                        \
1418             len = 1;                                                                    \
1419             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1420             skiplen = UNISKIP(uvc);                                                     \
1421             foldlen -= skiplen;                                                         \
1422             scan = foldbuf + skiplen;                                                   \
1423         }                                                                               \
1424     } else {                                                                            \
1425         /* raw data, will be folded later if needed */                                  \
1426         uvc = (U32)*uc;                                                                 \
1427         len = 1;                                                                        \
1428     }                                                                                   \
1429 } STMT_END
1430
1431
1432
1433 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1434     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1435         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1436         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1437     }                                                           \
1438     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1439     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1440     TRIE_LIST_CUR( state )++;                                   \
1441 } STMT_END
1442
1443 #define TRIE_LIST_NEW(state) STMT_START {                       \
1444     Newxz( trie->states[ state ].trans.list,               \
1445         4, reg_trie_trans_le );                                 \
1446      TRIE_LIST_CUR( state ) = 1;                                \
1447      TRIE_LIST_LEN( state ) = 4;                                \
1448 } STMT_END
1449
1450 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1451     U16 dupe= trie->states[ state ].wordnum;                    \
1452     regnode * const noper_next = regnext( noper );              \
1453                                                                 \
1454     DEBUG_r({                                                   \
1455         /* store the word for dumping */                        \
1456         SV* tmp;                                                \
1457         if (OP(noper) != NOTHING)                               \
1458             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1459         else                                                    \
1460             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1461         av_push( trie_words, tmp );                             \
1462     });                                                         \
1463                                                                 \
1464     curword++;                                                  \
1465     trie->wordinfo[curword].prev   = 0;                         \
1466     trie->wordinfo[curword].len    = wordlen;                   \
1467     trie->wordinfo[curword].accept = state;                     \
1468                                                                 \
1469     if ( noper_next < tail ) {                                  \
1470         if (!trie->jump)                                        \
1471             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1472         trie->jump[curword] = (U16)(noper_next - convert);      \
1473         if (!jumper)                                            \
1474             jumper = noper_next;                                \
1475         if (!nextbranch)                                        \
1476             nextbranch= regnext(cur);                           \
1477     }                                                           \
1478                                                                 \
1479     if ( dupe ) {                                               \
1480         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1481         /* chain, so that when the bits of chain are later    */\
1482         /* linked together, the dups appear in the chain      */\
1483         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1484         trie->wordinfo[dupe].prev = curword;                    \
1485     } else {                                                    \
1486         /* we haven't inserted this word yet.                */ \
1487         trie->states[ state ].wordnum = curword;                \
1488     }                                                           \
1489 } STMT_END
1490
1491
1492 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1493      ( ( base + charid >=  ucharcount                                   \
1494          && base + charid < ubound                                      \
1495          && state == trie->trans[ base - ucharcount + charid ].check    \
1496          && trie->trans[ base - ucharcount + charid ].next )            \
1497            ? trie->trans[ base - ucharcount + charid ].next             \
1498            : ( state==1 ? special : 0 )                                 \
1499       )
1500
1501 #define MADE_TRIE       1
1502 #define MADE_JUMP_TRIE  2
1503 #define MADE_EXACT_TRIE 4
1504
1505 STATIC I32
1506 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1507 {
1508     dVAR;
1509     /* first pass, loop through and scan words */
1510     reg_trie_data *trie;
1511     HV *widecharmap = NULL;
1512     AV *revcharmap = newAV();
1513     regnode *cur;
1514     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1515     STRLEN len = 0;
1516     UV uvc = 0;
1517     U16 curword = 0;
1518     U32 next_alloc = 0;
1519     regnode *jumper = NULL;
1520     regnode *nextbranch = NULL;
1521     regnode *convert = NULL;
1522     U32 *prev_states; /* temp array mapping each state to previous one */
1523     /* we just use folder as a flag in utf8 */
1524     const U8 * folder = NULL;
1525
1526 #ifdef DEBUGGING
1527     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1528     AV *trie_words = NULL;
1529     /* along with revcharmap, this only used during construction but both are
1530      * useful during debugging so we store them in the struct when debugging.
1531      */
1532 #else
1533     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1534     STRLEN trie_charcount=0;
1535 #endif
1536     SV *re_trie_maxbuff;
1537     GET_RE_DEBUG_FLAGS_DECL;
1538
1539     PERL_ARGS_ASSERT_MAKE_TRIE;
1540 #ifndef DEBUGGING
1541     PERL_UNUSED_ARG(depth);
1542 #endif
1543
1544     switch (flags) {
1545         case EXACT: break;
1546         case EXACTFA:
1547         case EXACTFU_SS:
1548         case EXACTFU_TRICKYFOLD:
1549         case EXACTFU: folder = PL_fold_latin1; break;
1550         case EXACTF:  folder = PL_fold; break;
1551         case EXACTFL: folder = PL_fold_locale; break;
1552         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1553     }
1554
1555     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1556     trie->refcount = 1;
1557     trie->startstate = 1;
1558     trie->wordcount = word_count;
1559     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1560     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1561     if (flags == EXACT)
1562         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1563     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1564                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1565
1566     DEBUG_r({
1567         trie_words = newAV();
1568     });
1569
1570     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1571     if (!SvIOK(re_trie_maxbuff)) {
1572         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1573     }
1574     DEBUG_TRIE_COMPILE_r({
1575                 PerlIO_printf( Perl_debug_log,
1576                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1577                   (int)depth * 2 + 2, "",
1578                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1579                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1580                   (int)depth);
1581     });
1582
1583    /* Find the node we are going to overwrite */
1584     if ( first == startbranch && OP( last ) != BRANCH ) {
1585         /* whole branch chain */
1586         convert = first;
1587     } else {
1588         /* branch sub-chain */
1589         convert = NEXTOPER( first );
1590     }
1591
1592     /*  -- First loop and Setup --
1593
1594        We first traverse the branches and scan each word to determine if it
1595        contains widechars, and how many unique chars there are, this is
1596        important as we have to build a table with at least as many columns as we
1597        have unique chars.
1598
1599        We use an array of integers to represent the character codes 0..255
1600        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1601        native representation of the character value as the key and IV's for the
1602        coded index.
1603
1604        *TODO* If we keep track of how many times each character is used we can
1605        remap the columns so that the table compression later on is more
1606        efficient in terms of memory by ensuring the most common value is in the
1607        middle and the least common are on the outside.  IMO this would be better
1608        than a most to least common mapping as theres a decent chance the most
1609        common letter will share a node with the least common, meaning the node
1610        will not be compressible. With a middle is most common approach the worst
1611        case is when we have the least common nodes twice.
1612
1613      */
1614
1615     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1616         regnode *noper = NEXTOPER( cur );
1617         const U8 *uc = (U8*)STRING( noper );
1618         const U8 *e  = uc + STR_LEN( noper );
1619         STRLEN foldlen = 0;
1620         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1621         STRLEN skiplen = 0;
1622         const U8 *scan = (U8*)NULL;
1623         U32 wordlen      = 0;         /* required init */
1624         STRLEN chars = 0;
1625         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1626
1627         if (OP(noper) == NOTHING) {
1628             regnode *noper_next= regnext(noper);
1629             if (noper_next != tail && OP(noper_next) == flags) {
1630                 noper = noper_next;
1631                 uc= (U8*)STRING(noper);
1632                 e= uc + STR_LEN(noper);
1633                 trie->minlen= STR_LEN(noper);
1634             } else {
1635                 trie->minlen= 0;
1636                 continue;
1637             }
1638         }
1639
1640         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1641             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1642                                           regardless of encoding */
1643             if (OP( noper ) == EXACTFU_SS) {
1644                 /* false positives are ok, so just set this */
1645                 TRIE_BITMAP_SET(trie,0xDF);
1646             }
1647         }
1648         for ( ; uc < e ; uc += len ) {
1649             TRIE_CHARCOUNT(trie)++;
1650             TRIE_READ_CHAR;
1651             chars++;
1652             if ( uvc < 256 ) {
1653                 if ( folder ) {
1654                     U8 folded= folder[ (U8) uvc ];
1655                     if ( !trie->charmap[ folded ] ) {
1656                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1657                         TRIE_STORE_REVCHAR( folded );
1658                     }
1659                 }
1660                 if ( !trie->charmap[ uvc ] ) {
1661                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1662                     TRIE_STORE_REVCHAR( uvc );
1663                 }
1664                 if ( set_bit ) {
1665                     /* store the codepoint in the bitmap, and its folded
1666                      * equivalent. */
1667                     TRIE_BITMAP_SET(trie, uvc);
1668
1669                     /* store the folded codepoint */
1670                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1671
1672                     if ( !UTF ) {
1673                         /* store first byte of utf8 representation of
1674                            variant codepoints */
1675                         if (! UNI_IS_INVARIANT(uvc)) {
1676                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1677                         }
1678                     }
1679                     set_bit = 0; /* We've done our bit :-) */
1680                 }
1681             } else {
1682                 SV** svpp;
1683                 if ( !widecharmap )
1684                     widecharmap = newHV();
1685
1686                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1687
1688                 if ( !svpp )
1689                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1690
1691                 if ( !SvTRUE( *svpp ) ) {
1692                     sv_setiv( *svpp, ++trie->uniquecharcount );
1693                     TRIE_STORE_REVCHAR(uvc);
1694                 }
1695             }
1696         }
1697         if( cur == first ) {
1698             trie->minlen = chars;
1699             trie->maxlen = chars;
1700         } else if (chars < trie->minlen) {
1701             trie->minlen = chars;
1702         } else if (chars > trie->maxlen) {
1703             trie->maxlen = chars;
1704         }
1705         if (OP( noper ) == EXACTFU_SS) {
1706             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1707             if (trie->minlen > 1)
1708                 trie->minlen= 1;
1709         }
1710         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1711             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1712              *                - We assume that any such sequence might match a 2 byte string */
1713             if (trie->minlen > 2 )
1714                 trie->minlen= 2;
1715         }
1716
1717     } /* end first pass */
1718     DEBUG_TRIE_COMPILE_r(
1719         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1720                 (int)depth * 2 + 2,"",
1721                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1722                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1723                 (int)trie->minlen, (int)trie->maxlen )
1724     );
1725
1726     /*
1727         We now know what we are dealing with in terms of unique chars and
1728         string sizes so we can calculate how much memory a naive
1729         representation using a flat table  will take. If it's over a reasonable
1730         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1731         conservative but potentially much slower representation using an array
1732         of lists.
1733
1734         At the end we convert both representations into the same compressed
1735         form that will be used in regexec.c for matching with. The latter
1736         is a form that cannot be used to construct with but has memory
1737         properties similar to the list form and access properties similar
1738         to the table form making it both suitable for fast searches and
1739         small enough that its feasable to store for the duration of a program.
1740
1741         See the comment in the code where the compressed table is produced
1742         inplace from the flat tabe representation for an explanation of how
1743         the compression works.
1744
1745     */
1746
1747
1748     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1749     prev_states[1] = 0;
1750
1751     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1752         /*
1753             Second Pass -- Array Of Lists Representation
1754
1755             Each state will be represented by a list of charid:state records
1756             (reg_trie_trans_le) the first such element holds the CUR and LEN
1757             points of the allocated array. (See defines above).
1758
1759             We build the initial structure using the lists, and then convert
1760             it into the compressed table form which allows faster lookups
1761             (but cant be modified once converted).
1762         */
1763
1764         STRLEN transcount = 1;
1765
1766         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1767             "%*sCompiling trie using list compiler\n",
1768             (int)depth * 2 + 2, ""));
1769
1770         trie->states = (reg_trie_state *)
1771             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772                                   sizeof(reg_trie_state) );
1773         TRIE_LIST_NEW(1);
1774         next_alloc = 2;
1775
1776         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777
1778             regnode *noper   = NEXTOPER( cur );
1779             U8 *uc           = (U8*)STRING( noper );
1780             const U8 *e      = uc + STR_LEN( noper );
1781             U32 state        = 1;         /* required init */
1782             U16 charid       = 0;         /* sanity init */
1783             U8 *scan         = (U8*)NULL; /* sanity init */
1784             STRLEN foldlen   = 0;         /* required init */
1785             U32 wordlen      = 0;         /* required init */
1786             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1787             STRLEN skiplen   = 0;
1788
1789             if (OP(noper) == NOTHING) {
1790                 regnode *noper_next= regnext(noper);
1791                 if (noper_next != tail && OP(noper_next) == flags) {
1792                     noper = noper_next;
1793                     uc= (U8*)STRING(noper);
1794                     e= uc + STR_LEN(noper);
1795                 }
1796             }
1797
1798             if (OP(noper) != NOTHING) {
1799                 for ( ; uc < e ; uc += len ) {
1800
1801                     TRIE_READ_CHAR;
1802
1803                     if ( uvc < 256 ) {
1804                         charid = trie->charmap[ uvc ];
1805                     } else {
1806                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1807                         if ( !svpp ) {
1808                             charid = 0;
1809                         } else {
1810                             charid=(U16)SvIV( *svpp );
1811                         }
1812                     }
1813                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1814                     if ( charid ) {
1815
1816                         U16 check;
1817                         U32 newstate = 0;
1818
1819                         charid--;
1820                         if ( !trie->states[ state ].trans.list ) {
1821                             TRIE_LIST_NEW( state );
1822                         }
1823                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1824                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1825                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1826                                 break;
1827                             }
1828                         }
1829                         if ( ! newstate ) {
1830                             newstate = next_alloc++;
1831                             prev_states[newstate] = state;
1832                             TRIE_LIST_PUSH( state, charid, newstate );
1833                             transcount++;
1834                         }
1835                         state = newstate;
1836                     } else {
1837                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1838                     }
1839                 }
1840             }
1841             TRIE_HANDLE_WORD(state);
1842
1843         } /* end second pass */
1844
1845         /* next alloc is the NEXT state to be allocated */
1846         trie->statecount = next_alloc;
1847         trie->states = (reg_trie_state *)
1848             PerlMemShared_realloc( trie->states,
1849                                    next_alloc
1850                                    * sizeof(reg_trie_state) );
1851
1852         /* and now dump it out before we compress it */
1853         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1854                                                          revcharmap, next_alloc,
1855                                                          depth+1)
1856         );
1857
1858         trie->trans = (reg_trie_trans *)
1859             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1860         {
1861             U32 state;
1862             U32 tp = 0;
1863             U32 zp = 0;
1864
1865
1866             for( state=1 ; state < next_alloc ; state ++ ) {
1867                 U32 base=0;
1868
1869                 /*
1870                 DEBUG_TRIE_COMPILE_MORE_r(
1871                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1872                 );
1873                 */
1874
1875                 if (trie->states[state].trans.list) {
1876                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1877                     U16 maxid=minid;
1878                     U16 idx;
1879
1880                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1881                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1882                         if ( forid < minid ) {
1883                             minid=forid;
1884                         } else if ( forid > maxid ) {
1885                             maxid=forid;
1886                         }
1887                     }
1888                     if ( transcount < tp + maxid - minid + 1) {
1889                         transcount *= 2;
1890                         trie->trans = (reg_trie_trans *)
1891                             PerlMemShared_realloc( trie->trans,
1892                                                      transcount
1893                                                      * sizeof(reg_trie_trans) );
1894                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1895                     }
1896                     base = trie->uniquecharcount + tp - minid;
1897                     if ( maxid == minid ) {
1898                         U32 set = 0;
1899                         for ( ; zp < tp ; zp++ ) {
1900                             if ( ! trie->trans[ zp ].next ) {
1901                                 base = trie->uniquecharcount + zp - minid;
1902                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1903                                 trie->trans[ zp ].check = state;
1904                                 set = 1;
1905                                 break;
1906                             }
1907                         }
1908                         if ( !set ) {
1909                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1910                             trie->trans[ tp ].check = state;
1911                             tp++;
1912                             zp = tp;
1913                         }
1914                     } else {
1915                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1916                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1917                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1918                             trie->trans[ tid ].check = state;
1919                         }
1920                         tp += ( maxid - minid + 1 );
1921                     }
1922                     Safefree(trie->states[ state ].trans.list);
1923                 }
1924                 /*
1925                 DEBUG_TRIE_COMPILE_MORE_r(
1926                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1927                 );
1928                 */
1929                 trie->states[ state ].trans.base=base;
1930             }
1931             trie->lasttrans = tp + 1;
1932         }
1933     } else {
1934         /*
1935            Second Pass -- Flat Table Representation.
1936
1937            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1938            We know that we will need Charcount+1 trans at most to store the data
1939            (one row per char at worst case) So we preallocate both structures
1940            assuming worst case.
1941
1942            We then construct the trie using only the .next slots of the entry
1943            structs.
1944
1945            We use the .check field of the first entry of the node temporarily to
1946            make compression both faster and easier by keeping track of how many non
1947            zero fields are in the node.
1948
1949            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1950            transition.
1951
1952            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1953            number representing the first entry of the node, and state as a
1954            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1955            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1956            are 2 entrys per node. eg:
1957
1958              A B       A B
1959           1. 2 4    1. 3 7
1960           2. 0 3    3. 0 5
1961           3. 0 0    5. 0 0
1962           4. 0 0    7. 0 0
1963
1964            The table is internally in the right hand, idx form. However as we also
1965            have to deal with the states array which is indexed by nodenum we have to
1966            use TRIE_NODENUM() to convert.
1967
1968         */
1969         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1970             "%*sCompiling trie using table compiler\n",
1971             (int)depth * 2 + 2, ""));
1972
1973         trie->trans = (reg_trie_trans *)
1974             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1975                                   * trie->uniquecharcount + 1,
1976                                   sizeof(reg_trie_trans) );
1977         trie->states = (reg_trie_state *)
1978             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1979                                   sizeof(reg_trie_state) );
1980         next_alloc = trie->uniquecharcount + 1;
1981
1982
1983         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1984
1985             regnode *noper   = NEXTOPER( cur );
1986             const U8 *uc     = (U8*)STRING( noper );
1987             const U8 *e      = uc + STR_LEN( noper );
1988
1989             U32 state        = 1;         /* required init */
1990
1991             U16 charid       = 0;         /* sanity init */
1992             U32 accept_state = 0;         /* sanity init */
1993             U8 *scan         = (U8*)NULL; /* sanity init */
1994
1995             STRLEN foldlen   = 0;         /* required init */
1996             U32 wordlen      = 0;         /* required init */
1997             STRLEN skiplen   = 0;
1998             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1999
2000             if (OP(noper) == NOTHING) {
2001                 regnode *noper_next= regnext(noper);
2002                 if (noper_next != tail && OP(noper_next) == flags) {
2003                     noper = noper_next;
2004                     uc= (U8*)STRING(noper);
2005                     e= uc + STR_LEN(noper);
2006                 }
2007             }
2008
2009             if ( OP(noper) != NOTHING ) {
2010                 for ( ; uc < e ; uc += len ) {
2011
2012                     TRIE_READ_CHAR;
2013
2014                     if ( uvc < 256 ) {
2015                         charid = trie->charmap[ uvc ];
2016                     } else {
2017                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2018                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2019                     }
2020                     if ( charid ) {
2021                         charid--;
2022                         if ( !trie->trans[ state + charid ].next ) {
2023                             trie->trans[ state + charid ].next = next_alloc;
2024                             trie->trans[ state ].check++;
2025                             prev_states[TRIE_NODENUM(next_alloc)]
2026                                     = TRIE_NODENUM(state);
2027                             next_alloc += trie->uniquecharcount;
2028                         }
2029                         state = trie->trans[ state + charid ].next;
2030                     } else {
2031                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2032                     }
2033                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2034                 }
2035             }
2036             accept_state = TRIE_NODENUM( state );
2037             TRIE_HANDLE_WORD(accept_state);
2038
2039         } /* end second pass */
2040
2041         /* and now dump it out before we compress it */
2042         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2043                                                           revcharmap,
2044                                                           next_alloc, depth+1));
2045
2046         {
2047         /*
2048            * Inplace compress the table.*
2049
2050            For sparse data sets the table constructed by the trie algorithm will
2051            be mostly 0/FAIL transitions or to put it another way mostly empty.
2052            (Note that leaf nodes will not contain any transitions.)
2053
2054            This algorithm compresses the tables by eliminating most such
2055            transitions, at the cost of a modest bit of extra work during lookup:
2056
2057            - Each states[] entry contains a .base field which indicates the
2058            index in the state[] array wheres its transition data is stored.
2059
2060            - If .base is 0 there are no valid transitions from that node.
2061
2062            - If .base is nonzero then charid is added to it to find an entry in
2063            the trans array.
2064
2065            -If trans[states[state].base+charid].check!=state then the
2066            transition is taken to be a 0/Fail transition. Thus if there are fail
2067            transitions at the front of the node then the .base offset will point
2068            somewhere inside the previous nodes data (or maybe even into a node
2069            even earlier), but the .check field determines if the transition is
2070            valid.
2071
2072            XXX - wrong maybe?
2073            The following process inplace converts the table to the compressed
2074            table: We first do not compress the root node 1,and mark all its
2075            .check pointers as 1 and set its .base pointer as 1 as well. This
2076            allows us to do a DFA construction from the compressed table later,
2077            and ensures that any .base pointers we calculate later are greater
2078            than 0.
2079
2080            - We set 'pos' to indicate the first entry of the second node.
2081
2082            - We then iterate over the columns of the node, finding the first and
2083            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2084            and set the .check pointers accordingly, and advance pos
2085            appropriately and repreat for the next node. Note that when we copy
2086            the next pointers we have to convert them from the original
2087            NODEIDX form to NODENUM form as the former is not valid post
2088            compression.
2089
2090            - If a node has no transitions used we mark its base as 0 and do not
2091            advance the pos pointer.
2092
2093            - If a node only has one transition we use a second pointer into the
2094            structure to fill in allocated fail transitions from other states.
2095            This pointer is independent of the main pointer and scans forward
2096            looking for null transitions that are allocated to a state. When it
2097            finds one it writes the single transition into the "hole".  If the
2098            pointer doesnt find one the single transition is appended as normal.
2099
2100            - Once compressed we can Renew/realloc the structures to release the
2101            excess space.
2102
2103            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2104            specifically Fig 3.47 and the associated pseudocode.
2105
2106            demq
2107         */
2108         const U32 laststate = TRIE_NODENUM( next_alloc );
2109         U32 state, charid;
2110         U32 pos = 0, zp=0;
2111         trie->statecount = laststate;
2112
2113         for ( state = 1 ; state < laststate ; state++ ) {
2114             U8 flag = 0;
2115             const U32 stateidx = TRIE_NODEIDX( state );
2116             const U32 o_used = trie->trans[ stateidx ].check;
2117             U32 used = trie->trans[ stateidx ].check;
2118             trie->trans[ stateidx ].check = 0;
2119
2120             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2121                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2122                     if ( trie->trans[ stateidx + charid ].next ) {
2123                         if (o_used == 1) {
2124                             for ( ; zp < pos ; zp++ ) {
2125                                 if ( ! trie->trans[ zp ].next ) {
2126                                     break;
2127                                 }
2128                             }
2129                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2130                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2131                             trie->trans[ zp ].check = state;
2132                             if ( ++zp > pos ) pos = zp;
2133                             break;
2134                         }
2135                         used--;
2136                     }
2137                     if ( !flag ) {
2138                         flag = 1;
2139                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2140                     }
2141                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2142                     trie->trans[ pos ].check = state;
2143                     pos++;
2144                 }
2145             }
2146         }
2147         trie->lasttrans = pos + 1;
2148         trie->states = (reg_trie_state *)
2149             PerlMemShared_realloc( trie->states, laststate
2150                                    * sizeof(reg_trie_state) );
2151         DEBUG_TRIE_COMPILE_MORE_r(
2152                 PerlIO_printf( Perl_debug_log,
2153                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2154                     (int)depth * 2 + 2,"",
2155                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2156                     (IV)next_alloc,
2157                     (IV)pos,
2158                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2159             );
2160
2161         } /* end table compress */
2162     }
2163     DEBUG_TRIE_COMPILE_MORE_r(
2164             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2165                 (int)depth * 2 + 2, "",
2166                 (UV)trie->statecount,
2167                 (UV)trie->lasttrans)
2168     );
2169     /* resize the trans array to remove unused space */
2170     trie->trans = (reg_trie_trans *)
2171         PerlMemShared_realloc( trie->trans, trie->lasttrans
2172                                * sizeof(reg_trie_trans) );
2173
2174     {   /* Modify the program and insert the new TRIE node */
2175         U8 nodetype =(U8)(flags & 0xFF);
2176         char *str=NULL;
2177
2178 #ifdef DEBUGGING
2179         regnode *optimize = NULL;
2180 #ifdef RE_TRACK_PATTERN_OFFSETS
2181
2182         U32 mjd_offset = 0;
2183         U32 mjd_nodelen = 0;
2184 #endif /* RE_TRACK_PATTERN_OFFSETS */
2185 #endif /* DEBUGGING */
2186         /*
2187            This means we convert either the first branch or the first Exact,
2188            depending on whether the thing following (in 'last') is a branch
2189            or not and whther first is the startbranch (ie is it a sub part of
2190            the alternation or is it the whole thing.)
2191            Assuming its a sub part we convert the EXACT otherwise we convert
2192            the whole branch sequence, including the first.
2193          */
2194         /* Find the node we are going to overwrite */
2195         if ( first != startbranch || OP( last ) == BRANCH ) {
2196             /* branch sub-chain */
2197             NEXT_OFF( first ) = (U16)(last - first);
2198 #ifdef RE_TRACK_PATTERN_OFFSETS
2199             DEBUG_r({
2200                 mjd_offset= Node_Offset((convert));
2201                 mjd_nodelen= Node_Length((convert));
2202             });
2203 #endif
2204             /* whole branch chain */
2205         }
2206 #ifdef RE_TRACK_PATTERN_OFFSETS
2207         else {
2208             DEBUG_r({
2209                 const  regnode *nop = NEXTOPER( convert );
2210                 mjd_offset= Node_Offset((nop));
2211                 mjd_nodelen= Node_Length((nop));
2212             });
2213         }
2214         DEBUG_OPTIMISE_r(
2215             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2216                 (int)depth * 2 + 2, "",
2217                 (UV)mjd_offset, (UV)mjd_nodelen)
2218         );
2219 #endif
2220         /* But first we check to see if there is a common prefix we can
2221            split out as an EXACT and put in front of the TRIE node.  */
2222         trie->startstate= 1;
2223         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2224             U32 state;
2225             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2226                 U32 ofs = 0;
2227                 I32 idx = -1;
2228                 U32 count = 0;
2229                 const U32 base = trie->states[ state ].trans.base;
2230
2231                 if ( trie->states[state].wordnum )
2232                         count = 1;
2233
2234                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2235                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2236                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2237                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2238                     {
2239                         if ( ++count > 1 ) {
2240                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2241                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2242                             if ( state == 1 ) break;
2243                             if ( count == 2 ) {
2244                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2245                                 DEBUG_OPTIMISE_r(
2246                                     PerlIO_printf(Perl_debug_log,
2247                                         "%*sNew Start State=%"UVuf" Class: [",
2248                                         (int)depth * 2 + 2, "",
2249                                         (UV)state));
2250                                 if (idx >= 0) {
2251                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2252                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2253
2254                                     TRIE_BITMAP_SET(trie,*ch);
2255                                     if ( folder )
2256                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2257                                     DEBUG_OPTIMISE_r(
2258                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2259                                     );
2260                                 }
2261                             }
2262                             TRIE_BITMAP_SET(trie,*ch);
2263                             if ( folder )
2264                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2265                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2266                         }
2267                         idx = ofs;
2268                     }
2269                 }
2270                 if ( count == 1 ) {
2271                     SV **tmp = av_fetch( revcharmap, idx, 0);
2272                     STRLEN len;
2273                     char *ch = SvPV( *tmp, len );
2274                     DEBUG_OPTIMISE_r({
2275                         SV *sv=sv_newmortal();
2276                         PerlIO_printf( Perl_debug_log,
2277                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2278                             (int)depth * 2 + 2, "",
2279                             (UV)state, (UV)idx,
2280                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2281                                 PL_colors[0], PL_colors[1],
2282                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2283                                 PERL_PV_ESCAPE_FIRSTCHAR
2284                             )
2285                         );
2286                     });
2287                     if ( state==1 ) {
2288                         OP( convert ) = nodetype;
2289                         str=STRING(convert);
2290                         STR_LEN(convert)=0;
2291                     }
2292                     STR_LEN(convert) += len;
2293                     while (len--)
2294                         *str++ = *ch++;
2295                 } else {
2296 #ifdef DEBUGGING
2297                     if (state>1)
2298                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2299 #endif
2300                     break;
2301                 }
2302             }
2303             trie->prefixlen = (state-1);
2304             if (str) {
2305                 regnode *n = convert+NODE_SZ_STR(convert);
2306                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2307                 trie->startstate = state;
2308                 trie->minlen -= (state - 1);
2309                 trie->maxlen -= (state - 1);
2310 #ifdef DEBUGGING
2311                /* At least the UNICOS C compiler choked on this
2312                 * being argument to DEBUG_r(), so let's just have
2313                 * it right here. */
2314                if (
2315 #ifdef PERL_EXT_RE_BUILD
2316                    1
2317 #else
2318                    DEBUG_r_TEST
2319 #endif
2320                    ) {
2321                    regnode *fix = convert;
2322                    U32 word = trie->wordcount;
2323                    mjd_nodelen++;
2324                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2325                    while( ++fix < n ) {
2326                        Set_Node_Offset_Length(fix, 0, 0);
2327                    }
2328                    while (word--) {
2329                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2330                        if (tmp) {
2331                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2332                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2333                            else
2334                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2335                        }
2336                    }
2337                }
2338 #endif
2339                 if (trie->maxlen) {
2340                     convert = n;
2341                 } else {
2342                     NEXT_OFF(convert) = (U16)(tail - convert);
2343                     DEBUG_r(optimize= n);
2344                 }
2345             }
2346         }
2347         if (!jumper)
2348             jumper = last;
2349         if ( trie->maxlen ) {
2350             NEXT_OFF( convert ) = (U16)(tail - convert);
2351             ARG_SET( convert, data_slot );
2352             /* Store the offset to the first unabsorbed branch in
2353                jump[0], which is otherwise unused by the jump logic.
2354                We use this when dumping a trie and during optimisation. */
2355             if (trie->jump)
2356                 trie->jump[0] = (U16)(nextbranch - convert);
2357
2358             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2359              *   and there is a bitmap
2360              *   and the first "jump target" node we found leaves enough room
2361              * then convert the TRIE node into a TRIEC node, with the bitmap
2362              * embedded inline in the opcode - this is hypothetically faster.
2363              */
2364             if ( !trie->states[trie->startstate].wordnum
2365                  && trie->bitmap
2366                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2367             {
2368                 OP( convert ) = TRIEC;
2369                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2370                 PerlMemShared_free(trie->bitmap);
2371                 trie->bitmap= NULL;
2372             } else
2373                 OP( convert ) = TRIE;
2374
2375             /* store the type in the flags */
2376             convert->flags = nodetype;
2377             DEBUG_r({
2378             optimize = convert
2379                       + NODE_STEP_REGNODE
2380                       + regarglen[ OP( convert ) ];
2381             });
2382             /* XXX We really should free up the resource in trie now,
2383                    as we won't use them - (which resources?) dmq */
2384         }
2385         /* needed for dumping*/
2386         DEBUG_r(if (optimize) {
2387             regnode *opt = convert;
2388
2389             while ( ++opt < optimize) {
2390                 Set_Node_Offset_Length(opt,0,0);
2391             }
2392             /*
2393                 Try to clean up some of the debris left after the
2394                 optimisation.
2395              */
2396             while( optimize < jumper ) {
2397                 mjd_nodelen += Node_Length((optimize));
2398                 OP( optimize ) = OPTIMIZED;
2399                 Set_Node_Offset_Length(optimize,0,0);
2400                 optimize++;
2401             }
2402             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2403         });
2404     } /* end node insert */
2405
2406     /*  Finish populating the prev field of the wordinfo array.  Walk back
2407      *  from each accept state until we find another accept state, and if
2408      *  so, point the first word's .prev field at the second word. If the
2409      *  second already has a .prev field set, stop now. This will be the
2410      *  case either if we've already processed that word's accept state,
2411      *  or that state had multiple words, and the overspill words were
2412      *  already linked up earlier.
2413      */
2414     {
2415         U16 word;
2416         U32 state;
2417         U16 prev;
2418
2419         for (word=1; word <= trie->wordcount; word++) {
2420             prev = 0;
2421             if (trie->wordinfo[word].prev)
2422                 continue;
2423             state = trie->wordinfo[word].accept;
2424             while (state) {
2425                 state = prev_states[state];
2426                 if (!state)
2427                     break;
2428                 prev = trie->states[state].wordnum;
2429                 if (prev)
2430                     break;
2431             }
2432             trie->wordinfo[word].prev = prev;
2433         }
2434         Safefree(prev_states);
2435     }
2436
2437
2438     /* and now dump out the compressed format */
2439     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2440
2441     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2442 #ifdef DEBUGGING
2443     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2444     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2445 #else
2446     SvREFCNT_dec(revcharmap);
2447 #endif
2448     return trie->jump
2449            ? MADE_JUMP_TRIE
2450            : trie->startstate>1
2451              ? MADE_EXACT_TRIE
2452              : MADE_TRIE;
2453 }
2454
2455 STATIC void
2456 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2457 {
2458 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2459
2460    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2461    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2462    ISBN 0-201-10088-6
2463
2464    We find the fail state for each state in the trie, this state is the longest proper
2465    suffix of the current state's 'word' that is also a proper prefix of another word in our
2466    trie. State 1 represents the word '' and is thus the default fail state. This allows
2467    the DFA not to have to restart after its tried and failed a word at a given point, it
2468    simply continues as though it had been matching the other word in the first place.
2469    Consider
2470       'abcdgu'=~/abcdefg|cdgu/
2471    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2472    fail, which would bring us to the state representing 'd' in the second word where we would
2473    try 'g' and succeed, proceeding to match 'cdgu'.
2474  */
2475  /* add a fail transition */
2476     const U32 trie_offset = ARG(source);
2477     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2478     U32 *q;
2479     const U32 ucharcount = trie->uniquecharcount;
2480     const U32 numstates = trie->statecount;
2481     const U32 ubound = trie->lasttrans + ucharcount;
2482     U32 q_read = 0;
2483     U32 q_write = 0;
2484     U32 charid;
2485     U32 base = trie->states[ 1 ].trans.base;
2486     U32 *fail;
2487     reg_ac_data *aho;
2488     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2489     GET_RE_DEBUG_FLAGS_DECL;
2490
2491     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2492 #ifndef DEBUGGING
2493     PERL_UNUSED_ARG(depth);
2494 #endif
2495
2496
2497     ARG_SET( stclass, data_slot );
2498     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2499     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2500     aho->trie=trie_offset;
2501     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2502     Copy( trie->states, aho->states, numstates, reg_trie_state );
2503     Newxz( q, numstates, U32);
2504     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2505     aho->refcount = 1;
2506     fail = aho->fail;
2507     /* initialize fail[0..1] to be 1 so that we always have
2508        a valid final fail state */
2509     fail[ 0 ] = fail[ 1 ] = 1;
2510
2511     for ( charid = 0; charid < ucharcount ; charid++ ) {
2512         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2513         if ( newstate ) {
2514             q[ q_write ] = newstate;
2515             /* set to point at the root */
2516             fail[ q[ q_write++ ] ]=1;
2517         }
2518     }
2519     while ( q_read < q_write) {
2520         const U32 cur = q[ q_read++ % numstates ];
2521         base = trie->states[ cur ].trans.base;
2522
2523         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2524             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2525             if (ch_state) {
2526                 U32 fail_state = cur;
2527                 U32 fail_base;
2528                 do {
2529                     fail_state = fail[ fail_state ];
2530                     fail_base = aho->states[ fail_state ].trans.base;
2531                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2532
2533                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2534                 fail[ ch_state ] = fail_state;
2535                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2536                 {
2537                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2538                 }
2539                 q[ q_write++ % numstates] = ch_state;
2540             }
2541         }
2542     }
2543     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2544        when we fail in state 1, this allows us to use the
2545        charclass scan to find a valid start char. This is based on the principle
2546        that theres a good chance the string being searched contains lots of stuff
2547        that cant be a start char.
2548      */
2549     fail[ 0 ] = fail[ 1 ] = 0;
2550     DEBUG_TRIE_COMPILE_r({
2551         PerlIO_printf(Perl_debug_log,
2552                       "%*sStclass Failtable (%"UVuf" states): 0",
2553                       (int)(depth * 2), "", (UV)numstates
2554         );
2555         for( q_read=1; q_read<numstates; q_read++ ) {
2556             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2557         }
2558         PerlIO_printf(Perl_debug_log, "\n");
2559     });
2560     Safefree(q);
2561     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2562 }
2563
2564
2565 /*
2566  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2567  * These need to be revisited when a newer toolchain becomes available.
2568  */
2569 #if defined(__sparc64__) && defined(__GNUC__)
2570 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2571 #       undef  SPARC64_GCC_WORKAROUND
2572 #       define SPARC64_GCC_WORKAROUND 1
2573 #   endif
2574 #endif
2575
2576 #define DEBUG_PEEP(str,scan,depth) \
2577     DEBUG_OPTIMISE_r({if (scan){ \
2578        SV * const mysv=sv_newmortal(); \
2579        regnode *Next = regnext(scan); \
2580        regprop(RExC_rx, mysv, scan); \
2581        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2582        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2583        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2584    }});
2585
2586
2587 /* The below joins as many adjacent EXACTish nodes as possible into a single
2588  * one.  The regop may be changed if the node(s) contain certain sequences that
2589  * require special handling.  The joining is only done if:
2590  * 1) there is room in the current conglomerated node to entirely contain the
2591  *    next one.
2592  * 2) they are the exact same node type
2593  *
2594  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2595  * these get optimized out
2596  *
2597  * If a node is to match under /i (folded), the number of characters it matches
2598  * can be different than its character length if it contains a multi-character
2599  * fold.  *min_subtract is set to the total delta of the input nodes.
2600  *
2601  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2602  * and contains LATIN SMALL LETTER SHARP S
2603  *
2604  * This is as good a place as any to discuss the design of handling these
2605  * multi-character fold sequences.  It's been wrong in Perl for a very long
2606  * time.  There are three code points in Unicode whose multi-character folds
2607  * were long ago discovered to mess things up.  The previous designs for
2608  * dealing with these involved assigning a special node for them.  This
2609  * approach doesn't work, as evidenced by this example:
2610  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2611  * Both these fold to "sss", but if the pattern is parsed to create a node that
2612  * would match just the \xDF, it won't be able to handle the case where a
2613  * successful match would have to cross the node's boundary.  The new approach
2614  * that hopefully generally solves the problem generates an EXACTFU_SS node
2615  * that is "sss".
2616  *
2617  * It turns out that there are problems with all multi-character folds, and not
2618  * just these three.  Now the code is general, for all such cases, but the
2619  * three still have some special handling.  The approach taken is:
2620  * 1)   This routine examines each EXACTFish node that could contain multi-
2621  *      character fold sequences.  It returns in *min_subtract how much to
2622  *      subtract from the the actual length of the string to get a real minimum
2623  *      match length; it is 0 if there are no multi-char folds.  This delta is
2624  *      used by the caller to adjust the min length of the match, and the delta
2625  *      between min and max, so that the optimizer doesn't reject these
2626  *      possibilities based on size constraints.
2627  * 2)   Certain of these sequences require special handling by the trie code,
2628  *      so, if found, this code changes the joined node type to special ops:
2629  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2630  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2631  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2632  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2633  *      there is a possible fold length change.  That means that a regular
2634  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2635  *      with length changes, and so can be processed faster.  regexec.c takes
2636  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2637  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2638  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2639  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2640  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2641  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2642  *      possibilities for the non-UTF8 patterns are quite simple, except for
2643  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2644  *      members of a fold-pair, and arrays are set up for all of them so that
2645  *      the other member of the pair can be found quickly.  Code elsewhere in
2646  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2647  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2648  *      described in the next item.
2649  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2650  *      'ss' or not is not knowable at compile time.  It will match iff the
2651  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2652  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2653  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2654  *      described in item 3).  An assumption that the optimizer part of
2655  *      regexec.c (probably unwittingly) makes is that a character in the
2656  *      pattern corresponds to at most a single character in the target string.
2657  *      (And I do mean character, and not byte here, unlike other parts of the
2658  *      documentation that have never been updated to account for multibyte
2659  *      Unicode.)  This assumption is wrong only in this case, as all other
2660  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2661  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2662  *      reluctant to try to change this assumption, so instead the code punts.
2663  *      This routine examines EXACTF nodes for the sharp s, and returns a
2664  *      boolean indicating whether or not the node is an EXACTF node that
2665  *      contains a sharp s.  When it is true, the caller sets a flag that later
2666  *      causes the optimizer in this file to not set values for the floating
2667  *      and fixed string lengths, and thus avoids the optimizer code in
2668  *      regexec.c that makes the invalid assumption.  Thus, there is no
2669  *      optimization based on string lengths for EXACTF nodes that contain the
2670  *      sharp s.  This only happens for /id rules (which means the pattern
2671  *      isn't in UTF-8).
2672  */
2673
2674 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2675     if (PL_regkind[OP(scan)] == EXACT) \
2676         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2677
2678 STATIC U32
2679 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2680     /* Merge several consecutive EXACTish nodes into one. */
2681     regnode *n = regnext(scan);
2682     U32 stringok = 1;
2683     regnode *next = scan + NODE_SZ_STR(scan);
2684     U32 merged = 0;
2685     U32 stopnow = 0;
2686 #ifdef DEBUGGING
2687     regnode *stop = scan;
2688     GET_RE_DEBUG_FLAGS_DECL;
2689 #else
2690     PERL_UNUSED_ARG(depth);
2691 #endif
2692
2693     PERL_ARGS_ASSERT_JOIN_EXACT;
2694 #ifndef EXPERIMENTAL_INPLACESCAN
2695     PERL_UNUSED_ARG(flags);
2696     PERL_UNUSED_ARG(val);
2697 #endif
2698     DEBUG_PEEP("join",scan,depth);
2699
2700     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2701      * EXACT ones that are mergeable to the current one. */
2702     while (n
2703            && (PL_regkind[OP(n)] == NOTHING
2704                || (stringok && OP(n) == OP(scan)))
2705            && NEXT_OFF(n)
2706            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2707     {
2708
2709         if (OP(n) == TAIL || n > next)
2710             stringok = 0;
2711         if (PL_regkind[OP(n)] == NOTHING) {
2712             DEBUG_PEEP("skip:",n,depth);
2713             NEXT_OFF(scan) += NEXT_OFF(n);
2714             next = n + NODE_STEP_REGNODE;
2715 #ifdef DEBUGGING
2716             if (stringok)
2717                 stop = n;
2718 #endif
2719             n = regnext(n);
2720         }
2721         else if (stringok) {
2722             const unsigned int oldl = STR_LEN(scan);
2723             regnode * const nnext = regnext(n);
2724
2725             /* XXX I (khw) kind of doubt that this works on platforms where
2726              * U8_MAX is above 255 because of lots of other assumptions */
2727             if (oldl + STR_LEN(n) > U8_MAX)
2728                 break;
2729
2730             DEBUG_PEEP("merg",n,depth);
2731             merged++;
2732
2733             NEXT_OFF(scan) += NEXT_OFF(n);
2734             STR_LEN(scan) += STR_LEN(n);
2735             next = n + NODE_SZ_STR(n);
2736             /* Now we can overwrite *n : */
2737             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2738 #ifdef DEBUGGING
2739             stop = next - 1;
2740 #endif
2741             n = nnext;
2742             if (stopnow) break;
2743         }
2744
2745 #ifdef EXPERIMENTAL_INPLACESCAN
2746         if (flags && !NEXT_OFF(n)) {
2747             DEBUG_PEEP("atch", val, depth);
2748             if (reg_off_by_arg[OP(n)]) {
2749                 ARG_SET(n, val - n);
2750             }
2751             else {
2752                 NEXT_OFF(n) = val - n;
2753             }
2754             stopnow = 1;
2755         }
2756 #endif
2757     }
2758
2759     *min_subtract = 0;
2760     *has_exactf_sharp_s = FALSE;
2761
2762     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2763      * can now analyze for sequences of problematic code points.  (Prior to
2764      * this final joining, sequences could have been split over boundaries, and
2765      * hence missed).  The sequences only happen in folding, hence for any
2766      * non-EXACT EXACTish node */
2767     if (OP(scan) != EXACT) {
2768         const U8 * const s0 = (U8*) STRING(scan);
2769         const U8 * s = s0;
2770         const U8 * const s_end = s0 + STR_LEN(scan);
2771
2772         /* One pass is made over the node's string looking for all the
2773          * possibilities.  to avoid some tests in the loop, there are two main
2774          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2775          * non-UTF-8 */
2776         if (UTF) {
2777
2778             /* Examine the string for a multi-character fold sequence.  UTF-8
2779              * patterns have all characters pre-folded by the time this code is
2780              * executed */
2781             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2782                                      length sequence we are looking for is 2 */
2783             {
2784                 int count = 0;
2785                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2786                 if (! len) {    /* Not a multi-char fold: get next char */
2787                     s += UTF8SKIP(s);
2788                     continue;
2789                 }
2790
2791                 /* Nodes with 'ss' require special handling, except for EXACTFL
2792                  * and EXACTFA for which there is no multi-char fold to this */
2793                 if (len == 2 && *s == 's' && *(s+1) == 's'
2794                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2795                 {
2796                     count = 2;
2797                     OP(scan) = EXACTFU_SS;
2798                     s += 2;
2799                 }
2800                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2801                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2802                                       COMBINING_DIAERESIS_UTF8
2803                                       COMBINING_ACUTE_ACCENT_UTF8,
2804                                    6)
2805                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2806                                          COMBINING_DIAERESIS_UTF8
2807                                          COMBINING_ACUTE_ACCENT_UTF8,
2808                                      6)))
2809                 {
2810                     count = 3;
2811
2812                     /* These two folds require special handling by trie's, so
2813                      * change the node type to indicate this.  If EXACTFA and
2814                      * EXACTFL were ever to be handled by trie's, this would
2815                      * have to be changed.  If this node has already been
2816                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2817                      * (khw) think it doesn't matter in regexec.c for UTF
2818                      * patterns, but no need to change it */
2819                     if (OP(scan) == EXACTFU) {
2820                         OP(scan) = EXACTFU_TRICKYFOLD;
2821                     }
2822                     s += 6;
2823                 }
2824                 else { /* Here is a generic multi-char fold. */
2825                     const U8* multi_end  = s + len;
2826
2827                     /* Count how many characters in it.  In the case of /l and
2828                      * /aa, no folds which contain ASCII code points are
2829                      * allowed, so check for those, and skip if found.  (In
2830                      * EXACTFL, no folds are allowed to any Latin1 code point,
2831                      * not just ASCII.  But there aren't any of these
2832                      * currently, nor ever likely, so don't take the time to
2833                      * test for them.  The code that generates the
2834                      * is_MULTI_foo() macros croaks should one actually get put
2835                      * into Unicode .) */
2836                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2837                         count = utf8_length(s, multi_end);
2838                         s = multi_end;
2839                     }
2840                     else {
2841                         while (s < multi_end) {
2842                             if (isASCII(*s)) {
2843                                 s++;
2844                                 goto next_iteration;
2845                             }
2846                             else {
2847                                 s += UTF8SKIP(s);
2848                             }
2849                             count++;
2850                         }
2851                     }
2852                 }
2853
2854                 /* The delta is how long the sequence is minus 1 (1 is how long
2855                  * the character that folds to the sequence is) */
2856                 *min_subtract += count - 1;
2857             next_iteration: ;
2858             }
2859         }
2860         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2861
2862             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2863              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2864              * nodes can't have multi-char folds to this range (and there are
2865              * no existing ones in the upper latin1 range).  In the EXACTF
2866              * case we look also for the sharp s, which can be in the final
2867              * position.  Otherwise we can stop looking 1 byte earlier because
2868              * have to find at least two characters for a multi-fold */
2869             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2870
2871             /* The below is perhaps overboard, but this allows us to save a
2872              * test each time through the loop at the expense of a mask.  This
2873              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2874              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2875              * are 64.  This uses an exclusive 'or' to find that bit and then
2876              * inverts it to form a mask, with just a single 0, in the bit
2877              * position where 'S' and 's' differ. */
2878             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2879             const U8 s_masked = 's' & S_or_s_mask;
2880
2881             while (s < upper) {
2882                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2883                 if (! len) {    /* Not a multi-char fold. */
2884                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2885                     {
2886                         *has_exactf_sharp_s = TRUE;
2887                     }
2888                     s++;
2889                     continue;
2890                 }
2891
2892                 if (len == 2
2893                     && ((*s & S_or_s_mask) == s_masked)
2894                     && ((*(s+1) & S_or_s_mask) == s_masked))
2895                 {
2896
2897                     /* EXACTF nodes need to know that the minimum length
2898                      * changed so that a sharp s in the string can match this
2899                      * ss in the pattern, but they remain EXACTF nodes, as they
2900                      * won't match this unless the target string is is UTF-8,
2901                      * which we don't know until runtime */
2902                     if (OP(scan) != EXACTF) {
2903                         OP(scan) = EXACTFU_SS;
2904                     }
2905                 }
2906
2907                 *min_subtract += len - 1;
2908                 s += len;
2909             }
2910         }
2911     }
2912
2913 #ifdef DEBUGGING
2914     /* Allow dumping but overwriting the collection of skipped
2915      * ops and/or strings with fake optimized ops */
2916     n = scan + NODE_SZ_STR(scan);
2917     while (n <= stop) {
2918         OP(n) = OPTIMIZED;
2919         FLAGS(n) = 0;
2920         NEXT_OFF(n) = 0;
2921         n++;
2922     }
2923 #endif
2924     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2925     return stopnow;
2926 }
2927
2928 /* REx optimizer.  Converts nodes into quicker variants "in place".
2929    Finds fixed substrings.  */
2930
2931 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2932    to the position after last scanned or to NULL. */
2933
2934 #define INIT_AND_WITHP \
2935     assert(!and_withp); \
2936     Newx(and_withp,1,struct regnode_charclass_class); \
2937     SAVEFREEPV(and_withp)
2938
2939 /* this is a chain of data about sub patterns we are processing that
2940    need to be handled separately/specially in study_chunk. Its so
2941    we can simulate recursion without losing state.  */
2942 struct scan_frame;
2943 typedef struct scan_frame {
2944     regnode *last;  /* last node to process in this frame */
2945     regnode *next;  /* next node to process when last is reached */
2946     struct scan_frame *prev; /*previous frame*/
2947     I32 stop; /* what stopparen do we use */
2948 } scan_frame;
2949
2950
2951 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2952
2953 #define CASE_SYNST_FNC(nAmE)                                       \
2954 case nAmE:                                                         \
2955     if (flags & SCF_DO_STCLASS_AND) {                              \
2956             for (value = 0; value < 256; value++)                  \
2957                 if (!is_ ## nAmE ## _cp(value))                       \
2958                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2959     }                                                              \
2960     else {                                                         \
2961             for (value = 0; value < 256; value++)                  \
2962                 if (is_ ## nAmE ## _cp(value))                        \
2963                     ANYOF_BITMAP_SET(data->start_class, value);    \
2964     }                                                              \
2965     break;                                                         \
2966 case N ## nAmE:                                                    \
2967     if (flags & SCF_DO_STCLASS_AND) {                              \
2968             for (value = 0; value < 256; value++)                   \
2969                 if (is_ ## nAmE ## _cp(value))                         \
2970                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2971     }                                                               \
2972     else {                                                          \
2973             for (value = 0; value < 256; value++)                   \
2974                 if (!is_ ## nAmE ## _cp(value))                        \
2975                     ANYOF_BITMAP_SET(data->start_class, value);     \
2976     }                                                               \
2977     break
2978
2979
2980
2981 STATIC I32
2982 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2983                         I32 *minlenp, I32 *deltap,
2984                         regnode *last,
2985                         scan_data_t *data,
2986                         I32 stopparen,
2987                         U8* recursed,
2988                         struct regnode_charclass_class *and_withp,
2989                         U32 flags, U32 depth)
2990                         /* scanp: Start here (read-write). */
2991                         /* deltap: Write maxlen-minlen here. */
2992                         /* last: Stop before this one. */
2993                         /* data: string data about the pattern */
2994                         /* stopparen: treat close N as END */
2995                         /* recursed: which subroutines have we recursed into */
2996                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2997 {
2998     dVAR;
2999     I32 min = 0;    /* There must be at least this number of characters to match */
3000     I32 pars = 0, code;
3001     regnode *scan = *scanp, *next;
3002     I32 delta = 0;
3003     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3004     int is_inf_internal = 0;            /* The studied chunk is infinite */
3005     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3006     scan_data_t data_fake;
3007     SV *re_trie_maxbuff = NULL;
3008     regnode *first_non_open = scan;
3009     I32 stopmin = I32_MAX;
3010     scan_frame *frame = NULL;
3011     GET_RE_DEBUG_FLAGS_DECL;
3012
3013     PERL_ARGS_ASSERT_STUDY_CHUNK;
3014
3015 #ifdef DEBUGGING
3016     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3017 #endif
3018
3019     if ( depth == 0 ) {
3020         while (first_non_open && OP(first_non_open) == OPEN)
3021             first_non_open=regnext(first_non_open);
3022     }
3023
3024
3025   fake_study_recurse:
3026     while ( scan && OP(scan) != END && scan < last ){
3027         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3028                                    node length to get a real minimum (because
3029                                    the folded version may be shorter) */
3030         bool has_exactf_sharp_s = FALSE;
3031         /* Peephole optimizer: */
3032         DEBUG_STUDYDATA("Peep:", data,depth);
3033         DEBUG_PEEP("Peep",scan,depth);
3034
3035         /* Its not clear to khw or hv why this is done here, and not in the
3036          * clauses that deal with EXACT nodes.  khw's guess is that it's
3037          * because of a previous design */
3038         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3039
3040         /* Follow the next-chain of the current node and optimize
3041            away all the NOTHINGs from it.  */
3042         if (OP(scan) != CURLYX) {
3043             const int max = (reg_off_by_arg[OP(scan)]
3044                        ? I32_MAX
3045                        /* I32 may be smaller than U16 on CRAYs! */
3046                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3047             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3048             int noff;
3049             regnode *n = scan;
3050
3051             /* Skip NOTHING and LONGJMP. */
3052             while ((n = regnext(n))
3053                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3054                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3055                    && off + noff < max)
3056                 off += noff;
3057             if (reg_off_by_arg[OP(scan)])
3058                 ARG(scan) = off;
3059             else
3060                 NEXT_OFF(scan) = off;
3061         }
3062
3063
3064
3065         /* The principal pseudo-switch.  Cannot be a switch, since we
3066            look into several different things.  */
3067         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3068                    || OP(scan) == IFTHEN) {
3069             next = regnext(scan);
3070             code = OP(scan);
3071             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3072
3073             if (OP(next) == code || code == IFTHEN) {
3074                 /* NOTE - There is similar code to this block below for handling
3075                    TRIE nodes on a re-study.  If you change stuff here check there
3076                    too. */
3077                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3078                 struct regnode_charclass_class accum;
3079                 regnode * const startbranch=scan;
3080
3081                 if (flags & SCF_DO_SUBSTR)
3082                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3083                 if (flags & SCF_DO_STCLASS)
3084                     cl_init_zero(pRExC_state, &accum);
3085
3086                 while (OP(scan) == code) {
3087                     I32 deltanext, minnext, f = 0, fake;
3088                     struct regnode_charclass_class this_class;
3089
3090                     num++;
3091                     data_fake.flags = 0;
3092                     if (data) {
3093                         data_fake.whilem_c = data->whilem_c;
3094                         data_fake.last_closep = data->last_closep;
3095                     }
3096                     else
3097                         data_fake.last_closep = &fake;
3098
3099                     data_fake.pos_delta = delta;
3100                     next = regnext(scan);
3101                     scan = NEXTOPER(scan);
3102                     if (code != BRANCH)
3103                         scan = NEXTOPER(scan);
3104                     if (flags & SCF_DO_STCLASS) {
3105                         cl_init(pRExC_state, &this_class);
3106                         data_fake.start_class = &this_class;
3107                         f = SCF_DO_STCLASS_AND;
3108                     }
3109                     if (flags & SCF_WHILEM_VISITED_POS)
3110                         f |= SCF_WHILEM_VISITED_POS;
3111
3112                     /* we suppose the run is continuous, last=next...*/
3113                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3114                                           next, &data_fake,
3115                                           stopparen, recursed, NULL, f,depth+1);
3116                     if (min1 > minnext)
3117                         min1 = minnext;
3118                     if (max1 < minnext + deltanext)
3119                         max1 = minnext + deltanext;
3120                     if (deltanext == I32_MAX)
3121                         is_inf = is_inf_internal = 1;
3122                     scan = next;
3123                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3124                         pars++;
3125                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3126                         if ( stopmin > minnext)
3127                             stopmin = min + min1;
3128                         flags &= ~SCF_DO_SUBSTR;
3129                         if (data)
3130                             data->flags |= SCF_SEEN_ACCEPT;
3131                     }
3132                     if (data) {
3133                         if (data_fake.flags & SF_HAS_EVAL)
3134                             data->flags |= SF_HAS_EVAL;
3135                         data->whilem_c = data_fake.whilem_c;
3136                     }
3137                     if (flags & SCF_DO_STCLASS)
3138                         cl_or(pRExC_state, &accum, &this_class);
3139                 }
3140                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3141                     min1 = 0;
3142                 if (flags & SCF_DO_SUBSTR) {
3143                     data->pos_min += min1;
3144                     data->pos_delta += max1 - min1;
3145                     if (max1 != min1 || is_inf)
3146                         data->longest = &(data->longest_float);
3147                 }
3148                 min += min1;
3149                 delta += max1 - min1;
3150                 if (flags & SCF_DO_STCLASS_OR) {
3151                     cl_or(pRExC_state, data->start_class, &accum);
3152                     if (min1) {
3153                         cl_and(data->start_class, and_withp);
3154                         flags &= ~SCF_DO_STCLASS;
3155                     }
3156                 }
3157                 else if (flags & SCF_DO_STCLASS_AND) {
3158                     if (min1) {
3159                         cl_and(data->start_class, &accum);
3160                         flags &= ~SCF_DO_STCLASS;
3161                     }
3162                     else {
3163                         /* Switch to OR mode: cache the old value of
3164                          * data->start_class */
3165                         INIT_AND_WITHP;
3166                         StructCopy(data->start_class, and_withp,
3167                                    struct regnode_charclass_class);
3168                         flags &= ~SCF_DO_STCLASS_AND;
3169                         StructCopy(&accum, data->start_class,
3170                                    struct regnode_charclass_class);
3171                         flags |= SCF_DO_STCLASS_OR;
3172                         data->start_class->flags |= ANYOF_EOS;
3173                     }
3174                 }
3175
3176                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3177                 /* demq.
3178
3179                    Assuming this was/is a branch we are dealing with: 'scan' now
3180                    points at the item that follows the branch sequence, whatever
3181                    it is. We now start at the beginning of the sequence and look
3182                    for subsequences of
3183
3184                    BRANCH->EXACT=>x1
3185                    BRANCH->EXACT=>x2
3186                    tail
3187
3188                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3189
3190                    If we can find such a subsequence we need to turn the first
3191                    element into a trie and then add the subsequent branch exact
3192                    strings to the trie.
3193
3194                    We have two cases
3195
3196                      1. patterns where the whole set of branches can be converted.
3197
3198                      2. patterns where only a subset can be converted.
3199
3200                    In case 1 we can replace the whole set with a single regop
3201                    for the trie. In case 2 we need to keep the start and end
3202                    branches so
3203
3204                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3205                      becomes BRANCH TRIE; BRANCH X;
3206
3207                   There is an additional case, that being where there is a
3208                   common prefix, which gets split out into an EXACT like node
3209                   preceding the TRIE node.
3210
3211                   If x(1..n)==tail then we can do a simple trie, if not we make
3212                   a "jump" trie, such that when we match the appropriate word
3213                   we "jump" to the appropriate tail node. Essentially we turn
3214                   a nested if into a case structure of sorts.
3215
3216                 */
3217
3218                     int made=0;
3219                     if (!re_trie_maxbuff) {
3220                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3221                         if (!SvIOK(re_trie_maxbuff))
3222                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3223                     }
3224                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3225                         regnode *cur;
3226                         regnode *first = (regnode *)NULL;
3227                         regnode *last = (regnode *)NULL;
3228                         regnode *tail = scan;
3229                         U8 trietype = 0;
3230                         U32 count=0;
3231
3232 #ifdef DEBUGGING
3233                         SV * const mysv = sv_newmortal();       /* for dumping */
3234 #endif
3235                         /* var tail is used because there may be a TAIL
3236                            regop in the way. Ie, the exacts will point to the
3237                            thing following the TAIL, but the last branch will
3238                            point at the TAIL. So we advance tail. If we
3239                            have nested (?:) we may have to move through several
3240                            tails.
3241                          */
3242
3243                         while ( OP( tail ) == TAIL ) {
3244                             /* this is the TAIL generated by (?:) */
3245                             tail = regnext( tail );
3246                         }
3247
3248
3249                         DEBUG_TRIE_COMPILE_r({
3250                             regprop(RExC_rx, mysv, tail );
3251                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3252                                 (int)depth * 2 + 2, "",
3253                                 "Looking for TRIE'able sequences. Tail node is: ",
3254                                 SvPV_nolen_const( mysv )
3255                             );
3256                         });
3257
3258                         /*
3259
3260                             Step through the branches
3261                                 cur represents each branch,
3262                                 noper is the first thing to be matched as part of that branch
3263                                 noper_next is the regnext() of that node.
3264
3265                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3266                             via a "jump trie" but we also support building with NOJUMPTRIE,
3267                             which restricts the trie logic to structures like /FOO|BAR/.
3268
3269                             If noper is a trieable nodetype then the branch is a possible optimization
3270                             target. If we are building under NOJUMPTRIE then we require that noper_next
3271                             is the same as scan (our current position in the regex program).
3272
3273                             Once we have two or more consecutive such branches we can create a
3274                             trie of the EXACT's contents and stitch it in place into the program.
3275
3276                             If the sequence represents all of the branches in the alternation we
3277                             replace the entire thing with a single TRIE node.
3278
3279                             Otherwise when it is a subsequence we need to stitch it in place and
3280                             replace only the relevant branches. This means the first branch has
3281                             to remain as it is used by the alternation logic, and its next pointer,
3282                             and needs to be repointed at the item on the branch chain following
3283                             the last branch we have optimized away.
3284
3285                             This could be either a BRANCH, in which case the subsequence is internal,
3286                             or it could be the item following the branch sequence in which case the
3287                             subsequence is at the end (which does not necessarily mean the first node
3288                             is the start of the alternation).
3289
3290                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3291
3292                                 optype          |  trietype
3293                                 ----------------+-----------
3294                                 NOTHING         | NOTHING
3295                                 EXACT           | EXACT
3296                                 EXACTFU         | EXACTFU
3297                                 EXACTFU_SS      | EXACTFU
3298                                 EXACTFU_TRICKYFOLD | EXACTFU
3299                                 EXACTFA         | 0
3300
3301
3302                         */
3303 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3304                        ( EXACT == (X) )   ? EXACT :        \
3305                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3306                        0 )
3307
3308                         /* dont use tail as the end marker for this traverse */
3309                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3310                             regnode * const noper = NEXTOPER( cur );
3311                             U8 noper_type = OP( noper );
3312                             U8 noper_trietype = TRIE_TYPE( noper_type );
3313 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3314                             regnode * const noper_next = regnext( noper );
3315                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3316                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3317 #endif
3318
3319                             DEBUG_TRIE_COMPILE_r({
3320                                 regprop(RExC_rx, mysv, cur);
3321                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3322                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3323
3324                                 regprop(RExC_rx, mysv, noper);
3325                                 PerlIO_printf( Perl_debug_log, " -> %s",
3326                                     SvPV_nolen_const(mysv));
3327
3328                                 if ( noper_next ) {
3329                                   regprop(RExC_rx, mysv, noper_next );
3330                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3331                                     SvPV_nolen_const(mysv));
3332                                 }
3333                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3334                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3335                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3336                                 );
3337                             });
3338
3339                             /* Is noper a trieable nodetype that can be merged with the
3340                              * current trie (if there is one)? */
3341                             if ( noper_trietype
3342                                   &&
3343                                   (
3344                                         ( noper_trietype == NOTHING)
3345                                         || ( trietype == NOTHING )
3346                                         || ( trietype == noper_trietype )
3347                                   )
3348 #ifdef NOJUMPTRIE
3349                                   && noper_next == tail
3350 #endif
3351                                   && count < U16_MAX)
3352                             {
3353                                 /* Handle mergable triable node
3354                                  * Either we are the first node in a new trieable sequence,
3355                                  * in which case we do some bookkeeping, otherwise we update
3356                                  * the end pointer. */
3357                                 if ( !first ) {
3358                                     first = cur;
3359                                     if ( noper_trietype == NOTHING ) {
3360 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3361                                         regnode * const noper_next = regnext( noper );
3362                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3363                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3364 #endif
3365
3366                                         if ( noper_next_trietype ) {
3367                                             trietype = noper_next_trietype;
3368                                         } else if (noper_next_type)  {
3369                                             /* a NOTHING regop is 1 regop wide. We need at least two
3370                                              * for a trie so we can't merge this in */
3371                                             first = NULL;
3372                                         }
3373                                     } else {
3374                                         trietype = noper_trietype;
3375                                     }
3376                                 } else {
3377                                     if ( trietype == NOTHING )
3378                                         trietype = noper_trietype;
3379                                     last = cur;
3380                                 }
3381                                 if (first)
3382                                     count++;
3383                             } /* end handle mergable triable node */
3384                             else {
3385                                 /* handle unmergable node -
3386                                  * noper may either be a triable node which can not be tried
3387                                  * together with the current trie, or a non triable node */
3388                                 if ( last ) {
3389                                     /* If last is set and trietype is not NOTHING then we have found
3390                                      * at least two triable branch sequences in a row of a similar
3391                                      * trietype so we can turn them into a trie. If/when we
3392                                      * allow NOTHING to start a trie sequence this condition will be
3393                                      * required, and it isn't expensive so we leave it in for now. */
3394                                     if ( trietype && trietype != NOTHING )
3395                                         make_trie( pRExC_state,
3396                                                 startbranch, first, cur, tail, count,
3397                                                 trietype, depth+1 );
3398                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3399                                 }
3400                                 if ( noper_trietype
3401 #ifdef NOJUMPTRIE
3402                                      && noper_next == tail
3403 #endif
3404                                 ){
3405                                     /* noper is triable, so we can start a new trie sequence */
3406                                     count = 1;
3407                                     first = cur;
3408                                     trietype = noper_trietype;
3409                                 } else if (first) {
3410                                     /* if we already saw a first but the current node is not triable then we have
3411                                      * to reset the first information. */
3412                                     count = 0;
3413                                     first = NULL;
3414                                     trietype = 0;
3415                                 }
3416                             } /* end handle unmergable node */
3417                         } /* loop over branches */
3418                         DEBUG_TRIE_COMPILE_r({
3419                             regprop(RExC_rx, mysv, cur);
3420                             PerlIO_printf( Perl_debug_log,
3421                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3422                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3423
3424                         });
3425                         if ( last && trietype ) {
3426                             if ( trietype != NOTHING ) {
3427                                 /* the last branch of the sequence was part of a trie,
3428                                  * so we have to construct it here outside of the loop
3429                                  */
3430                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3431 #ifdef TRIE_STUDY_OPT
3432                                 if ( ((made == MADE_EXACT_TRIE &&
3433                                      startbranch == first)
3434                                      || ( first_non_open == first )) &&
3435                                      depth==0 ) {
3436                                     flags |= SCF_TRIE_RESTUDY;
3437                                     if ( startbranch == first
3438                                          && scan == tail )
3439                                     {
3440                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3441                                     }
3442                                 }
3443 #endif
3444                             } else {
3445                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3446                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3447                                  */
3448                                 if ( startbranch == first ) {
3449                                     regnode *opt;
3450                                     /* the entire thing is a NOTHING sequence, something like this:
3451                                      * (?:|) So we can turn it into a plain NOTHING op. */
3452                                     DEBUG_TRIE_COMPILE_r({
3453                                         regprop(RExC_rx, mysv, cur);
3454                                         PerlIO_printf( Perl_debug_log,
3455                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3456                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3457
3458                                     });
3459                                     OP(startbranch)= NOTHING;
3460                                     NEXT_OFF(startbranch)= tail - startbranch;
3461                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3462                                         OP(opt)= OPTIMIZED;
3463                                 }
3464                             }
3465                         } /* end if ( last) */
3466                     } /* TRIE_MAXBUF is non zero */
3467
3468                 } /* do trie */
3469
3470             }
3471             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3472                 scan = NEXTOPER(NEXTOPER(scan));
3473             } else                      /* single branch is optimized. */
3474                 scan = NEXTOPER(scan);
3475             continue;
3476         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3477             scan_frame *newframe = NULL;
3478             I32 paren;
3479             regnode *start;
3480             regnode *end;
3481
3482             if (OP(scan) != SUSPEND) {
3483             /* set the pointer */
3484                 if (OP(scan) == GOSUB) {
3485                     paren = ARG(scan);
3486                     RExC_recurse[ARG2L(scan)] = scan;
3487                     start = RExC_open_parens[paren-1];
3488                     end   = RExC_close_parens[paren-1];
3489                 } else {
3490                     paren = 0;
3491                     start = RExC_rxi->program + 1;
3492                     end   = RExC_opend;
3493                 }
3494                 if (!recursed) {
3495                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3496                     SAVEFREEPV(recursed);
3497                 }
3498                 if (!PAREN_TEST(recursed,paren+1)) {
3499                     PAREN_SET(recursed,paren+1);
3500                     Newx(newframe,1,scan_frame);
3501                 } else {
3502                     if (flags & SCF_DO_SUBSTR) {
3503                         SCAN_COMMIT(pRExC_state,data,minlenp);
3504                         data->longest = &(data->longest_float);
3505                     }
3506                     is_inf = is_inf_internal = 1;
3507                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3508                         cl_anything(pRExC_state, data->start_class);
3509                     flags &= ~SCF_DO_STCLASS;
3510                 }
3511             } else {
3512                 Newx(newframe,1,scan_frame);
3513                 paren = stopparen;
3514                 start = scan+2;
3515                 end = regnext(scan);
3516             }
3517             if (newframe) {
3518                 assert(start);
3519                 assert(end);
3520                 SAVEFREEPV(newframe);
3521                 newframe->next = regnext(scan);
3522                 newframe->last = last;
3523                 newframe->stop = stopparen;
3524                 newframe->prev = frame;
3525
3526                 frame = newframe;
3527                 scan =  start;
3528                 stopparen = paren;
3529                 last = end;
3530
3531                 continue;
3532             }
3533         }
3534         else if (OP(scan) == EXACT) {
3535             I32 l = STR_LEN(scan);
3536             UV uc;
3537             if (UTF) {
3538                 const U8 * const s = (U8*)STRING(scan);
3539                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3540                 l = utf8_length(s, s + l);
3541             } else {
3542                 uc = *((U8*)STRING(scan));
3543             }
3544             min += l;
3545             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3546                 /* The code below prefers earlier match for fixed
3547                    offset, later match for variable offset.  */
3548                 if (data->last_end == -1) { /* Update the start info. */
3549                     data->last_start_min = data->pos_min;
3550                     data->last_start_max = is_inf
3551                         ? I32_MAX : data->pos_min + data->pos_delta;
3552                 }
3553                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3554                 if (UTF)
3555                     SvUTF8_on(data->last_found);
3556                 {
3557                     SV * const sv = data->last_found;
3558                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3559                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3560                     if (mg && mg->mg_len >= 0)
3561                         mg->mg_len += utf8_length((U8*)STRING(scan),
3562                                                   (U8*)STRING(scan)+STR_LEN(scan));
3563                 }
3564                 data->last_end = data->pos_min + l;
3565                 data->pos_min += l; /* As in the first entry. */
3566                 data->flags &= ~SF_BEFORE_EOL;
3567             }
3568             if (flags & SCF_DO_STCLASS_AND) {
3569                 /* Check whether it is compatible with what we know already! */
3570                 int compat = 1;
3571
3572
3573                 /* If compatible, we or it in below.  It is compatible if is
3574                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3575                  * it's for a locale.  Even if there isn't unicode semantics
3576                  * here, at runtime there may be because of matching against a
3577                  * utf8 string, so accept a possible false positive for
3578                  * latin1-range folds */
3579                 if (uc >= 0x100 ||
3580                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3581                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3582                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3583                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3584                     )
3585                 {
3586                     compat = 0;
3587                 }
3588                 ANYOF_CLASS_ZERO(data->start_class);
3589                 ANYOF_BITMAP_ZERO(data->start_class);
3590                 if (compat)
3591                     ANYOF_BITMAP_SET(data->start_class, uc);
3592                 else if (uc >= 0x100) {
3593                     int i;
3594
3595                     /* Some Unicode code points fold to the Latin1 range; as
3596                      * XXX temporary code, instead of figuring out if this is
3597                      * one, just assume it is and set all the start class bits
3598                      * that could be some such above 255 code point's fold
3599                      * which will generate fals positives.  As the code
3600                      * elsewhere that does compute the fold settles down, it
3601                      * can be extracted out and re-used here */
3602                     for (i = 0; i < 256; i++){
3603                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3604                             ANYOF_BITMAP_SET(data->start_class, i);
3605                         }
3606                     }
3607                 }
3608                 data->start_class->flags &= ~ANYOF_EOS;
3609                 if (uc < 0x100)
3610                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3611             }
3612             else if (flags & SCF_DO_STCLASS_OR) {
3613                 /* false positive possible if the class is case-folded */
3614                 if (uc < 0x100)
3615                     ANYOF_BITMAP_SET(data->start_class, uc);
3616                 else
3617                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3618                 data->start_class->flags &= ~ANYOF_EOS;
3619                 cl_and(data->start_class, and_withp);
3620             }
3621             flags &= ~SCF_DO_STCLASS;
3622         }
3623         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3624             I32 l = STR_LEN(scan);
3625             UV uc = *((U8*)STRING(scan));
3626
3627             /* Search for fixed substrings supports EXACT only. */
3628             if (flags & SCF_DO_SUBSTR) {
3629                 assert(data);
3630                 SCAN_COMMIT(pRExC_state, data, minlenp);
3631             }
3632             if (UTF) {
3633                 const U8 * const s = (U8 *)STRING(scan);
3634                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3635                 l = utf8_length(s, s + l);
3636             }
3637             if (has_exactf_sharp_s) {
3638                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3639             }
3640             min += l - min_subtract;
3641             assert (min >= 0);
3642             delta += min_subtract;
3643             if (flags & SCF_DO_SUBSTR) {
3644                 data->pos_min += l - min_subtract;
3645                 if (data->pos_min < 0) {
3646                     data->pos_min = 0;
3647                 }
3648                 data->pos_delta += min_subtract;
3649                 if (min_subtract) {
3650                     data->longest = &(data->longest_float);
3651                 }
3652             }
3653             if (flags & SCF_DO_STCLASS_AND) {
3654                 /* Check whether it is compatible with what we know already! */
3655                 int compat = 1;
3656                 if (uc >= 0x100 ||
3657                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3658                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3659                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3660                 {
3661                     compat = 0;
3662                 }
3663                 ANYOF_CLASS_ZERO(data->start_class);
3664                 ANYOF_BITMAP_ZERO(data->start_class);
3665                 if (compat) {
3666                     ANYOF_BITMAP_SET(data->start_class, uc);
3667                     data->start_class->flags &= ~ANYOF_EOS;
3668                     if (OP(scan) == EXACTFL) {
3669                         /* XXX This set is probably no longer necessary, and
3670                          * probably wrong as LOCALE now is on in the initial
3671                          * state */
3672                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3673                     }
3674                     else {
3675
3676                         /* Also set the other member of the fold pair.  In case
3677                          * that unicode semantics is called for at runtime, use
3678                          * the full latin1 fold.  (Can't do this for locale,
3679                          * because not known until runtime) */
3680                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3681
3682                         /* All other (EXACTFL handled above) folds except under
3683                          * /iaa that include s, S, and sharp_s also may include
3684                          * the others */
3685                         if (OP(scan) != EXACTFA) {
3686                             if (uc == 's' || uc == 'S') {
3687                                 ANYOF_BITMAP_SET(data->start_class,
3688                                                  LATIN_SMALL_LETTER_SHARP_S);
3689                             }
3690                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3691                                 ANYOF_BITMAP_SET(data->start_class, 's');
3692                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3693                             }
3694                         }
3695                     }
3696                 }
3697                 else if (uc >= 0x100) {
3698                     int i;
3699                     for (i = 0; i < 256; i++){
3700                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3701                             ANYOF_BITMAP_SET(data->start_class, i);
3702                         }
3703                     }
3704                 }
3705             }
3706             else if (flags & SCF_DO_STCLASS_OR) {
3707                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3708                     /* false positive possible if the class is case-folded.
3709                        Assume that the locale settings are the same... */
3710                     if (uc < 0x100) {
3711                         ANYOF_BITMAP_SET(data->start_class, uc);
3712                         if (OP(scan) != EXACTFL) {
3713
3714                             /* And set the other member of the fold pair, but
3715                              * can't do that in locale because not known until
3716                              * run-time */
3717                             ANYOF_BITMAP_SET(data->start_class,
3718                                              PL_fold_latin1[uc]);
3719
3720                             /* All folds except under /iaa that include s, S,
3721                              * and sharp_s also may include the others */
3722                             if (OP(scan) != EXACTFA) {
3723                                 if (uc == 's' || uc == 'S') {
3724                                     ANYOF_BITMAP_SET(data->start_class,
3725                                                    LATIN_SMALL_LETTER_SHARP_S);
3726                                 }
3727                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3728                                     ANYOF_BITMAP_SET(data->start_class, 's');
3729                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3730                                 }
3731                             }
3732                         }
3733                     }
3734                     data->start_class->flags &= ~ANYOF_EOS;
3735                 }
3736                 cl_and(data->start_class, and_withp);
3737             }
3738             flags &= ~SCF_DO_STCLASS;
3739         }
3740         else if (REGNODE_VARIES(OP(scan))) {
3741             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3742             I32 f = flags, pos_before = 0;
3743             regnode * const oscan = scan;
3744             struct regnode_charclass_class this_class;
3745             struct regnode_charclass_class *oclass = NULL;
3746             I32 next_is_eval = 0;
3747
3748             switch (PL_regkind[OP(scan)]) {
3749             case WHILEM:                /* End of (?:...)* . */
3750                 scan = NEXTOPER(scan);
3751                 goto finish;
3752             case PLUS:
3753                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3754                     next = NEXTOPER(scan);
3755                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3756                         mincount = 1;
3757                         maxcount = REG_INFTY;
3758                         next = regnext(scan);
3759                         scan = NEXTOPER(scan);
3760                         goto do_curly;
3761                     }
3762                 }
3763                 if (flags & SCF_DO_SUBSTR)
3764                     data->pos_min++;
3765                 min++;
3766                 /* Fall through. */
3767             case STAR:
3768                 if (flags & SCF_DO_STCLASS) {
3769                     mincount = 0;
3770                     maxcount = REG_INFTY;
3771                     next = regnext(scan);
3772                     scan = NEXTOPER(scan);
3773                     goto do_curly;
3774                 }
3775                 is_inf = is_inf_internal = 1;
3776                 scan = regnext(scan);
3777                 if (flags & SCF_DO_SUBSTR) {
3778                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3779                     data->longest = &(data->longest_float);
3780                 }
3781                 goto optimize_curly_tail;
3782             case CURLY:
3783                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3784                     && (scan->flags == stopparen))
3785                 {
3786                     mincount = 1;
3787                     maxcount = 1;
3788                 } else {
3789                     mincount = ARG1(scan);
3790                     maxcount = ARG2(scan);
3791                 }
3792                 next = regnext(scan);
3793                 if (OP(scan) == CURLYX) {
3794                     I32 lp = (data ? *(data->last_closep) : 0);
3795                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3796                 }
3797                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3798                 next_is_eval = (OP(scan) == EVAL);
3799               do_curly:
3800                 if (flags & SCF_DO_SUBSTR) {
3801                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3802                     pos_before = data->pos_min;
3803                 }
3804                 if (data) {
3805                     fl = data->flags;
3806                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3807                     if (is_inf)
3808                         data->flags |= SF_IS_INF;
3809                 }
3810                 if (flags & SCF_DO_STCLASS) {
3811                     cl_init(pRExC_state, &this_class);
3812                     oclass = data->start_class;
3813                     data->start_class = &this_class;
3814                     f |= SCF_DO_STCLASS_AND;
3815                     f &= ~SCF_DO_STCLASS_OR;
3816                 }
3817                 /* Exclude from super-linear cache processing any {n,m}
3818                    regops for which the combination of input pos and regex
3819                    pos is not enough information to determine if a match
3820                    will be possible.
3821
3822                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3823                    regex pos at the \s*, the prospects for a match depend not
3824                    only on the input position but also on how many (bar\s*)
3825                    repeats into the {4,8} we are. */
3826                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3827                     f &= ~SCF_WHILEM_VISITED_POS;
3828
3829                 /* This will finish on WHILEM, setting scan, or on NULL: */
3830                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3831                                       last, data, stopparen, recursed, NULL,
3832                                       (mincount == 0
3833                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3834
3835                 if (flags & SCF_DO_STCLASS)
3836                     data->start_class = oclass;
3837                 if (mincount == 0 || minnext == 0) {
3838                     if (flags & SCF_DO_STCLASS_OR) {
3839                         cl_or(pRExC_state, data->start_class, &this_class);
3840                     }
3841                     else if (flags & SCF_DO_STCLASS_AND) {
3842                         /* Switch to OR mode: cache the old value of
3843                          * data->start_class */
3844                         INIT_AND_WITHP;
3845                         StructCopy(data->start_class, and_withp,
3846                                    struct regnode_charclass_class);
3847                         flags &= ~SCF_DO_STCLASS_AND;
3848                         StructCopy(&this_class, data->start_class,
3849                                    struct regnode_charclass_class);
3850                         flags |= SCF_DO_STCLASS_OR;
3851                         data->start_class->flags |= ANYOF_EOS;
3852                     }
3853                 } else {                /* Non-zero len */
3854                     if (flags & SCF_DO_STCLASS_OR) {
3855                         cl_or(pRExC_state, data->start_class, &this_class);
3856                         cl_and(data->start_class, and_withp);
3857                     }
3858                     else if (flags & SCF_DO_STCLASS_AND)
3859                         cl_and(data->start_class, &this_class);
3860                     flags &= ~SCF_DO_STCLASS;
3861                 }
3862                 if (!scan)              /* It was not CURLYX, but CURLY. */
3863                     scan = next;
3864                 if ( /* ? quantifier ok, except for (?{ ... }) */
3865                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3866                     && (minnext == 0) && (deltanext == 0)
3867                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3868                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3869                 {
3870                     /* Fatal warnings may leak the regexp without this: */
3871                     SAVEFREESV(RExC_rx_sv);
3872                     ckWARNreg(RExC_parse,
3873                               "Quantifier unexpected on zero-length expression");
3874                     ReREFCNT_inc(RExC_rx_sv);
3875                 }
3876
3877                 min += minnext * mincount;
3878                 is_inf_internal |= ((maxcount == REG_INFTY
3879                                      && (minnext + deltanext) > 0)
3880                                     || deltanext == I32_MAX);
3881                 is_inf |= is_inf_internal;
3882                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3883
3884                 /* Try powerful optimization CURLYX => CURLYN. */
3885                 if (  OP(oscan) == CURLYX && data
3886                       && data->flags & SF_IN_PAR
3887                       && !(data->flags & SF_HAS_EVAL)
3888                       && !deltanext && minnext == 1 ) {
3889                     /* Try to optimize to CURLYN.  */
3890                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3891                     regnode * const nxt1 = nxt;
3892 #ifdef DEBUGGING
3893                     regnode *nxt2;
3894 #endif
3895
3896                     /* Skip open. */
3897                     nxt = regnext(nxt);
3898                     if (!REGNODE_SIMPLE(OP(nxt))
3899                         && !(PL_regkind[OP(nxt)] == EXACT
3900                              && STR_LEN(nxt) == 1))
3901                         goto nogo;
3902 #ifdef DEBUGGING
3903                     nxt2 = nxt;
3904 #endif
3905                     nxt = regnext(nxt);
3906                     if (OP(nxt) != CLOSE)
3907                         goto nogo;
3908                     if (RExC_open_parens) {
3909                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3910                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3911                     }
3912                     /* Now we know that nxt2 is the only contents: */
3913                     oscan->flags = (U8)ARG(nxt);
3914                     OP(oscan) = CURLYN;
3915                     OP(nxt1) = NOTHING; /* was OPEN. */
3916
3917 #ifdef DEBUGGING
3918                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3919                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3920                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3921                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3922                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3923                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3924 #endif
3925                 }
3926               nogo:
3927
3928                 /* Try optimization CURLYX => CURLYM. */
3929                 if (  OP(oscan) == CURLYX && data
3930                       && !(data->flags & SF_HAS_PAR)
3931                       && !(data->flags & SF_HAS_EVAL)
3932                       && !deltanext     /* atom is fixed width */
3933                       && minnext != 0   /* CURLYM can't handle zero width */
3934                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3935                 ) {
3936                     /* XXXX How to optimize if data == 0? */
3937                     /* Optimize to a simpler form.  */
3938                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3939                     regnode *nxt2;
3940
3941                     OP(oscan) = CURLYM;
3942                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3943                             && (OP(nxt2) != WHILEM))
3944                         nxt = nxt2;
3945                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3946                     /* Need to optimize away parenths. */
3947                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3948                         /* Set the parenth number.  */
3949                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3950
3951                         oscan->flags = (U8)ARG(nxt);
3952                         if (RExC_open_parens) {
3953                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3954                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3955                         }
3956                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3957                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3958
3959 #ifdef DEBUGGING
3960                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3961                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3962                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3963                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3964 #endif
3965 #if 0
3966                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3967                             regnode *nnxt = regnext(nxt1);
3968                             if (nnxt == nxt) {
3969                                 if (reg_off_by_arg[OP(nxt1)])
3970                                     ARG_SET(nxt1, nxt2 - nxt1);
3971                                 else if (nxt2 - nxt1 < U16_MAX)
3972                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3973                                 else
3974                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3975                             }
3976                             nxt1 = nnxt;
3977                         }
3978 #endif
3979                         /* Optimize again: */
3980                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3981                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3982                     }
3983                     else
3984                         oscan->flags = 0;
3985                 }
3986                 else if ((OP(oscan) == CURLYX)
3987                          && (flags & SCF_WHILEM_VISITED_POS)
3988                          /* See the comment on a similar expression above.
3989                             However, this time it's not a subexpression
3990                             we care about, but the expression itself. */
3991                          && (maxcount == REG_INFTY)
3992                          && data && ++data->whilem_c < 16) {
3993                     /* This stays as CURLYX, we can put the count/of pair. */
3994                     /* Find WHILEM (as in regexec.c) */
3995                     regnode *nxt = oscan + NEXT_OFF(oscan);
3996
3997                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3998                         nxt += ARG(nxt);
3999                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4000                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4001                 }
4002                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4003                     pars++;
4004                 if (flags & SCF_DO_SUBSTR) {
4005                     SV *last_str = NULL;
4006                     int counted = mincount != 0;
4007
4008                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4009 #if defined(SPARC64_GCC_WORKAROUND)
4010                         I32 b = 0;
4011                         STRLEN l = 0;
4012                         const char *s = NULL;
4013                         I32 old = 0;
4014
4015                         if (pos_before >= data->last_start_min)
4016                             b = pos_before;
4017                         else
4018                             b = data->last_start_min;
4019
4020                         l = 0;
4021                         s = SvPV_const(data->last_found, l);
4022                         old = b - data->last_start_min;
4023
4024 #else
4025                         I32 b = pos_before >= data->last_start_min
4026                             ? pos_before : data->last_start_min;
4027                         STRLEN l;
4028                         const char * const s = SvPV_const(data->last_found, l);
4029                         I32 old = b - data->last_start_min;
4030 #endif
4031
4032                         if (UTF)
4033                             old = utf8_hop((U8*)s, old) - (U8*)s;
4034                         l -= old;
4035                         /* Get the added string: */
4036                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4037                         if (deltanext == 0 && pos_before == b) {
4038                             /* What was added is a constant string */
4039                             if (mincount > 1) {
4040                                 SvGROW(last_str, (mincount * l) + 1);
4041                                 repeatcpy(SvPVX(last_str) + l,
4042                                           SvPVX_const(last_str), l, mincount - 1);
4043                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4044                                 /* Add additional parts. */
4045                                 SvCUR_set(data->last_found,
4046                                           SvCUR(data->last_found) - l);
4047                                 sv_catsv(data->last_found, last_str);
4048                                 {
4049                                     SV * sv = data->last_found;
4050                                     MAGIC *mg =
4051                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4052                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4053                                     if (mg && mg->mg_len >= 0)
4054                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4055                                 }
4056                                 data->last_end += l * (mincount - 1);
4057                             }
4058                         } else {
4059                             /* start offset must point into the last copy */
4060                             data->last_start_min += minnext * (mincount - 1);
4061                             data->last_start_max += is_inf ? I32_MAX
4062                                 : (maxcount - 1) * (minnext + data->pos_delta);
4063                         }
4064                     }
4065                     /* It is counted once already... */
4066                     data->pos_min += minnext * (mincount - counted);
4067                     data->pos_delta += - counted * deltanext +
4068                         (minnext + deltanext) * maxcount - minnext * mincount;
4069                     if (mincount != maxcount) {
4070                          /* Cannot extend fixed substrings found inside
4071                             the group.  */
4072                         SCAN_COMMIT(pRExC_state,data,minlenp);
4073                         if (mincount && last_str) {
4074                             SV * const sv = data->last_found;
4075                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4076                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4077
4078                             if (mg)
4079                                 mg->mg_len = -1;
4080                             sv_setsv(sv, last_str);
4081                             data->last_end = data->pos_min;
4082                             data->last_start_min =
4083                                 data->pos_min - CHR_SVLEN(last_str);
4084                             data->last_start_max = is_inf
4085                                 ? I32_MAX
4086                                 : data->pos_min + data->pos_delta
4087                                 - CHR_SVLEN(last_str);
4088                         }
4089                         data->longest = &(data->longest_float);
4090                     }
4091                     SvREFCNT_dec(last_str);
4092                 }
4093                 if (data && (fl & SF_HAS_EVAL))
4094                     data->flags |= SF_HAS_EVAL;
4095               optimize_curly_tail:
4096                 if (OP(oscan) != CURLYX) {
4097                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4098                            && NEXT_OFF(next))
4099                         NEXT_OFF(oscan) += NEXT_OFF(next);
4100                 }
4101                 continue;
4102             default:                    /* REF, ANYOFV, and CLUMP only? */
4103                 if (flags & SCF_DO_SUBSTR) {
4104                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4105                     data->longest = &(data->longest_float);
4106                 }
4107                 is_inf = is_inf_internal = 1;
4108                 if (flags & SCF_DO_STCLASS_OR)
4109                     cl_anything(pRExC_state, data->start_class);
4110                 flags &= ~SCF_DO_STCLASS;
4111                 break;
4112             }
4113         }
4114         else if (OP(scan) == LNBREAK) {
4115             if (flags & SCF_DO_STCLASS) {
4116                 int value = 0;
4117                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4118                 if (flags & SCF_DO_STCLASS_AND) {
4119                     for (value = 0; value < 256; value++)
4120                         if (!is_VERTWS_cp(value))
4121                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4122                 }
4123                 else {
4124                     for (value = 0; value < 256; value++)
4125                         if (is_VERTWS_cp(value))
4126                             ANYOF_BITMAP_SET(data->start_class, value);
4127                 }
4128                 if (flags & SCF_DO_STCLASS_OR)
4129                     cl_and(data->start_class, and_withp);
4130                 flags &= ~SCF_DO_STCLASS;
4131             }
4132             min++;
4133             delta++;    /* Because of the 2 char string cr-lf */
4134             if (flags & SCF_DO_SUBSTR) {
4135                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4136                 data->pos_min += 1;
4137                 data->pos_delta += 1;
4138                 data->longest = &(data->longest_float);
4139             }
4140         }
4141         else if (REGNODE_SIMPLE(OP(scan))) {
4142             int value = 0;
4143
4144             if (flags & SCF_DO_SUBSTR) {
4145                 SCAN_COMMIT(pRExC_state,data,minlenp);
4146                 data->pos_min++;
4147             }
4148             min++;
4149             if (flags & SCF_DO_STCLASS) {
4150                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4151
4152                 /* Some of the logic below assumes that switching
4153                    locale on will only add false positives. */
4154                 switch (PL_regkind[OP(scan)]) {
4155                 case SANY:
4156                 default:
4157                   do_default:
4158                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4159                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4160                         cl_anything(pRExC_state, data->start_class);
4161                     break;
4162                 case REG_ANY:
4163                     if (OP(scan) == SANY)
4164                         goto do_default;
4165                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4166                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4167                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4168                         cl_anything(pRExC_state, data->start_class);
4169                     }
4170                     if (flags & SCF_DO_STCLASS_AND || !value)
4171                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4172                     break;
4173                 case ANYOF:
4174                     if (flags & SCF_DO_STCLASS_AND)
4175                         cl_and(data->start_class,
4176                                (struct regnode_charclass_class*)scan);
4177                     else
4178                         cl_or(pRExC_state, data->start_class,
4179                               (struct regnode_charclass_class*)scan);
4180                     break;
4181                 case ALNUM:
4182                     if (flags & SCF_DO_STCLASS_AND) {
4183                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4184                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4185                             if (OP(scan) == ALNUMU) {
4186                                 for (value = 0; value < 256; value++) {
4187                                     if (!isWORDCHAR_L1(value)) {
4188                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4189                                     }
4190                                 }
4191                             } else {
4192                                 for (value = 0; value < 256; value++) {
4193                                     if (!isALNUM(value)) {
4194                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4195                                     }
4196                                 }
4197                             }
4198                         }
4199                     }
4200                     else {
4201                         if (data->start_class->flags & ANYOF_LOCALE)
4202                             ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4203
4204                         /* Even if under locale, set the bits for non-locale
4205                          * in case it isn't a true locale-node.  This will
4206                          * create false positives if it truly is locale */
4207                         if (OP(scan) == ALNUMU) {
4208                             for (value = 0; value < 256; value++) {
4209                                 if (isWORDCHAR_L1(value)) {
4210                                     ANYOF_BITMAP_SET(data->start_class, value);
4211                                 }
4212                             }
4213                         } else {
4214                             for (value = 0; value < 256; value++) {
4215                                 if (isALNUM(value)) {
4216                                     ANYOF_BITMAP_SET(data->start_class, value);
4217                                 }
4218                             }
4219                         }
4220                     }
4221                     break;
4222                 case NALNUM:
4223                     if (flags & SCF_DO_STCLASS_AND) {
4224                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4225                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4226                             if (OP(scan) == NALNUMU) {
4227                                 for (value = 0; value < 256; value++) {
4228                                     if (isWORDCHAR_L1(value)) {
4229                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4230                                     }
4231                                 }
4232                             } else {
4233                                 for (value = 0; value < 256; value++) {
4234                                     if (isALNUM(value)) {
4235                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4236                                     }
4237                                 }
4238                             }
4239                         }
4240                     }
4241                     else {
4242                         if (data->start_class->flags & ANYOF_LOCALE)
4243                             ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4244
4245                         /* Even if under locale, set the bits for non-locale in
4246                          * case it isn't a true locale-node.  This will create
4247                          * false positives if it truly is locale */
4248                         if (OP(scan) == NALNUMU) {
4249                             for (value = 0; value < 256; value++) {
4250                                 if (! isWORDCHAR_L1(value)) {
4251                                     ANYOF_BITMAP_SET(data->start_class, value);
4252                                 }
4253                             }
4254                         } else {
4255                             for (value = 0; value < 256; value++) {
4256                                 if (! isALNUM(value)) {
4257                                     ANYOF_BITMAP_SET(data->start_class, value);
4258                                 }
4259                             }
4260                         }
4261                     }
4262                     break;
4263                 case SPACE:
4264                     if (flags & SCF_DO_STCLASS_AND) {
4265                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4266                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4267                             if (OP(scan) == SPACEU) {
4268                                 for (value = 0; value < 256; value++) {
4269                                     if (!isSPACE_L1(value)) {
4270                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4271                                     }
4272                                 }
4273                             } else {
4274                                 for (value = 0; value < 256; value++) {
4275                                     if (!isSPACE(value)) {
4276                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4277                                     }
4278                                 }
4279                             }
4280                         }
4281                     }
4282                     else {
4283                         if (data->start_class->flags & ANYOF_LOCALE) {
4284                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4285                         }
4286                         if (OP(scan) == SPACEU) {
4287                             for (value = 0; value < 256; value++) {
4288                                 if (isSPACE_L1(value)) {
4289                                     ANYOF_BITMAP_SET(data->start_class, value);
4290                                 }
4291                             }
4292                         } else {
4293                             for (value = 0; value < 256; value++) {
4294                                 if (isSPACE(value)) {
4295                                     ANYOF_BITMAP_SET(data->start_class, value);
4296                                 }
4297                             }
4298                         }
4299                     }
4300                     break;
4301                 case NSPACE:
4302                     if (flags & SCF_DO_STCLASS_AND) {
4303                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4304                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4305                             if (OP(scan) == NSPACEU) {
4306                                 for (value = 0; value < 256; value++) {
4307                                     if (isSPACE_L1(value)) {
4308                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4309                                     }
4310                                 }
4311                             } else {
4312                                 for (value = 0; value < 256; value++) {
4313                                     if (isSPACE(value)) {
4314                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4315                                     }
4316                                 }
4317                             }
4318                         }
4319                     }
4320                     else {
4321                         if (data->start_class->flags & ANYOF_LOCALE)
4322                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4323                         if (OP(scan) == NSPACEU) {
4324                             for (value = 0; value < 256; value++) {
4325                                 if (!isSPACE_L1(value)) {
4326                                     ANYOF_BITMAP_SET(data->start_class, value);
4327                                 }
4328                             }
4329                         }
4330                         else {
4331                             for (value = 0; value < 256; value++) {
4332                                 if (!isSPACE(value)) {
4333                                     ANYOF_BITMAP_SET(data->start_class, value);
4334                                 }
4335                             }
4336                         }
4337                     }
4338                     break;
4339                 case DIGIT:
4340                     if (flags & SCF_DO_STCLASS_AND) {
4341                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4342                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4343                             for (value = 0; value < 256; value++)
4344                                 if (!isDIGIT(value))
4345                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4346                         }
4347                     }
4348                     else {
4349                         if (data->start_class->flags & ANYOF_LOCALE)
4350                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4351                         for (value = 0; value < 256; value++)
4352                             if (isDIGIT(value))
4353                                 ANYOF_BITMAP_SET(data->start_class, value);
4354                     }
4355                     break;
4356                 case NDIGIT:
4357                     if (flags & SCF_DO_STCLASS_AND) {
4358                         if (!(data->start_class->flags & ANYOF_LOCALE))
4359                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4360                         for (value = 0; value < 256; value++)
4361                             if (isDIGIT(value))
4362                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4363                     }
4364                     else {
4365                         if (data->start_class->flags & ANYOF_LOCALE)
4366                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4367                         for (value = 0; value < 256; value++)
4368                             if (!isDIGIT(value))
4369                                 ANYOF_BITMAP_SET(data->start_class, value);
4370                     }
4371                     break;
4372                 CASE_SYNST_FNC(VERTWS);
4373                 CASE_SYNST_FNC(HORIZWS);
4374
4375                 }
4376                 if (flags & SCF_DO_STCLASS_OR)
4377                     cl_and(data->start_class, and_withp);
4378                 flags &= ~SCF_DO_STCLASS;
4379             }
4380         }
4381         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4382             data->flags |= (OP(scan) == MEOL
4383                             ? SF_BEFORE_MEOL
4384                             : SF_BEFORE_SEOL);
4385             SCAN_COMMIT(pRExC_state, data, minlenp);
4386
4387         }
4388         else if (  PL_regkind[OP(scan)] == BRANCHJ
4389                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4390                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4391                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4392             if ( OP(scan) == UNLESSM &&
4393                  scan->flags == 0 &&
4394                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4395                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4396             ) {
4397                 regnode *opt;
4398                 regnode *upto= regnext(scan);
4399                 DEBUG_PARSE_r({
4400                     SV * const mysv_val=sv_newmortal();
4401                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4402
4403                     /*DEBUG_PARSE_MSG("opfail");*/
4404                     regprop(RExC_rx, mysv_val, upto);
4405                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4406                                   SvPV_nolen_const(mysv_val),
4407                                   (IV)REG_NODE_NUM(upto),
4408                                   (IV)(upto - scan)
4409                     );
4410                 });
4411                 OP(scan) = OPFAIL;
4412                 NEXT_OFF(scan) = upto - scan;
4413                 for (opt= scan + 1; opt < upto ; opt++)
4414                     OP(opt) = OPTIMIZED;
4415                 scan= upto;
4416                 continue;
4417             }
4418             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4419                 || OP(scan) == UNLESSM )
4420             {
4421                 /* Negative Lookahead/lookbehind
4422                    In this case we can't do fixed string optimisation.
4423                 */
4424
4425                 I32 deltanext, minnext, fake = 0;
4426                 regnode *nscan;
4427                 struct regnode_charclass_class intrnl;
4428                 int f = 0;
4429
4430                 data_fake.flags = 0;
4431                 if (data) {
4432                     data_fake.whilem_c = data->whilem_c;
4433                     data_fake.last_closep = data->last_closep;
4434                 }
4435                 else
4436                     data_fake.last_closep = &fake;
4437                 data_fake.pos_delta = delta;
4438                 if ( flags & SCF_DO_STCLASS && !scan->flags
4439                      && OP(scan) == IFMATCH ) { /* Lookahead */
4440                     cl_init(pRExC_state, &intrnl);
4441                     data_fake.start_class = &intrnl;
4442                     f |= SCF_DO_STCLASS_AND;
4443                 }
4444                 if (flags & SCF_WHILEM_VISITED_POS)
4445                     f |= SCF_WHILEM_VISITED_POS;
4446                 next = regnext(scan);
4447                 nscan = NEXTOPER(NEXTOPER(scan));
4448                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4449                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4450                 if (scan->flags) {
4451                     if (deltanext) {
4452                         FAIL("Variable length lookbehind not implemented");
4453                     }
4454                     else if (minnext > (I32)U8_MAX) {
4455                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4456                     }
4457                     scan->flags = (U8)minnext;
4458                 }
4459                 if (data) {
4460                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4461                         pars++;
4462                     if (data_fake.flags & SF_HAS_EVAL)
4463                         data->flags |= SF_HAS_EVAL;
4464                     data->whilem_c = data_fake.whilem_c;
4465                 }
4466                 if (f & SCF_DO_STCLASS_AND) {
4467                     if (flags & SCF_DO_STCLASS_OR) {
4468                         /* OR before, AND after: ideally we would recurse with
4469                          * data_fake to get the AND applied by study of the
4470                          * remainder of the pattern, and then derecurse;
4471                          * *** HACK *** for now just treat as "no information".
4472                          * See [perl #56690].
4473                          */
4474                         cl_init(pRExC_state, data->start_class);
4475                     }  else {
4476                         /* AND before and after: combine and continue */
4477                         const int was = (data->start_class->flags & ANYOF_EOS);
4478
4479                         cl_and(data->start_class, &intrnl);
4480                         if (was)
4481                             data->start_class->flags |= ANYOF_EOS;
4482                     }
4483                 }
4484             }
4485 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4486             else {
4487                 /* Positive Lookahead/lookbehind
4488                    In this case we can do fixed string optimisation,
4489                    but we must be careful about it. Note in the case of
4490                    lookbehind the positions will be offset by the minimum
4491                    length of the pattern, something we won't know about
4492                    until after the recurse.
4493                 */
4494                 I32 deltanext, fake = 0;
4495                 regnode *nscan;
4496                 struct regnode_charclass_class intrnl;
4497                 int f = 0;
4498                 /* We use SAVEFREEPV so that when the full compile
4499                     is finished perl will clean up the allocated
4500                     minlens when it's all done. This way we don't
4501                     have to worry about freeing them when we know
4502                     they wont be used, which would be a pain.
4503                  */
4504                 I32 *minnextp;
4505                 Newx( minnextp, 1, I32 );
4506                 SAVEFREEPV(minnextp);
4507
4508                 if (data) {
4509                     StructCopy(data, &data_fake, scan_data_t);
4510                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4511                         f |= SCF_DO_SUBSTR;
4512                         if (scan->flags)
4513                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4514                         data_fake.last_found=newSVsv(data->last_found);
4515                     }
4516                 }
4517                 else
4518                     data_fake.last_closep = &fake;
4519                 data_fake.flags = 0;
4520                 data_fake.pos_delta = delta;
4521                 if (is_inf)
4522                     data_fake.flags |= SF_IS_INF;
4523                 if ( flags & SCF_DO_STCLASS && !scan->flags
4524                      && OP(scan) == IFMATCH ) { /* Lookahead */
4525                     cl_init(pRExC_state, &intrnl);
4526                     data_fake.start_class = &intrnl;
4527                     f |= SCF_DO_STCLASS_AND;
4528                 }
4529                 if (flags & SCF_WHILEM_VISITED_POS)
4530                     f |= SCF_WHILEM_VISITED_POS;
4531                 next = regnext(scan);
4532                 nscan = NEXTOPER(NEXTOPER(scan));
4533
4534                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4535                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4536                 if (scan->flags) {
4537                     if (deltanext) {
4538                         FAIL("Variable length lookbehind not implemented");
4539                     }
4540                     else if (*minnextp > (I32)U8_MAX) {
4541                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4542                     }
4543                     scan->flags = (U8)*minnextp;
4544                 }
4545
4546                 *minnextp += min;
4547
4548                 if (f & SCF_DO_STCLASS_AND) {
4549                     const int was = (data->start_class->flags & ANYOF_EOS);
4550
4551                     cl_and(data->start_class, &intrnl);
4552                     if (was)
4553                         data->start_class->flags |= ANYOF_EOS;
4554                 }
4555                 if (data) {
4556                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4557                         pars++;
4558                     if (data_fake.flags & SF_HAS_EVAL)
4559                         data->flags |= SF_HAS_EVAL;
4560                     data->whilem_c = data_fake.whilem_c;
4561                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4562                         if (RExC_rx->minlen<*minnextp)
4563                             RExC_rx->minlen=*minnextp;
4564                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4565                         SvREFCNT_dec(data_fake.last_found);
4566
4567                         if ( data_fake.minlen_fixed != minlenp )
4568                         {
4569                             data->offset_fixed= data_fake.offset_fixed;
4570                             data->minlen_fixed= data_fake.minlen_fixed;
4571                             data->lookbehind_fixed+= scan->flags;
4572                         }
4573                         if ( data_fake.minlen_float != minlenp )
4574                         {
4575                             data->minlen_float= data_fake.minlen_float;
4576                             data->offset_float_min=data_fake.offset_float_min;
4577                             data->offset_float_max=data_fake.offset_float_max;
4578                             data->lookbehind_float+= scan->flags;
4579                         }
4580                     }
4581                 }
4582             }
4583 #endif
4584         }
4585         else if (OP(scan) == OPEN) {
4586             if (stopparen != (I32)ARG(scan))
4587                 pars++;
4588         }
4589         else if (OP(scan) == CLOSE) {
4590             if (stopparen == (I32)ARG(scan)) {
4591                 break;
4592             }
4593             if ((I32)ARG(scan) == is_par) {
4594                 next = regnext(scan);
4595
4596                 if ( next && (OP(next) != WHILEM) && next < last)
4597                     is_par = 0;         /* Disable optimization */
4598             }
4599             if (data)
4600                 *(data->last_closep) = ARG(scan);
4601         }
4602         else if (OP(scan) == EVAL) {
4603                 if (data)
4604                     data->flags |= SF_HAS_EVAL;
4605         }
4606         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4607             if (flags & SCF_DO_SUBSTR) {
4608                 SCAN_COMMIT(pRExC_state,data,minlenp);
4609                 flags &= ~SCF_DO_SUBSTR;
4610             }
4611             if (data && OP(scan)==ACCEPT) {
4612                 data->flags |= SCF_SEEN_ACCEPT;
4613                 if (stopmin > min)
4614                     stopmin = min;
4615             }
4616         }
4617         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4618         {
4619                 if (flags & SCF_DO_SUBSTR) {
4620                     SCAN_COMMIT(pRExC_state,data,minlenp);
4621                     data->longest = &(data->longest_float);
4622                 }
4623                 is_inf = is_inf_internal = 1;
4624                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4625                     cl_anything(pRExC_state, data->start_class);
4626                 flags &= ~SCF_DO_STCLASS;
4627         }
4628         else if (OP(scan) == GPOS) {
4629             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4630                 !(delta || is_inf || (data && data->pos_delta)))
4631             {
4632                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4633                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4634                 if (RExC_rx->gofs < (U32)min)
4635                     RExC_rx->gofs = min;
4636             } else {
4637                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4638                 RExC_rx->gofs = 0;
4639             }
4640         }
4641 #ifdef TRIE_STUDY_OPT
4642 #ifdef FULL_TRIE_STUDY
4643         else if (PL_regkind[OP(scan)] == TRIE) {
4644             /* NOTE - There is similar code to this block above for handling
4645                BRANCH nodes on the initial study.  If you change stuff here
4646                check there too. */
4647             regnode *trie_node= scan;
4648             regnode *tail= regnext(scan);
4649             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4650             I32 max1 = 0, min1 = I32_MAX;
4651             struct regnode_charclass_class accum;
4652
4653             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4654                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4655             if (flags & SCF_DO_STCLASS)
4656                 cl_init_zero(pRExC_state, &accum);
4657
4658             if (!trie->jump) {
4659                 min1= trie->minlen;
4660                 max1= trie->maxlen;
4661             } else {
4662                 const regnode *nextbranch= NULL;
4663                 U32 word;
4664
4665                 for ( word=1 ; word <= trie->wordcount ; word++)
4666                 {
4667                     I32 deltanext=0, minnext=0, f = 0, fake;
4668                     struct regnode_charclass_class this_class;
4669
4670                     data_fake.flags = 0;
4671                     if (data) {
4672                         data_fake.whilem_c = data->whilem_c;
4673                         data_fake.last_closep = data->last_closep;
4674                     }
4675                     else
4676                         data_fake.last_closep = &fake;
4677                     data_fake.pos_delta = delta;
4678                     if (flags & SCF_DO_STCLASS) {
4679                         cl_init(pRExC_state, &this_class);
4680                         data_fake.start_class = &this_class;
4681                         f = SCF_DO_STCLASS_AND;
4682                     }
4683                     if (flags & SCF_WHILEM_VISITED_POS)
4684                         f |= SCF_WHILEM_VISITED_POS;
4685
4686                     if (trie->jump[word]) {
4687                         if (!nextbranch)
4688                             nextbranch = trie_node + trie->jump[0];
4689                         scan= trie_node + trie->jump[word];
4690                         /* We go from the jump point to the branch that follows
4691                            it. Note this means we need the vestigal unused branches
4692                            even though they arent otherwise used.
4693                          */
4694                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4695                             &deltanext, (regnode *)nextbranch, &data_fake,
4696                             stopparen, recursed, NULL, f,depth+1);
4697                     }
4698                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4699                         nextbranch= regnext((regnode*)nextbranch);
4700
4701                     if (min1 > (I32)(minnext + trie->minlen))
4702                         min1 = minnext + trie->minlen;
4703                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4704                         max1 = minnext + deltanext + trie->maxlen;
4705                     if (deltanext == I32_MAX)
4706                         is_inf = is_inf_internal = 1;
4707
4708                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4709                         pars++;
4710                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4711                         if ( stopmin > min + min1)
4712                             stopmin = min + min1;
4713                         flags &= ~SCF_DO_SUBSTR;
4714                         if (data)
4715                             data->flags |= SCF_SEEN_ACCEPT;
4716                     }
4717                     if (data) {
4718                         if (data_fake.flags & SF_HAS_EVAL)
4719                             data->flags |= SF_HAS_EVAL;
4720                         data->whilem_c = data_fake.whilem_c;
4721                     }
4722                     if (flags & SCF_DO_STCLASS)
4723                         cl_or(pRExC_state, &accum, &this_class);
4724                 }
4725             }
4726             if (flags & SCF_DO_SUBSTR) {
4727                 data->pos_min += min1;
4728                 data->pos_delta += max1 - min1;
4729                 if (max1 != min1 || is_inf)
4730                     data->longest = &(data->longest_float);
4731             }
4732             min += min1;
4733             delta += max1 - min1;
4734             if (flags & SCF_DO_STCLASS_OR) {
4735                 cl_or(pRExC_state, data->start_class, &accum);
4736                 if (min1) {
4737                     cl_and(data->start_class, and_withp);
4738                     flags &= ~SCF_DO_STCLASS;
4739                 }
4740             }
4741             else if (flags & SCF_DO_STCLASS_AND) {
4742                 if (min1) {
4743                     cl_and(data->start_class, &accum);
4744                     flags &= ~SCF_DO_STCLASS;
4745                 }
4746                 else {
4747                     /* Switch to OR mode: cache the old value of
4748                      * data->start_class */
4749                     INIT_AND_WITHP;
4750                     StructCopy(data->start_class, and_withp,
4751                                struct regnode_charclass_class);
4752                     flags &= ~SCF_DO_STCLASS_AND;
4753                     StructCopy(&accum, data->start_class,
4754                                struct regnode_charclass_class);
4755                     flags |= SCF_DO_STCLASS_OR;
4756                     data->start_class->flags |= ANYOF_EOS;
4757                 }
4758             }
4759             scan= tail;
4760             continue;
4761         }
4762 #else
4763         else if (PL_regkind[OP(scan)] == TRIE) {
4764             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4765             U8*bang=NULL;
4766
4767             min += trie->minlen;
4768             delta += (trie->maxlen - trie->minlen);
4769             flags &= ~SCF_DO_STCLASS; /* xxx */
4770             if (flags & SCF_DO_SUBSTR) {
4771                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4772                 data->pos_min += trie->minlen;
4773                 data->pos_delta += (trie->maxlen - trie->minlen);
4774                 if (trie->maxlen != trie->minlen)
4775                     data->longest = &(data->longest_float);
4776             }
4777             if (trie->jump) /* no more substrings -- for now /grr*/
4778                 flags &= ~SCF_DO_SUBSTR;
4779         }
4780 #endif /* old or new */
4781 #endif /* TRIE_STUDY_OPT */
4782
4783         /* Else: zero-length, ignore. */
4784         scan = regnext(scan);
4785     }
4786     if (frame) {
4787         last = frame->last;
4788         scan = frame->next;
4789         stopparen = frame->stop;
4790         frame = frame->prev;
4791         goto fake_study_recurse;
4792     }
4793
4794   finish:
4795     assert(!frame);
4796     DEBUG_STUDYDATA("pre-fin:",data,depth);
4797
4798     *scanp = scan;
4799     *deltap = is_inf_internal ? I32_MAX : delta;
4800     if (flags & SCF_DO_SUBSTR && is_inf)
4801         data->pos_delta = I32_MAX - data->pos_min;
4802     if (is_par > (I32)U8_MAX)
4803         is_par = 0;
4804     if (is_par && pars==1 && data) {
4805         data->flags |= SF_IN_PAR;
4806         data->flags &= ~SF_HAS_PAR;
4807     }
4808     else if (pars && data) {
4809         data->flags |= SF_HAS_PAR;
4810         data->flags &= ~SF_IN_PAR;
4811     }
4812     if (flags & SCF_DO_STCLASS_OR)
4813         cl_and(data->start_class, and_withp);
4814     if (flags & SCF_TRIE_RESTUDY)
4815         data->flags |=  SCF_TRIE_RESTUDY;
4816
4817     DEBUG_STUDYDATA("post-fin:",data,depth);
4818
4819     return min < stopmin ? min : stopmin;
4820 }
4821
4822 STATIC U32
4823 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4824 {
4825     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4826
4827     PERL_ARGS_ASSERT_ADD_DATA;
4828
4829     Renewc(RExC_rxi->data,
4830            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4831            char, struct reg_data);
4832     if(count)
4833         Renew(RExC_rxi->data->what, count + n, U8);
4834     else
4835         Newx(RExC_rxi->data->what, n, U8);
4836     RExC_rxi->data->count = count + n;
4837     Copy(s, RExC_rxi->data->what + count, n, U8);
4838     return count;
4839 }
4840
4841 /*XXX: todo make this not included in a non debugging perl */
4842 #ifndef PERL_IN_XSUB_RE
4843 void
4844 Perl_reginitcolors(pTHX)
4845 {
4846     dVAR;
4847     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4848     if (s) {
4849         char *t = savepv(s);
4850         int i = 0;
4851         PL_colors[0] = t;
4852         while (++i < 6) {
4853             t = strchr(t, '\t');
4854             if (t) {
4855                 *t = '\0';
4856                 PL_colors[i] = ++t;
4857             }
4858             else
4859                 PL_colors[i] = t = (char *)"";
4860         }
4861     } else {
4862         int i = 0;
4863         while (i < 6)
4864             PL_colors[i++] = (char *)"";
4865     }
4866     PL_colorset = 1;
4867 }
4868 #endif
4869
4870
4871 #ifdef TRIE_STUDY_OPT
4872 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4873     STMT_START {                                            \
4874         if (                                                \
4875               (data.flags & SCF_TRIE_RESTUDY)               \
4876               && ! restudied++                              \
4877         ) {                                                 \
4878             dOsomething;                                    \
4879             goto reStudy;                                   \
4880         }                                                   \
4881     } STMT_END
4882 #else
4883 #define CHECK_RESTUDY_GOTO_butfirst
4884 #endif
4885
4886 /*
4887  * pregcomp - compile a regular expression into internal code
4888  *
4889  * Decides which engine's compiler to call based on the hint currently in
4890  * scope
4891  */
4892
4893 #ifndef PERL_IN_XSUB_RE
4894
4895 /* return the currently in-scope regex engine (or the default if none)  */
4896
4897 regexp_engine const *
4898 Perl_current_re_engine(pTHX)
4899 {
4900     dVAR;
4901
4902     if (IN_PERL_COMPILETIME) {
4903         HV * const table = GvHV(PL_hintgv);
4904         SV **ptr;
4905
4906         if (!table)
4907             return &PL_core_reg_engine;
4908         ptr = hv_fetchs(table, "regcomp", FALSE);
4909         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4910             return &PL_core_reg_engine;
4911         return INT2PTR(regexp_engine*,SvIV(*ptr));
4912     }
4913     else {
4914         SV *ptr;
4915         if (!PL_curcop->cop_hints_hash)
4916             return &PL_core_reg_engine;
4917         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4918         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4919             return &PL_core_reg_engine;
4920         return INT2PTR(regexp_engine*,SvIV(ptr));
4921     }
4922 }
4923
4924
4925 REGEXP *
4926 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4927 {
4928     dVAR;
4929     regexp_engine const *eng = current_re_engine();
4930     GET_RE_DEBUG_FLAGS_DECL;
4931
4932     PERL_ARGS_ASSERT_PREGCOMP;
4933
4934     /* Dispatch a request to compile a regexp to correct regexp engine. */
4935     DEBUG_COMPILE_r({
4936         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4937                         PTR2UV(eng));
4938     });
4939     return CALLREGCOMP_ENG(eng, pattern, flags);
4940 }
4941 #endif
4942
4943 /* public(ish) entry point for the perl core's own regex compiling code.
4944  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4945  * pattern rather than a list of OPs, and uses the internal engine rather
4946  * than the current one */
4947
4948 REGEXP *
4949 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4950 {
4951     SV *pat = pattern; /* defeat constness! */
4952     PERL_ARGS_ASSERT_RE_COMPILE;
4953     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4954 #ifdef PERL_IN_XSUB_RE
4955                                 &my_reg_engine,
4956 #else
4957                                 &PL_core_reg_engine,
4958 #endif
4959                                 NULL, NULL, rx_flags, 0);
4960 }
4961
4962 /* see if there are any run-time code blocks in the pattern.
4963  * False positives are allowed */
4964
4965 static bool
4966 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4967                     U32 pm_flags, char *pat, STRLEN plen)
4968 {
4969     int n = 0;
4970     STRLEN s;
4971
4972     /* avoid infinitely recursing when we recompile the pattern parcelled up
4973      * as qr'...'. A single constant qr// string can't have have any
4974      * run-time component in it, and thus, no runtime code. (A non-qr
4975      * string, however, can, e.g. $x =~ '(?{})') */
4976     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4977         return 0;
4978
4979     for (s = 0; s < plen; s++) {
4980         if (n < pRExC_state->num_code_blocks
4981             && s == pRExC_state->code_blocks[n].start)
4982         {
4983             s = pRExC_state->code_blocks[n].end;
4984             n++;
4985             continue;
4986         }
4987         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4988          * positives here */
4989         if (pat[s] == '(' && pat[s+1] == '?' &&
4990             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4991         )
4992             return 1;
4993     }
4994     return 0;
4995 }
4996
4997 /* Handle run-time code blocks. We will already have compiled any direct
4998  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4999  * copy of it, but with any literal code blocks blanked out and
5000  * appropriate chars escaped; then feed it into
5001  *
5002  *    eval "qr'modified_pattern'"
5003  *
5004  * For example,
5005  *
5006  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5007  *
5008  * becomes
5009  *
5010  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5011  *
5012  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5013  * and merge them with any code blocks of the original regexp.
5014  *
5015  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5016  * instead, just save the qr and return FALSE; this tells our caller that
5017  * the original pattern needs upgrading to utf8.
5018  */
5019
5020 static bool
5021 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5022     char *pat, STRLEN plen)
5023 {
5024     SV *qr;
5025
5026     GET_RE_DEBUG_FLAGS_DECL;
5027
5028     if (pRExC_state->runtime_code_qr) {
5029         /* this is the second time we've been called; this should
5030          * only happen if the main pattern got upgraded to utf8
5031          * during compilation; re-use the qr we compiled first time
5032          * round (which should be utf8 too)
5033          */
5034         qr = pRExC_state->runtime_code_qr;
5035         pRExC_state->runtime_code_qr = NULL;
5036         assert(RExC_utf8 && SvUTF8(qr));
5037     }
5038     else {
5039         int n = 0;
5040         STRLEN s;
5041         char *p, *newpat;
5042         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5043         SV *sv, *qr_ref;
5044         dSP;
5045
5046         /* determine how many extra chars we need for ' and \ escaping */
5047         for (s = 0; s < plen; s++) {
5048             if (pat[s] == '\'' || pat[s] == '\\')
5049                 newlen++;
5050         }
5051
5052         Newx(newpat, newlen, char);
5053         p = newpat;
5054         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5055
5056         for (s = 0; s < plen; s++) {
5057             if (n < pRExC_state->num_code_blocks
5058                 && s == pRExC_state->code_blocks[n].start)
5059             {
5060                 /* blank out literal code block */
5061                 assert(pat[s] == '(');
5062                 while (s <= pRExC_state->code_blocks[n].end) {
5063                     *p++ = '_';
5064                     s++;
5065                 }
5066                 s--;
5067                 n++;
5068                 continue;
5069             }
5070             if (pat[s] == '\'' || pat[s] == '\\')
5071                 *p++ = '\\';
5072             *p++ = pat[s];
5073         }
5074         *p++ = '\'';
5075         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5076             *p++ = 'x';
5077         *p++ = '\0';
5078         DEBUG_COMPILE_r({
5079             PerlIO_printf(Perl_debug_log,
5080                 "%sre-parsing pattern for runtime code:%s %s\n",
5081                 PL_colors[4],PL_colors[5],newpat);
5082         });
5083
5084         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5085         Safefree(newpat);
5086
5087         ENTER;
5088         SAVETMPS;
5089         save_re_context();
5090         PUSHSTACKi(PERLSI_REQUIRE);
5091         /* this causes the toker to collapse \\ into \ when parsing
5092          * qr''; normally only q'' does this. It also alters hints
5093          * handling */
5094         PL_reg_state.re_reparsing = TRUE;
5095         eval_sv(sv, G_SCALAR);
5096         SvREFCNT_dec(sv);
5097         SPAGAIN;
5098         qr_ref = POPs;
5099         PUTBACK;
5100         {
5101             SV * const errsv = ERRSV;
5102             if (SvTRUE_NN(errsv))
5103             {
5104                 Safefree(pRExC_state->code_blocks);
5105                 /* use croak_sv ? */
5106                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5107             }
5108         }
5109         assert(SvROK(qr_ref));
5110         qr = SvRV(qr_ref);
5111         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5112         /* the leaving below frees the tmp qr_ref.
5113          * Give qr a life of its own */
5114         SvREFCNT_inc(qr);
5115         POPSTACK;
5116         FREETMPS;
5117         LEAVE;
5118
5119     }
5120
5121     if (!RExC_utf8 && SvUTF8(qr)) {
5122         /* first time through; the pattern got upgraded; save the
5123          * qr for the next time through */
5124         assert(!pRExC_state->runtime_code_qr);
5125         pRExC_state->runtime_code_qr = qr;
5126         return 0;
5127     }
5128
5129
5130     /* extract any code blocks within the returned qr//  */
5131
5132
5133     /* merge the main (r1) and run-time (r2) code blocks into one */
5134     {
5135         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5136         struct reg_code_block *new_block, *dst;
5137         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5138         int i1 = 0, i2 = 0;
5139
5140         if (!r2->num_code_blocks) /* we guessed wrong */
5141         {
5142             SvREFCNT_dec(qr);
5143             return 1;
5144         }
5145
5146         Newx(new_block,
5147             r1->num_code_blocks + r2->num_code_blocks,
5148             struct reg_code_block);
5149         dst = new_block;
5150
5151         while (    i1 < r1->num_code_blocks
5152                 || i2 < r2->num_code_blocks)
5153         {
5154             struct reg_code_block *src;
5155             bool is_qr = 0;
5156
5157             if (i1 == r1->num_code_blocks) {
5158                 src = &r2->code_blocks[i2++];
5159                 is_qr = 1;
5160             }
5161             else if (i2 == r2->num_code_blocks)
5162                 src = &r1->code_blocks[i1++];
5163             else if (  r1->code_blocks[i1].start
5164                      < r2->code_blocks[i2].start)
5165             {
5166                 src = &r1->code_blocks[i1++];
5167                 assert(src->end < r2->code_blocks[i2].start);
5168             }
5169             else {
5170                 assert(  r1->code_blocks[i1].start
5171                        > r2->code_blocks[i2].start);
5172                 src = &r2->code_blocks[i2++];
5173                 is_qr = 1;
5174                 assert(src->end < r1->code_blocks[i1].start);
5175             }
5176
5177             assert(pat[src->start] == '(');
5178             assert(pat[src->end]   == ')');
5179             dst->start      = src->start;
5180             dst->end        = src->end;
5181             dst->block      = src->block;
5182             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5183                                     : src->src_regex;
5184             dst++;
5185         }
5186         r1->num_code_blocks += r2->num_code_blocks;
5187         Safefree(r1->code_blocks);
5188         r1->code_blocks = new_block;
5189     }
5190
5191     SvREFCNT_dec(qr);
5192     return 1;
5193 }
5194
5195
5196 STATIC bool
5197 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5198 {
5199     /* This is the common code for setting up the floating and fixed length
5200      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5201      * as to whether succeeded or not */
5202
5203     I32 t,ml;
5204
5205     if (! (longest_length
5206            || (eol /* Can't have SEOL and MULTI */
5207                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5208           )
5209             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5210         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5211     {
5212         return FALSE;
5213     }
5214
5215     /* copy the information about the longest from the reg_scan_data
5216         over to the program. */
5217     if (SvUTF8(sv_longest)) {
5218         *rx_utf8 = sv_longest;
5219         *rx_substr = NULL;
5220     } else {
5221         *rx_substr = sv_longest;
5222         *rx_utf8 = NULL;
5223     }
5224     /* end_shift is how many chars that must be matched that
5225         follow this item. We calculate it ahead of time as once the
5226         lookbehind offset is added in we lose the ability to correctly
5227         calculate it.*/
5228     ml = minlen ? *(minlen) : (I32)longest_length;
5229     *rx_end_shift = ml - offset
5230         - longest_length + (SvTAIL(sv_longest) != 0)
5231         + lookbehind;
5232
5233     t = (eol/* Can't have SEOL and MULTI */
5234          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5235     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5236
5237     return TRUE;
5238 }
5239
5240 /*
5241  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5242  * regular expression into internal code.
5243  * The pattern may be passed either as:
5244  *    a list of SVs (patternp plus pat_count)
5245  *    a list of OPs (expr)
5246  * If both are passed, the SV list is used, but the OP list indicates
5247  * which SVs are actually pre-compiled code blocks
5248  *
5249  * The SVs in the list have magic and qr overloading applied to them (and
5250  * the list may be modified in-place with replacement SVs in the latter
5251  * case).
5252  *
5253  * If the pattern hasn't changed from old_re, then old_re will be
5254  * returned.
5255  *
5256  * eng is the current engine. If that engine has an op_comp method, then
5257  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5258  * do the initial concatenation of arguments and pass on to the external
5259  * engine.
5260  *
5261  * If is_bare_re is not null, set it to a boolean indicating whether the
5262  * arg list reduced (after overloading) to a single bare regex which has
5263  * been returned (i.e. /$qr/).
5264  *
5265  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5266  *
5267  * pm_flags contains the PMf_* flags, typically based on those from the
5268  * pm_flags field of the related PMOP. Currently we're only interested in
5269  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5270  *
5271  * We can't allocate space until we know how big the compiled form will be,
5272  * but we can't compile it (and thus know how big it is) until we've got a
5273  * place to put the code.  So we cheat:  we compile it twice, once with code
5274  * generation turned off and size counting turned on, and once "for real".
5275  * This also means that we don't allocate space until we are sure that the
5276  * thing really will compile successfully, and we never have to move the
5277  * code and thus invalidate pointers into it.  (Note that it has to be in
5278  * one piece because free() must be able to free it all.) [NB: not true in perl]
5279  *
5280  * Beware that the optimization-preparation code in here knows about some
5281  * of the structure of the compiled regexp.  [I'll say.]
5282  */
5283
5284 REGEXP *
5285 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5286                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5287                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5288 {
5289     dVAR;
5290     REGEXP *rx;
5291     struct regexp *r;
5292     regexp_internal *ri;
5293     STRLEN plen;
5294     char  * VOL exp;
5295     char* xend;
5296     regnode *scan;
5297     I32 flags;
5298     I32 minlen = 0;
5299     U32 rx_flags;
5300     SV * VOL pat;
5301     SV * VOL code_blocksv = NULL;
5302
5303     /* these are all flags - maybe they should be turned
5304      * into a single int with different bit masks */
5305     I32 sawlookahead = 0;
5306     I32 sawplus = 0;
5307     I32 sawopen = 0;
5308     bool used_setjump = FALSE;
5309     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5310     bool code_is_utf8 = 0;
5311     bool VOL recompile = 0;
5312     bool runtime_code = 0;
5313     U8 jump_ret = 0;
5314     dJMPENV;
5315     scan_data_t data;
5316     RExC_state_t RExC_state;
5317     RExC_state_t * const pRExC_state = &RExC_state;
5318 #ifdef TRIE_STUDY_OPT
5319     int restudied;
5320     RExC_state_t copyRExC_state;
5321 #endif
5322     GET_RE_DEBUG_FLAGS_DECL;
5323
5324     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5325
5326     DEBUG_r(if (!PL_colorset) reginitcolors());
5327
5328 #ifndef PERL_IN_XSUB_RE
5329     /* Initialize these here instead of as-needed, as is quick and avoids
5330      * having to test them each time otherwise */
5331     if (! PL_AboveLatin1) {
5332         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5333         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5334         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5335
5336         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5337         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5338
5339         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5340         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5341
5342         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5343         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5344
5345         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5346
5347         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5348         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5349
5350         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5351
5352         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5353         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5354
5355         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5356         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5357
5358         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5359         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5360
5361         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5362         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5363
5364         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5365         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5366
5367         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5368         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5369
5370         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5371         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5372
5373         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5374
5375         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5376         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5377
5378         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5379         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5380
5381         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5382     }
5383 #endif
5384
5385     pRExC_state->code_blocks = NULL;
5386     pRExC_state->num_code_blocks = 0;
5387
5388     if (is_bare_re)
5389         *is_bare_re = FALSE;
5390
5391     if (expr && (expr->op_type == OP_LIST ||
5392                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5393
5394         /* is the source UTF8, and how many code blocks are there? */
5395         OP *o;
5396         int ncode = 0;
5397
5398         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5399             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5400                 code_is_utf8 = 1;
5401             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5402                 /* count of DO blocks */
5403                 ncode++;
5404         }
5405         if (ncode) {
5406             pRExC_state->num_code_blocks = ncode;
5407             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5408         }
5409     }
5410
5411     if (pat_count) {
5412         /* handle a list of SVs */
5413
5414         SV **svp;
5415
5416         /* apply magic and RE overloading to each arg */
5417         for (svp = patternp; svp < patternp + pat_count; svp++) {
5418             SV *rx = *svp;
5419             SvGETMAGIC(rx);
5420             if (SvROK(rx) && SvAMAGIC(rx)) {
5421                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5422                 if (sv) {
5423                     if (SvROK(sv))
5424                         sv = SvRV(sv);
5425                     if (SvTYPE(sv) != SVt_REGEXP)
5426                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5427                     *svp = sv;
5428                 }
5429             }
5430         }
5431
5432         if (pat_count > 1) {
5433             /* concat multiple args and find any code block indexes */
5434
5435             OP *o = NULL;
5436             int n = 0;
5437             bool utf8 = 0;
5438             STRLEN orig_patlen = 0;
5439
5440             if (pRExC_state->num_code_blocks) {
5441                 o = cLISTOPx(expr)->op_first;
5442                 assert(   o->op_type == OP_PUSHMARK
5443                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5444                        || o->op_type == OP_PADRANGE);
5445                 o = o->op_sibling;
5446             }
5447
5448             pat = newSVpvn("", 0);
5449             SAVEFREESV(pat);
5450
5451             /* determine if the pattern is going to be utf8 (needed
5452              * in advance to align code block indices correctly).
5453              * XXX This could fail to be detected for an arg with
5454              * overloading but not concat overloading; but the main effect
5455              * in this obscure case is to need a 'use re eval' for a
5456              * literal code block */
5457             for (svp = patternp; svp < patternp + pat_count; svp++) {
5458                 if (SvUTF8(*svp))
5459                     utf8 = 1;
5460             }
5461             if (utf8)
5462                 SvUTF8_on(pat);
5463
5464             for (svp = patternp; svp < patternp + pat_count; svp++) {
5465                 SV *sv, *msv = *svp;
5466                 SV *rx;
5467                 bool code = 0;
5468                 /* we make the assumption here that each op in the list of
5469                  * op_siblings maps to one SV pushed onto the stack,
5470                  * except for code blocks, with have both an OP_NULL and
5471                  * and OP_CONST.
5472                  * This allows us to match up the list of SVs against the
5473                  * list of OPs to find the next code block.
5474                  *
5475                  * Note that       PUSHMARK PADSV PADSV ..
5476                  * is optimised to
5477                  *                 PADRANGE NULL  NULL  ..
5478                  * so the alignment still works. */
5479                 if (o) {
5480                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5481                         assert(n < pRExC_state->num_code_blocks);
5482                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5483                         pRExC_state->code_blocks[n].block = o;
5484                         pRExC_state->code_blocks[n].src_regex = NULL;
5485                         n++;
5486                         code = 1;
5487                         o = o->op_sibling; /* skip CONST */
5488                         assert(o);
5489                     }
5490                     o = o->op_sibling;;
5491                 }
5492
5493                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5494                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5495                 {
5496                     sv_setsv(pat, sv);
5497                     /* overloading involved: all bets are off over literal
5498                      * code. Pretend we haven't seen it */
5499                     pRExC_state->num_code_blocks -= n;
5500                     n = 0;
5501                     rx = NULL;
5502
5503                 }
5504                 else  {
5505                     while (SvAMAGIC(msv)
5506                             && (sv = AMG_CALLunary(msv, string_amg))
5507                             && sv != msv
5508                             &&  !(   SvROK(msv)
5509                                   && SvROK(sv)
5510                                   && SvRV(msv) == SvRV(sv))
5511                     ) {
5512                         msv = sv;
5513                         SvGETMAGIC(msv);
5514                     }
5515                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5516                         msv = SvRV(msv);
5517                     orig_patlen = SvCUR(pat);
5518                     sv_catsv_nomg(pat, msv);
5519                     rx = msv;
5520                     if (code)
5521                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5522                 }
5523
5524                 /* extract any code blocks within any embedded qr//'s */
5525                 if (rx && SvTYPE(rx) == SVt_REGEXP
5526                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5527                 {
5528
5529                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5530                     if (ri->num_code_blocks) {
5531                         int i;
5532                         /* the presence of an embedded qr// with code means
5533                          * we should always recompile: the text of the
5534                          * qr// may not have changed, but it may be a
5535                          * different closure than last time */
5536                         recompile = 1;
5537                         Renew(pRExC_state->code_blocks,
5538                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5539                             struct reg_code_block);
5540                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5541                         for (i=0; i < ri->num_code_blocks; i++) {
5542                             struct reg_code_block *src, *dst;
5543                             STRLEN offset =  orig_patlen
5544                                 + ReANY((REGEXP *)rx)->pre_prefix;
5545                             assert(n < pRExC_state->num_code_blocks);
5546                             src = &ri->code_blocks[i];
5547                             dst = &pRExC_state->code_blocks[n];
5548                             dst->start      = src->start + offset;
5549                             dst->end        = src->end   + offset;
5550                             dst->block      = src->block;
5551                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5552                                                     src->src_regex
5553                                                         ? src->src_regex
5554                                                         : (REGEXP*)rx);
5555                             n++;
5556                         }
5557                     }
5558                 }
5559             }
5560             SvSETMAGIC(pat);
5561         }
5562         else {
5563             SV *sv;
5564             pat = *patternp;
5565             while (SvAMAGIC(pat)
5566                     && (sv = AMG_CALLunary(pat, string_amg))
5567                     && sv != pat)
5568             {
5569                 pat = sv;
5570                 SvGETMAGIC(pat);
5571             }
5572         }
5573
5574         /* handle bare regex: foo =~ $re */
5575         {
5576             SV *re = pat;
5577             if (SvROK(re))
5578                 re = SvRV(re);
5579             if (SvTYPE(re) == SVt_REGEXP) {
5580                 if (is_bare_re)
5581                     *is_bare_re = TRUE;
5582                 SvREFCNT_inc(re);
5583                 Safefree(pRExC_state->code_blocks);
5584                 return (REGEXP*)re;
5585             }
5586         }
5587     }
5588     else {
5589         /* not a list of SVs, so must be a list of OPs */
5590         assert(expr);
5591         if (expr->op_type == OP_LIST) {
5592             int i = -1;
5593             bool is_code = 0;
5594             OP *o;
5595
5596             pat = newSVpvn("", 0);
5597             SAVEFREESV(pat);
5598             if (code_is_utf8)
5599                 SvUTF8_on(pat);
5600
5601             /* given a list of CONSTs and DO blocks in expr, append all
5602              * the CONSTs to pat, and record the start and end of each
5603              * code block in code_blocks[] (each DO{} op is followed by an
5604              * OP_CONST containing the corresponding literal '(?{...})
5605              * text)
5606              */
5607             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5608                 if (o->op_type == OP_CONST) {
5609                     sv_catsv(pat, cSVOPo_sv);
5610                     if (is_code) {
5611                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5612                         is_code = 0;
5613                     }
5614                 }
5615                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5616                     assert(i+1 < pRExC_state->num_code_blocks);
5617                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5618                     pRExC_state->code_blocks[i].block = o;
5619                     pRExC_state->code_blocks[i].src_regex = NULL;
5620                     is_code = 1;
5621                 }
5622             }
5623         }
5624         else {
5625             assert(expr->op_type == OP_CONST);
5626             pat = cSVOPx_sv(expr);
5627         }
5628     }
5629
5630     exp = SvPV_nomg(pat, plen);
5631
5632     if (!eng->op_comp) {
5633         if ((SvUTF8(pat) && IN_BYTES)
5634                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5635         {
5636             /* make a temporary copy; either to convert to bytes,
5637              * or to avoid repeating get-magic / overloaded stringify */
5638             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5639                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5640         }
5641         Safefree(pRExC_state->code_blocks);
5642         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5643     }
5644
5645     /* ignore the utf8ness if the pattern is 0 length */
5646     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5647     RExC_uni_semantics = 0;
5648     RExC_contains_locale = 0;
5649     pRExC_state->runtime_code_qr = NULL;
5650
5651     /****************** LONG JUMP TARGET HERE***********************/
5652     /* Longjmp back to here if have to switch in midstream to utf8 */
5653     if (! RExC_orig_utf8) {
5654         JMPENV_PUSH(jump_ret);
5655         used_setjump = TRUE;
5656     }
5657
5658     if (jump_ret == 0) {    /* First time through */
5659         xend = exp + plen;
5660
5661         DEBUG_COMPILE_r({
5662             SV *dsv= sv_newmortal();
5663             RE_PV_QUOTED_DECL(s, RExC_utf8,
5664                 dsv, exp, plen, 60);
5665             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5666                            PL_colors[4],PL_colors[5],s);
5667         });
5668     }
5669     else {  /* longjumped back */
5670         U8 *src, *dst;
5671         int n=0;
5672         STRLEN s = 0, d = 0;
5673         bool do_end = 0;
5674
5675         /* If the cause for the longjmp was other than changing to utf8, pop
5676          * our own setjmp, and longjmp to the correct handler */
5677         if (jump_ret != UTF8_LONGJMP) {
5678             JMPENV_POP;
5679             JMPENV_JUMP(jump_ret);
5680         }
5681
5682         GET_RE_DEBUG_FLAGS;
5683
5684         /* It's possible to write a regexp in ascii that represents Unicode
5685         codepoints outside of the byte range, such as via \x{100}. If we
5686         detect such a sequence we have to convert the entire pattern to utf8
5687         and then recompile, as our sizing calculation will have been based
5688         on 1 byte == 1 character, but we will need to use utf8 to encode
5689         at least some part of the pattern, and therefore must convert the whole
5690         thing.
5691         -- dmq */
5692         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5693             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5694
5695         /* upgrade pattern to UTF8, and if there are code blocks,
5696          * recalculate the indices.
5697          * This is essentially an unrolled Perl_bytes_to_utf8() */
5698
5699         src = (U8*)SvPV_nomg(pat, plen);
5700         Newx(dst, plen * 2 + 1, U8);
5701
5702         while (s < plen) {
5703             const UV uv = NATIVE_TO_ASCII(src[s]);
5704             if (UNI_IS_INVARIANT(uv))
5705                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5706             else {
5707                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5708                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5709             }
5710             if (n < pRExC_state->num_code_blocks) {
5711                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5712                     pRExC_state->code_blocks[n].start = d;
5713                     assert(dst[d] == '(');
5714                     do_end = 1;
5715                 }
5716                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5717                     pRExC_state->code_blocks[n].end = d;
5718                     assert(dst[d] == ')');
5719                     do_end = 0;
5720                     n++;
5721                 }
5722             }
5723             s++;
5724             d++;
5725         }
5726         dst[d] = '\0';
5727         plen = d;
5728         exp = (char*) dst;
5729         xend = exp + plen;
5730         SAVEFREEPV(exp);
5731         RExC_orig_utf8 = RExC_utf8 = 1;
5732     }
5733
5734     /* return old regex if pattern hasn't changed */
5735
5736     if (   old_re
5737         && !recompile
5738         && !!RX_UTF8(old_re) == !!RExC_utf8
5739         && RX_PRECOMP(old_re)
5740         && RX_PRELEN(old_re) == plen
5741         && memEQ(RX_PRECOMP(old_re), exp, plen))
5742     {
5743         /* with runtime code, always recompile */
5744         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5745                                             exp, plen);
5746         if (!runtime_code) {
5747             if (used_setjump) {
5748                 JMPENV_POP;
5749             }
5750             Safefree(pRExC_state->code_blocks);
5751             return old_re;
5752         }
5753     }
5754     else if ((pm_flags & PMf_USE_RE_EVAL)
5755                 /* this second condition covers the non-regex literal case,
5756                  * i.e.  $foo =~ '(?{})'. */
5757                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5758                     && (PL_hints & HINT_RE_EVAL))
5759     )
5760         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5761                             exp, plen);
5762
5763 #ifdef TRIE_STUDY_OPT
5764     restudied = 0;
5765 #endif
5766
5767     rx_flags = orig_rx_flags;
5768
5769     if (initial_charset == REGEX_LOCALE_CHARSET) {
5770         RExC_contains_locale = 1;
5771     }
5772     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5773
5774         /* Set to use unicode semantics if the pattern is in utf8 and has the
5775          * 'depends' charset specified, as it means unicode when utf8  */
5776         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5777     }
5778
5779     RExC_precomp = exp;
5780     RExC_flags = rx_flags;
5781     RExC_pm_flags = pm_flags;
5782
5783     if (runtime_code) {
5784         if (TAINTING_get && TAINT_get)
5785             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5786
5787         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5788             /* whoops, we have a non-utf8 pattern, whilst run-time code
5789              * got compiled as utf8. Try again with a utf8 pattern */
5790              JMPENV_JUMP(UTF8_LONGJMP);
5791         }
5792     }
5793     assert(!pRExC_state->runtime_code_qr);
5794
5795     RExC_sawback = 0;
5796
5797     RExC_seen = 0;
5798     RExC_in_lookbehind = 0;
5799     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5800     RExC_extralen = 0;
5801     RExC_override_recoding = 0;
5802     RExC_in_multi_char_class = 0;
5803
5804     /* First pass: determine size, legality. */
5805     RExC_parse = exp;
5806     RExC_start = exp;
5807     RExC_end = xend;
5808     RExC_naughty = 0;
5809     RExC_npar = 1;
5810     RExC_nestroot = 0;
5811     RExC_size = 0L;
5812     RExC_emit = &PL_regdummy;
5813     RExC_whilem_seen = 0;
5814     RExC_open_parens = NULL;
5815     RExC_close_parens = NULL;
5816     RExC_opend = NULL;
5817     RExC_paren_names = NULL;
5818 #ifdef DEBUGGING
5819     RExC_paren_name_list = NULL;
5820 #endif
5821     RExC_recurse = NULL;
5822     RExC_recurse_count = 0;
5823     pRExC_state->code_index = 0;
5824
5825 #if 0 /* REGC() is (currently) a NOP at the first pass.
5826        * Clever compilers notice this and complain. --jhi */
5827     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5828 #endif
5829     DEBUG_PARSE_r(
5830         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5831         RExC_lastnum=0;
5832         RExC_lastparse=NULL;
5833     );
5834     /* reg may croak on us, not giving us a chance to free
5835        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5836        need it to survive as long as the regexp (qr/(?{})/).
5837        We must check that code_blocksv is not already set, because we may
5838        have longjmped back. */
5839     if (pRExC_state->code_blocks && !code_blocksv) {
5840         code_blocksv = newSV_type(SVt_PV);
5841         SAVEFREESV(code_blocksv);
5842         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5843         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5844     }
5845     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5846         RExC_precomp = NULL;
5847         return(NULL);
5848     }
5849     if (code_blocksv)
5850         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5851
5852     /* Here, finished first pass.  Get rid of any added setjmp */
5853     if (used_setjump) {
5854         JMPENV_POP;
5855     }
5856
5857     DEBUG_PARSE_r({
5858         PerlIO_printf(Perl_debug_log,
5859             "Required size %"IVdf" nodes\n"
5860             "Starting second pass (creation)\n",
5861             (IV)RExC_size);
5862         RExC_lastnum=0;
5863         RExC_lastparse=NULL;
5864     });
5865
5866     /* The first pass could have found things that force Unicode semantics */
5867     if ((RExC_utf8 || RExC_uni_semantics)
5868          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5869     {
5870         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5871     }
5872
5873     /* Small enough for pointer-storage convention?
5874        If extralen==0, this means that we will not need long jumps. */
5875     if (RExC_size >= 0x10000L && RExC_extralen)
5876         RExC_size += RExC_extralen;
5877     else
5878         RExC_extralen = 0;
5879     if (RExC_whilem_seen > 15)
5880         RExC_whilem_seen = 15;
5881
5882     /* Allocate space and zero-initialize. Note, the two step process
5883        of zeroing when in debug mode, thus anything assigned has to
5884        happen after that */
5885     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5886     r = ReANY(rx);
5887     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5888          char, regexp_internal);
5889     if ( r == NULL || ri == NULL )
5890         FAIL("Regexp out of space");
5891 #ifdef DEBUGGING
5892     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5893     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5894 #else
5895     /* bulk initialize base fields with 0. */
5896     Zero(ri, sizeof(regexp_internal), char);
5897 #endif
5898
5899     /* non-zero initialization begins here */
5900     RXi_SET( r, ri );
5901     r->engine= eng;
5902     r->extflags = rx_flags;
5903     if (pm_flags & PMf_IS_QR) {
5904         ri->code_blocks = pRExC_state->code_blocks;
5905         ri->num_code_blocks = pRExC_state->num_code_blocks;
5906     }
5907     else
5908     {
5909         int n;
5910         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5911             if (pRExC_state->code_blocks[n].src_regex)
5912                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5913         SAVEFREEPV(pRExC_state->code_blocks);
5914     }
5915
5916     {
5917         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5918         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5919
5920         /* The caret is output if there are any defaults: if not all the STD
5921          * flags are set, or if no character set specifier is needed */
5922         bool has_default =
5923                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5924                     || ! has_charset);
5925         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5926         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5927                             >> RXf_PMf_STD_PMMOD_SHIFT);
5928         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5929         char *p;
5930         /* Allocate for the worst case, which is all the std flags are turned
5931          * on.  If more precision is desired, we could do a population count of
5932          * the flags set.  This could be done with a small lookup table, or by
5933          * shifting, masking and adding, or even, when available, assembly
5934          * language for a machine-language population count.
5935          * We never output a minus, as all those are defaults, so are
5936          * covered by the caret */
5937         const STRLEN wraplen = plen + has_p + has_runon
5938             + has_default       /* If needs a caret */
5939
5940                 /* If needs a character set specifier */
5941             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5942             + (sizeof(STD_PAT_MODS) - 1)
5943             + (sizeof("(?:)") - 1);
5944
5945         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5946         r->xpv_len_u.xpvlenu_pv = p;
5947         if (RExC_utf8)
5948             SvFLAGS(rx) |= SVf_UTF8;
5949         *p++='('; *p++='?';
5950
5951         /* If a default, cover it using the caret */
5952         if (has_default) {
5953             *p++= DEFAULT_PAT_MOD;
5954         }
5955         if (has_charset) {
5956             STRLEN len;
5957             const char* const name = get_regex_charset_name(r->extflags, &len);
5958             Copy(name, p, len, char);
5959             p += len;
5960         }
5961         if (has_p)
5962             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5963         {
5964             char ch;
5965             while((ch = *fptr++)) {
5966                 if(reganch & 1)
5967                     *p++ = ch;
5968                 reganch >>= 1;
5969             }
5970         }
5971
5972         *p++ = ':';
5973         Copy(RExC_precomp, p, plen, char);
5974         assert ((RX_WRAPPED(rx) - p) < 16);
5975         r->pre_prefix = p - RX_WRAPPED(rx);
5976         p += plen;
5977         if (has_runon)
5978             *p++ = '\n';
5979         *p++ = ')';
5980         *p = 0;
5981         SvCUR_set(rx, p - RX_WRAPPED(rx));
5982     }
5983
5984     r->intflags = 0;
5985     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5986
5987     if (RExC_seen & REG_SEEN_RECURSE) {
5988         Newxz(RExC_open_parens, RExC_npar,regnode *);
5989         SAVEFREEPV(RExC_open_parens);
5990         Newxz(RExC_close_parens,RExC_npar,regnode *);
5991         SAVEFREEPV(RExC_close_parens);
5992     }
5993
5994     /* Useful during FAIL. */
5995 #ifdef RE_TRACK_PATTERN_OFFSETS
5996     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5997     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5998                           "%s %"UVuf" bytes for offset annotations.\n",
5999                           ri->u.offsets ? "Got" : "Couldn't get",
6000                           (UV)((2*RExC_size+1) * sizeof(U32))));
6001 #endif
6002     SetProgLen(ri,RExC_size);
6003     RExC_rx_sv = rx;
6004     RExC_rx = r;
6005     RExC_rxi = ri;
6006
6007     /* Second pass: emit code. */
6008     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6009     RExC_pm_flags = pm_flags;
6010     RExC_parse = exp;
6011     RExC_end = xend;
6012     RExC_naughty = 0;
6013     RExC_npar = 1;
6014     RExC_emit_start = ri->program;
6015     RExC_emit = ri->program;
6016     RExC_emit_bound = ri->program + RExC_size + 1;
6017     pRExC_state->code_index = 0;
6018
6019     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6020     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6021         ReREFCNT_dec(rx);
6022         return(NULL);
6023     }
6024     /* XXXX To minimize changes to RE engine we always allocate
6025        3-units-long substrs field. */
6026     Newx(r->substrs, 1, struct reg_substr_data);
6027     if (RExC_recurse_count) {
6028         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6029         SAVEFREEPV(RExC_recurse);
6030     }
6031
6032 reStudy:
6033     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6034     Zero(r->substrs, 1, struct reg_substr_data);
6035
6036 #ifdef TRIE_STUDY_OPT
6037     if (!restudied) {
6038         StructCopy(&zero_scan_data, &data, scan_data_t);
6039         copyRExC_state = RExC_state;
6040     } else {
6041         U32 seen=RExC_seen;
6042         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6043
6044         RExC_state = copyRExC_state;
6045         if (seen & REG_TOP_LEVEL_BRANCHES)
6046             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6047         else
6048             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6049         StructCopy(&zero_scan_data, &data, scan_data_t);
6050     }
6051 #else
6052     StructCopy(&zero_scan_data, &data, scan_data_t);
6053 #endif
6054
6055     /* Dig out information for optimizations. */
6056     r->extflags = RExC_flags; /* was pm_op */
6057     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6058
6059     if (UTF)
6060         SvUTF8_on(rx);  /* Unicode in it? */
6061     ri->regstclass = NULL;
6062     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6063         r->intflags |= PREGf_NAUGHTY;
6064     scan = ri->program + 1;             /* First BRANCH. */
6065
6066     /* testing for BRANCH here tells us whether there is "must appear"
6067        data in the pattern. If there is then we can use it for optimisations */
6068     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6069         I32 fake;
6070         STRLEN longest_float_length, longest_fixed_length;
6071         struct regnode_charclass_class ch_class; /* pointed to by data */
6072         int stclass_flag;
6073         I32 last_close = 0; /* pointed to by data */
6074         regnode *first= scan;
6075         regnode *first_next= regnext(first);
6076         /*
6077          * Skip introductions and multiplicators >= 1
6078          * so that we can extract the 'meat' of the pattern that must
6079          * match in the large if() sequence following.
6080          * NOTE that EXACT is NOT covered here, as it is normally
6081          * picked up by the optimiser separately.
6082          *
6083          * This is unfortunate as the optimiser isnt handling lookahead
6084          * properly currently.
6085          *
6086          */
6087         while ((OP(first) == OPEN && (sawopen = 1)) ||
6088                /* An OR of *one* alternative - should not happen now. */
6089             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6090             /* for now we can't handle lookbehind IFMATCH*/
6091             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6092             (OP(first) == PLUS) ||
6093             (OP(first) == MINMOD) ||
6094                /* An {n,m} with n>0 */
6095             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6096             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6097         {
6098                 /*
6099                  * the only op that could be a regnode is PLUS, all the rest
6100                  * will be regnode_1 or regnode_2.
6101                  *
6102                  */
6103                 if (OP(first) == PLUS)
6104                     sawplus = 1;
6105                 else
6106                     first += regarglen[OP(first)];
6107
6108                 first = NEXTOPER(first);
6109                 first_next= regnext(first);
6110         }
6111
6112         /* Starting-point info. */
6113       again:
6114         DEBUG_PEEP("first:",first,0);
6115         /* Ignore EXACT as we deal with it later. */
6116         if (PL_regkind[OP(first)] == EXACT) {
6117             if (OP(first) == EXACT)
6118                 NOOP;   /* Empty, get anchored substr later. */
6119             else
6120                 ri->regstclass = first;
6121         }
6122 #ifdef TRIE_STCLASS
6123         else if (PL_regkind[OP(first)] == TRIE &&
6124                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6125         {
6126             regnode *trie_op;
6127             /* this can happen only on restudy */
6128             if ( OP(first) == TRIE ) {
6129                 struct regnode_1 *trieop = (struct regnode_1 *)
6130                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6131                 StructCopy(first,trieop,struct regnode_1);
6132                 trie_op=(regnode *)trieop;
6133             } else {
6134                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6135                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6136                 StructCopy(first,trieop,struct regnode_charclass);
6137                 trie_op=(regnode *)trieop;
6138             }
6139             OP(trie_op)+=2;
6140             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6141             ri->regstclass = trie_op;
6142         }
6143 #endif
6144         else if (REGNODE_SIMPLE(OP(first)))
6145             ri->regstclass = first;
6146         else if (PL_regkind[OP(first)] == BOUND ||
6147                  PL_regkind[OP(first)] == NBOUND)
6148             ri->regstclass = first;
6149         else if (PL_regkind[OP(first)] == BOL) {
6150             r->extflags |= (OP(first) == MBOL
6151                            ? RXf_ANCH_MBOL
6152                            : (OP(first) == SBOL
6153                               ? RXf_ANCH_SBOL
6154                               : RXf_ANCH_BOL));
6155             first = NEXTOPER(first);
6156             goto again;
6157         }
6158         else if (OP(first) == GPOS) {
6159             r->extflags |= RXf_ANCH_GPOS;
6160             first = NEXTOPER(first);
6161             goto again;
6162         }
6163         else if ((!sawopen || !RExC_sawback) &&
6164             (OP(first) == STAR &&
6165             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6166             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6167         {
6168             /* turn .* into ^.* with an implied $*=1 */
6169             const int type =
6170                 (OP(NEXTOPER(first)) == REG_ANY)
6171                     ? RXf_ANCH_MBOL
6172                     : RXf_ANCH_SBOL;
6173             r->extflags |= type;
6174             r->intflags |= PREGf_IMPLICIT;
6175             first = NEXTOPER(first);
6176             goto again;
6177         }
6178         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6179             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6180             /* x+ must match at the 1st pos of run of x's */
6181             r->intflags |= PREGf_SKIP;
6182
6183         /* Scan is after the zeroth branch, first is atomic matcher. */
6184 #ifdef TRIE_STUDY_OPT
6185         DEBUG_PARSE_r(
6186             if (!restudied)
6187                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6188                               (IV)(first - scan + 1))
6189         );
6190 #else
6191         DEBUG_PARSE_r(
6192             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6193                 (IV)(first - scan + 1))
6194         );
6195 #endif
6196
6197
6198         /*
6199         * If there's something expensive in the r.e., find the
6200         * longest literal string that must appear and make it the
6201         * regmust.  Resolve ties in favor of later strings, since
6202         * the regstart check works with the beginning of the r.e.
6203         * and avoiding duplication strengthens checking.  Not a
6204         * strong reason, but sufficient in the absence of others.
6205         * [Now we resolve ties in favor of the earlier string if
6206         * it happens that c_offset_min has been invalidated, since the
6207         * earlier string may buy us something the later one won't.]
6208         */
6209
6210         data.longest_fixed = newSVpvs("");
6211         data.longest_float = newSVpvs("");
6212         data.last_found = newSVpvs("");
6213         data.longest = &(data.longest_fixed);
6214         ENTER_with_name("study_chunk");
6215         SAVEFREESV(data.longest_fixed);
6216         SAVEFREESV(data.longest_float);
6217         SAVEFREESV(data.last_found);
6218         first = scan;
6219         if (!ri->regstclass) {
6220             cl_init(pRExC_state, &ch_class);
6221             data.start_class = &ch_class;
6222             stclass_flag = SCF_DO_STCLASS_AND;
6223         } else                          /* XXXX Check for BOUND? */
6224             stclass_flag = 0;
6225         data.last_closep = &last_close;
6226
6227         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6228             &data, -1, NULL, NULL,
6229             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6230
6231
6232         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6233
6234
6235         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6236              && data.last_start_min == 0 && data.last_end > 0
6237              && !RExC_seen_zerolen
6238              && !(RExC_seen & REG_SEEN_VERBARG)
6239              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6240             r->extflags |= RXf_CHECK_ALL;
6241         scan_commit(pRExC_state, &data,&minlen,0);
6242
6243         longest_float_length = CHR_SVLEN(data.longest_float);
6244
6245         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6246                    && data.offset_fixed == data.offset_float_min
6247                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6248             && S_setup_longest (aTHX_ pRExC_state,
6249                                     data.longest_float,
6250                                     &(r->float_utf8),
6251                                     &(r->float_substr),
6252                                     &(r->float_end_shift),
6253                                     data.lookbehind_float,
6254                                     data.offset_float_min,
6255                                     data.minlen_float,
6256                                     longest_float_length,
6257                                     data.flags & SF_FL_BEFORE_EOL,
6258                                     data.flags & SF_FL_BEFORE_MEOL))
6259         {
6260             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6261             r->float_max_offset = data.offset_float_max;
6262             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6263                 r->float_max_offset -= data.lookbehind_float;
6264             SvREFCNT_inc_simple_void_NN(data.longest_float);
6265         }
6266         else {
6267             r->float_substr = r->float_utf8 = NULL;
6268             longest_float_length = 0;
6269         }
6270
6271         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6272
6273         if (S_setup_longest (aTHX_ pRExC_state,
6274                                 data.longest_fixed,
6275                                 &(r->anchored_utf8),
6276                                 &(r->anchored_substr),
6277                                 &(r->anchored_end_shift),
6278                                 data.lookbehind_fixed,
6279                                 data.offset_fixed,
6280                                 data.minlen_fixed,
6281                                 longest_fixed_length,
6282                                 data.flags & SF_FIX_BEFORE_EOL,
6283                                 data.flags & SF_FIX_BEFORE_MEOL))
6284         {
6285             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6286             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6287         }
6288         else {
6289             r->anchored_substr = r->anchored_utf8 = NULL;
6290             longest_fixed_length = 0;
6291         }
6292         LEAVE_with_name("study_chunk");
6293
6294         if (ri->regstclass
6295             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6296             ri->regstclass = NULL;
6297
6298         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6299             && stclass_flag
6300             && !(data.start_class->flags & ANYOF_EOS)
6301             && !cl_is_anything(data.start_class))
6302         {
6303             const U32 n = add_data(pRExC_state, 1, "f");
6304             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6305
6306             Newx(RExC_rxi->data->data[n], 1,
6307                 struct regnode_charclass_class);
6308             StructCopy(data.start_class,
6309                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6310                        struct regnode_charclass_class);
6311             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6312             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6313             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6314                       regprop(r, sv, (regnode*)data.start_class);
6315                       PerlIO_printf(Perl_debug_log,
6316                                     "synthetic stclass \"%s\".\n",
6317                                     SvPVX_const(sv));});
6318         }
6319
6320         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6321         if (longest_fixed_length > longest_float_length) {
6322             r->check_end_shift = r->anchored_end_shift;
6323             r->check_substr = r->anchored_substr;
6324             r->check_utf8 = r->anchored_utf8;
6325             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6326             if (r->extflags & RXf_ANCH_SINGLE)
6327                 r->extflags |= RXf_NOSCAN;
6328         }
6329         else {
6330             r->check_end_shift = r->float_end_shift;
6331             r->check_substr = r->float_substr;
6332             r->check_utf8 = r->float_utf8;
6333             r->check_offset_min = r->float_min_offset;
6334             r->check_offset_max = r->float_max_offset;
6335         }
6336         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6337            This should be changed ASAP!  */
6338         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6339             r->extflags |= RXf_USE_INTUIT;
6340             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6341                 r->extflags |= RXf_INTUIT_TAIL;
6342         }
6343         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6344         if ( (STRLEN)minlen < longest_float_length )
6345             minlen= longest_float_length;
6346         if ( (STRLEN)minlen < longest_fixed_length )
6347             minlen= longest_fixed_length;
6348         */
6349     }
6350     else {
6351         /* Several toplevels. Best we can is to set minlen. */
6352         I32 fake;
6353         struct regnode_charclass_class ch_class;
6354         I32 last_close = 0;
6355
6356         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6357
6358         scan = ri->program + 1;
6359         cl_init(pRExC_state, &ch_class);
6360         data.start_class = &ch_class;
6361         data.last_closep = &last_close;
6362
6363
6364         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6365             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6366
6367         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6368
6369         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6370                 = r->float_substr = r->float_utf8 = NULL;
6371
6372         if (!(data.start_class->flags & ANYOF_EOS)
6373             && !cl_is_anything(data.start_class))
6374         {
6375             const U32 n = add_data(pRExC_state, 1, "f");
6376             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6377
6378             Newx(RExC_rxi->data->data[n], 1,
6379                 struct regnode_charclass_class);
6380             StructCopy(data.start_class,
6381                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6382                        struct regnode_charclass_class);
6383             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6384             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6385             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6386                       regprop(r, sv, (regnode*)data.start_class);
6387                       PerlIO_printf(Perl_debug_log,
6388                                     "synthetic stclass \"%s\".\n",
6389                                     SvPVX_const(sv));});
6390         }
6391     }
6392
6393     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6394        the "real" pattern. */
6395     DEBUG_OPTIMISE_r({
6396         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6397                       (IV)minlen, (IV)r->minlen);
6398     });
6399     r->minlenret = minlen;
6400     if (r->minlen < minlen)
6401         r->minlen = minlen;
6402
6403     if (RExC_seen & REG_SEEN_GPOS)
6404         r->extflags |= RXf_GPOS_SEEN;
6405     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6406         r->extflags |= RXf_LOOKBEHIND_SEEN;
6407     if (pRExC_state->num_code_blocks)
6408         r->extflags |= RXf_EVAL_SEEN;
6409     if (RExC_seen & REG_SEEN_CANY)
6410         r->extflags |= RXf_CANY_SEEN;
6411     if (RExC_seen & REG_SEEN_VERBARG)
6412     {
6413         r->intflags |= PREGf_VERBARG_SEEN;
6414         r->extflags |= RXf_MODIFIES_VARS;
6415     }
6416     if (RExC_seen & REG_SEEN_CUTGROUP)
6417         r->intflags |= PREGf_CUTGROUP_SEEN;
6418     if (pm_flags & PMf_USE_RE_EVAL)
6419         r->intflags |= PREGf_USE_RE_EVAL;
6420     if (RExC_paren_names)
6421         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6422     else
6423         RXp_PAREN_NAMES(r) = NULL;
6424
6425 #ifdef STUPID_PATTERN_CHECKS
6426     if (RX_PRELEN(rx) == 0)
6427         r->extflags |= RXf_NULL;
6428     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6429         r->extflags |= RXf_WHITE;
6430     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6431         r->extflags |= RXf_START_ONLY;
6432 #else
6433     {
6434         regnode *first = ri->program + 1;
6435         U8 fop = OP(first);
6436
6437         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6438             r->extflags |= RXf_NULL;
6439         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6440             r->extflags |= RXf_START_ONLY;
6441         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6442                              && OP(regnext(first)) == END)
6443             r->extflags |= RXf_WHITE;
6444     }
6445 #endif
6446 #ifdef DEBUGGING
6447     if (RExC_paren_names) {
6448         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6449         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6450     } else
6451 #endif
6452         ri->name_list_idx = 0;
6453
6454     if (RExC_recurse_count) {
6455         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6456             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6457             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6458         }
6459     }
6460     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6461     /* assume we don't need to swap parens around before we match */
6462
6463     DEBUG_DUMP_r({
6464         PerlIO_printf(Perl_debug_log,"Final program:\n");
6465         regdump(r);
6466     });
6467 #ifdef RE_TRACK_PATTERN_OFFSETS
6468     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6469         const U32 len = ri->u.offsets[0];
6470         U32 i;
6471         GET_RE_DEBUG_FLAGS_DECL;
6472         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6473         for (i = 1; i <= len; i++) {
6474             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6475                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6476                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6477             }
6478         PerlIO_printf(Perl_debug_log, "\n");
6479     });
6480 #endif
6481     return rx;
6482 }
6483
6484
6485 SV*
6486 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6487                     const U32 flags)
6488 {
6489     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6490
6491     PERL_UNUSED_ARG(value);
6492
6493     if (flags & RXapif_FETCH) {
6494         return reg_named_buff_fetch(rx, key, flags);
6495     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6496         Perl_croak_no_modify();
6497         return NULL;
6498     } else if (flags & RXapif_EXISTS) {
6499         return reg_named_buff_exists(rx, key, flags)
6500             ? &PL_sv_yes
6501             : &PL_sv_no;
6502     } else if (flags & RXapif_REGNAMES) {
6503         return reg_named_buff_all(rx, flags);
6504     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6505         return reg_named_buff_scalar(rx, flags);
6506     } else {
6507         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6508         return NULL;
6509     }
6510 }
6511
6512 SV*
6513 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6514                          const U32 flags)
6515 {
6516     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6517     PERL_UNUSED_ARG(lastkey);
6518
6519     if (flags & RXapif_FIRSTKEY)
6520         return reg_named_buff_firstkey(rx, flags);
6521     else if (flags & RXapif_NEXTKEY)
6522         return reg_named_buff_nextkey(rx, flags);
6523     else {
6524         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6525         return NULL;
6526     }
6527 }
6528
6529 SV*
6530 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6531                           const U32 flags)
6532 {
6533     AV *retarray = NULL;
6534     SV *ret;
6535     struct regexp *const rx = ReANY(r);
6536
6537     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6538
6539     if (flags & RXapif_ALL)
6540         retarray=newAV();
6541
6542     if (rx && RXp_PAREN_NAMES(rx)) {
6543         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6544         if (he_str) {
6545             IV i;
6546             SV* sv_dat=HeVAL(he_str);
6547             I32 *nums=(I32*)SvPVX(sv_dat);
6548             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6549                 if ((I32)(rx->nparens) >= nums[i]
6550                     && rx->offs[nums[i]].start != -1
6551                     && rx->offs[nums[i]].end != -1)
6552                 {
6553                     ret = newSVpvs("");
6554                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6555                     if (!retarray)
6556                         return ret;
6557                 } else {
6558                     if (retarray)
6559                         ret = newSVsv(&PL_sv_undef);
6560                 }
6561                 if (retarray)
6562                     av_push(retarray, ret);
6563             }
6564             if (retarray)
6565                 return newRV_noinc(MUTABLE_SV(retarray));
6566         }
6567     }
6568     return NULL;
6569 }
6570
6571 bool
6572 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6573                            const U32 flags)
6574 {
6575     struct regexp *const rx = ReANY(r);
6576
6577     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6578
6579     if (rx && RXp_PAREN_NAMES(rx)) {
6580         if (flags & RXapif_ALL) {
6581             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6582         } else {
6583             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6584             if (sv) {
6585                 SvREFCNT_dec(sv);
6586                 return TRUE;
6587             } else {
6588                 return FALSE;
6589             }
6590         }
6591     } else {
6592         return FALSE;
6593     }
6594 }
6595
6596 SV*
6597 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6598 {
6599     struct regexp *const rx = ReANY(r);
6600
6601     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6602
6603     if ( rx && RXp_PAREN_NAMES(rx) ) {
6604         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6605
6606         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6607     } else {
6608         return FALSE;
6609     }
6610 }
6611
6612 SV*
6613 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6614 {
6615     struct regexp *const rx = ReANY(r);
6616     GET_RE_DEBUG_FLAGS_DECL;
6617
6618     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6619
6620     if (rx && RXp_PAREN_NAMES(rx)) {
6621         HV *hv = RXp_PAREN_NAMES(rx);
6622         HE *temphe;
6623         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6624             IV i;
6625             IV parno = 0;
6626             SV* sv_dat = HeVAL(temphe);
6627             I32 *nums = (I32*)SvPVX(sv_dat);
6628             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6629                 if ((I32)(rx->lastparen) >= nums[i] &&
6630                     rx->offs[nums[i]].start != -1 &&
6631                     rx->offs[nums[i]].end != -1)
6632                 {
6633                     parno = nums[i];
6634                     break;
6635                 }
6636             }
6637             if (parno || flags & RXapif_ALL) {
6638                 return newSVhek(HeKEY_hek(temphe));
6639             }
6640         }
6641     }
6642     return NULL;
6643 }
6644
6645 SV*
6646 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6647 {
6648     SV *ret;
6649     AV *av;
6650     I32 length;
6651     struct regexp *const rx = ReANY(r);
6652
6653     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6654
6655     if (rx && RXp_PAREN_NAMES(rx)) {
6656         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6657             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6658         } else if (flags & RXapif_ONE) {
6659             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6660             av = MUTABLE_AV(SvRV(ret));
6661             length = av_len(av);
6662             SvREFCNT_dec(ret);
6663             return newSViv(length + 1);
6664         } else {
6665             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6666             return NULL;
6667         }
6668     }
6669     return &PL_sv_undef;
6670 }
6671
6672 SV*
6673 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6674 {
6675     struct regexp *const rx = ReANY(r);
6676     AV *av = newAV();
6677
6678     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6679
6680     if (rx && RXp_PAREN_NAMES(rx)) {
6681         HV *hv= RXp_PAREN_NAMES(rx);
6682         HE *temphe;
6683         (void)hv_iterinit(hv);
6684         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6685             IV i;
6686             IV parno = 0;
6687             SV* sv_dat = HeVAL(temphe);
6688             I32 *nums = (I32*)SvPVX(sv_dat);
6689             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6690                 if ((I32)(rx->lastparen) >= nums[i] &&
6691                     rx->offs[nums[i]].start != -1 &&
6692                     rx->offs[nums[i]].end != -1)
6693                 {
6694                     parno = nums[i];
6695                     break;
6696                 }
6697             }
6698             if (parno || flags & RXapif_ALL) {
6699                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6700             }
6701         }
6702     }
6703
6704     return newRV_noinc(MUTABLE_SV(av));
6705 }
6706
6707 void
6708 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6709                              SV * const sv)
6710 {
6711     struct regexp *const rx = ReANY(r);
6712     char *s = NULL;
6713     I32 i = 0;
6714     I32 s1, t1;
6715     I32 n = paren;
6716
6717     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6718
6719     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6720            || n == RX_BUFF_IDX_CARET_FULLMATCH
6721            || n == RX_BUFF_IDX_CARET_POSTMATCH
6722          )
6723          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6724     )
6725         goto ret_undef;
6726
6727     if (!rx->subbeg)
6728         goto ret_undef;
6729
6730     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6731         /* no need to distinguish between them any more */
6732         n = RX_BUFF_IDX_FULLMATCH;
6733
6734     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6735         && rx->offs[0].start != -1)
6736     {
6737         /* $`, ${^PREMATCH} */
6738         i = rx->offs[0].start;
6739         s = rx->subbeg;
6740     }
6741     else
6742     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6743         && rx->offs[0].end != -1)
6744     {
6745         /* $', ${^POSTMATCH} */
6746         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6747         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6748     }
6749     else
6750     if ( 0 <= n && n <= (I32)rx->nparens &&
6751         (s1 = rx->offs[n].start) != -1 &&
6752         (t1 = rx->offs[n].end) != -1)
6753     {
6754         /* $&, ${^MATCH},  $1 ... */
6755         i = t1 - s1;
6756         s = rx->subbeg + s1 - rx->suboffset;
6757     } else {
6758         goto ret_undef;
6759     }
6760
6761     assert(s >= rx->subbeg);
6762     assert(rx->sublen >= (s - rx->subbeg) + i );
6763     if (i >= 0) {
6764 #if NO_TAINT_SUPPORT
6765         sv_setpvn(sv, s, i);
6766 #else
6767         const int oldtainted = TAINT_get;
6768         TAINT_NOT;
6769         sv_setpvn(sv, s, i);
6770         TAINT_set(oldtainted);
6771 #endif
6772         if ( (rx->extflags & RXf_CANY_SEEN)
6773             ? (RXp_MATCH_UTF8(rx)
6774                         && (!i || is_utf8_string((U8*)s, i)))
6775             : (RXp_MATCH_UTF8(rx)) )
6776         {
6777             SvUTF8_on(sv);
6778         }
6779         else
6780             SvUTF8_off(sv);
6781         if (TAINTING_get) {
6782             if (RXp_MATCH_TAINTED(rx)) {
6783                 if (SvTYPE(sv) >= SVt_PVMG) {
6784                     MAGIC* const mg = SvMAGIC(sv);
6785                     MAGIC* mgt;
6786                     TAINT;
6787                     SvMAGIC_set(sv, mg->mg_moremagic);
6788                     SvTAINT(sv);
6789                     if ((mgt = SvMAGIC(sv))) {
6790                         mg->mg_moremagic = mgt;
6791                         SvMAGIC_set(sv, mg);
6792                     }
6793                 } else {
6794                     TAINT;
6795                     SvTAINT(sv);
6796                 }
6797             } else
6798                 SvTAINTED_off(sv);
6799         }
6800     } else {
6801       ret_undef:
6802         sv_setsv(sv,&PL_sv_undef);
6803         return;
6804     }
6805 }
6806
6807 void
6808 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6809                                                          SV const * const value)
6810 {
6811     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6812
6813     PERL_UNUSED_ARG(rx);
6814     PERL_UNUSED_ARG(paren);
6815     PERL_UNUSED_ARG(value);
6816
6817     if (!PL_localizing)
6818         Perl_croak_no_modify();
6819 }
6820
6821 I32
6822 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6823                               const I32 paren)
6824 {
6825     struct regexp *const rx = ReANY(r);
6826     I32 i;
6827     I32 s1, t1;
6828
6829     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6830
6831     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6832     switch (paren) {
6833       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6834          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6835             goto warn_undef;
6836         /*FALLTHROUGH*/
6837
6838       case RX_BUFF_IDX_PREMATCH:       /* $` */
6839         if (rx->offs[0].start != -1) {
6840                         i = rx->offs[0].start;
6841                         if (i > 0) {
6842                                 s1 = 0;
6843                                 t1 = i;
6844                                 goto getlen;
6845                         }
6846             }
6847         return 0;
6848
6849       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6850          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6851             goto warn_undef;
6852       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6853             if (rx->offs[0].end != -1) {
6854                         i = rx->sublen - rx->offs[0].end;
6855                         if (i > 0) {
6856                                 s1 = rx->offs[0].end;
6857                                 t1 = rx->sublen;
6858                                 goto getlen;
6859                         }
6860             }
6861         return 0;
6862
6863       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6864          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6865             goto warn_undef;
6866         /*FALLTHROUGH*/
6867
6868       /* $& / ${^MATCH}, $1, $2, ... */
6869       default:
6870             if (paren <= (I32)rx->nparens &&
6871             (s1 = rx->offs[paren].start) != -1 &&
6872             (t1 = rx->offs[paren].end) != -1)
6873             {
6874             i = t1 - s1;
6875             goto getlen;
6876         } else {
6877           warn_undef:
6878             if (ckWARN(WARN_UNINITIALIZED))
6879                 report_uninit((const SV *)sv);
6880             return 0;
6881         }
6882     }
6883   getlen:
6884     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6885         const char * const s = rx->subbeg - rx->suboffset + s1;
6886         const U8 *ep;
6887         STRLEN el;
6888
6889         i = t1 - s1;
6890         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6891                         i = el;
6892     }
6893     return i;
6894 }
6895
6896 SV*
6897 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6898 {
6899     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6900         PERL_UNUSED_ARG(rx);
6901         if (0)
6902             return NULL;
6903         else
6904             return newSVpvs("Regexp");
6905 }
6906
6907 /* Scans the name of a named buffer from the pattern.
6908  * If flags is REG_RSN_RETURN_NULL returns null.
6909  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6910  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6911  * to the parsed name as looked up in the RExC_paren_names hash.
6912  * If there is an error throws a vFAIL().. type exception.
6913  */
6914
6915 #define REG_RSN_RETURN_NULL    0
6916 #define REG_RSN_RETURN_NAME    1
6917 #define REG_RSN_RETURN_DATA    2
6918
6919 STATIC SV*
6920 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6921 {
6922     char *name_start = RExC_parse;
6923
6924     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6925
6926     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6927          /* skip IDFIRST by using do...while */
6928         if (UTF)
6929             do {
6930                 RExC_parse += UTF8SKIP(RExC_parse);
6931             } while (isALNUM_utf8((U8*)RExC_parse));
6932         else
6933             do {
6934                 RExC_parse++;
6935             } while (isALNUM(*RExC_parse));
6936     } else {
6937         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6938         vFAIL("Group name must start with a non-digit word character");
6939     }
6940     if ( flags ) {
6941         SV* sv_name
6942             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6943                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6944         if ( flags == REG_RSN_RETURN_NAME)
6945             return sv_name;
6946         else if (flags==REG_RSN_RETURN_DATA) {
6947             HE *he_str = NULL;
6948             SV *sv_dat = NULL;
6949             if ( ! sv_name )      /* should not happen*/
6950                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6951             if (RExC_paren_names)
6952                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6953             if ( he_str )
6954                 sv_dat = HeVAL(he_str);
6955             if ( ! sv_dat )
6956                 vFAIL("Reference to nonexistent named group");
6957             return sv_dat;
6958         }
6959         else {
6960             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6961                        (unsigned long) flags);
6962         }
6963         assert(0); /* NOT REACHED */
6964     }
6965     return NULL;
6966 }
6967
6968 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6969     int rem=(int)(RExC_end - RExC_parse);                       \
6970     int cut;                                                    \
6971     int num;                                                    \
6972     int iscut=0;                                                \
6973     if (rem>10) {                                               \
6974         rem=10;                                                 \
6975         iscut=1;                                                \
6976     }                                                           \
6977     cut=10-rem;                                                 \
6978     if (RExC_lastparse!=RExC_parse)                             \
6979         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6980             rem, RExC_parse,                                    \
6981             cut + 4,                                            \
6982             iscut ? "..." : "<"                                 \
6983         );                                                      \
6984     else                                                        \
6985         PerlIO_printf(Perl_debug_log,"%16s","");                \
6986                                                                 \
6987     if (SIZE_ONLY)                                              \
6988        num = RExC_size + 1;                                     \
6989     else                                                        \
6990        num=REG_NODE_NUM(RExC_emit);                             \
6991     if (RExC_lastnum!=num)                                      \
6992        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6993     else                                                        \
6994        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6995     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6996         (int)((depth*2)), "",                                   \
6997         (funcname)                                              \
6998     );                                                          \
6999     RExC_lastnum=num;                                           \
7000     RExC_lastparse=RExC_parse;                                  \
7001 })
7002
7003
7004
7005 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7006     DEBUG_PARSE_MSG((funcname));                            \
7007     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7008 })
7009 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7010     DEBUG_PARSE_MSG((funcname));                            \
7011     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7012 })
7013
7014 /* This section of code defines the inversion list object and its methods.  The
7015  * interfaces are highly subject to change, so as much as possible is static to
7016  * this file.  An inversion list is here implemented as a malloc'd C UV array
7017  * with some added info that is placed as UVs at the beginning in a header
7018  * portion.  An inversion list for Unicode is an array of code points, sorted
7019  * by ordinal number.  The zeroth element is the first code point in the list.
7020  * The 1th element is the first element beyond that not in the list.  In other
7021  * words, the first range is
7022  *  invlist[0]..(invlist[1]-1)
7023  * The other ranges follow.  Thus every element whose index is divisible by two
7024  * marks the beginning of a range that is in the list, and every element not
7025  * divisible by two marks the beginning of a range not in the list.  A single
7026  * element inversion list that contains the single code point N generally
7027  * consists of two elements
7028  *  invlist[0] == N
7029  *  invlist[1] == N+1
7030  * (The exception is when N is the highest representable value on the
7031  * machine, in which case the list containing just it would be a single
7032  * element, itself.  By extension, if the last range in the list extends to
7033  * infinity, then the first element of that range will be in the inversion list
7034  * at a position that is divisible by two, and is the final element in the
7035  * list.)
7036  * Taking the complement (inverting) an inversion list is quite simple, if the
7037  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7038  * This implementation reserves an element at the beginning of each inversion
7039  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
7040  * actual beginning of the list is either that element if 0, or the next one if
7041  * 1.
7042  *
7043  * More about inversion lists can be found in "Unicode Demystified"
7044  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7045  * More will be coming when functionality is added later.
7046  *
7047  * The inversion list data structure is currently implemented as an SV pointing
7048  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7049  * array of UV whose memory management is automatically handled by the existing
7050  * facilities for SV's.
7051  *
7052  * Some of the methods should always be private to the implementation, and some
7053  * should eventually be made public */
7054
7055 /* The header definitions are in F<inline_invlist.c> */
7056
7057 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7058 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7059
7060 #define INVLIST_INITIAL_LEN 10
7061
7062 PERL_STATIC_INLINE UV*
7063 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7064 {
7065     /* Returns a pointer to the first element in the inversion list's array.
7066      * This is called upon initialization of an inversion list.  Where the
7067      * array begins depends on whether the list has the code point U+0000
7068      * in it or not.  The other parameter tells it whether the code that
7069      * follows this call is about to put a 0 in the inversion list or not.
7070      * The first element is either the element with 0, if 0, or the next one,
7071      * if 1 */
7072
7073     UV* zero = get_invlist_zero_addr(invlist);
7074
7075     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7076
7077     /* Must be empty */
7078     assert(! *_get_invlist_len_addr(invlist));
7079
7080     /* 1^1 = 0; 1^0 = 1 */
7081     *zero = 1 ^ will_have_0;
7082     return zero + *zero;
7083 }
7084
7085 PERL_STATIC_INLINE UV*
7086 S_invlist_array(pTHX_ SV* const invlist)
7087 {
7088     /* Returns the pointer to the inversion list's array.  Every time the
7089      * length changes, this needs to be called in case malloc or realloc moved
7090      * it */
7091
7092     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7093
7094     /* Must not be empty.  If these fail, you probably didn't check for <len>
7095      * being non-zero before trying to get the array */
7096     assert(*_get_invlist_len_addr(invlist));
7097     assert(*get_invlist_zero_addr(invlist) == 0
7098            || *get_invlist_zero_addr(invlist) == 1);
7099
7100     /* The array begins either at the element reserved for zero if the
7101      * list contains 0 (that element will be set to 0), or otherwise the next
7102      * element (in which case the reserved element will be set to 1). */
7103     return (UV *) (get_invlist_zero_addr(invlist)
7104                    + *get_invlist_zero_addr(invlist));
7105 }
7106
7107 PERL_STATIC_INLINE void
7108 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7109 {
7110     /* Sets the current number of elements stored in the inversion list */
7111
7112     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7113
7114     *_get_invlist_len_addr(invlist) = len;
7115
7116     assert(len <= SvLEN(invlist));
7117
7118     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7119     /* If the list contains U+0000, that element is part of the header,
7120      * and should not be counted as part of the array.  It will contain
7121      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7122      * subtract:
7123      *  SvCUR_set(invlist,
7124      *            TO_INTERNAL_SIZE(len
7125      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7126      * But, this is only valid if len is not 0.  The consequences of not doing
7127      * this is that the memory allocation code may think that 1 more UV is
7128      * being used than actually is, and so might do an unnecessary grow.  That
7129      * seems worth not bothering to make this the precise amount.
7130      *
7131      * Note that when inverting, SvCUR shouldn't change */
7132 }
7133
7134 PERL_STATIC_INLINE IV*
7135 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7136 {
7137     /* Return the address of the UV that is reserved to hold the cached index
7138      * */
7139
7140     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7141
7142     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7143 }
7144
7145 PERL_STATIC_INLINE IV
7146 S_invlist_previous_index(pTHX_ SV* const invlist)
7147 {
7148     /* Returns cached index of previous search */
7149
7150     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7151
7152     return *get_invlist_previous_index_addr(invlist);
7153 }
7154
7155 PERL_STATIC_INLINE void
7156 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7157 {
7158     /* Caches <index> for later retrieval */
7159
7160     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7161
7162     assert(index == 0 || index < (int) _invlist_len(invlist));
7163
7164     *get_invlist_previous_index_addr(invlist) = index;
7165 }
7166
7167 PERL_STATIC_INLINE UV
7168 S_invlist_max(pTHX_ SV* const invlist)
7169 {
7170     /* Returns the maximum number of elements storable in the inversion list's
7171      * array, without having to realloc() */
7172
7173     PERL_ARGS_ASSERT_INVLIST_MAX;
7174
7175     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7176 }
7177
7178 PERL_STATIC_INLINE UV*
7179 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7180 {
7181     /* Return the address of the UV that is reserved to hold 0 if the inversion
7182      * list contains 0.  This has to be the last element of the heading, as the
7183      * list proper starts with either it if 0, or the next element if not.
7184      * (But we force it to contain either 0 or 1) */
7185
7186     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7187
7188     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7189 }
7190
7191 #ifndef PERL_IN_XSUB_RE
7192 SV*
7193 Perl__new_invlist(pTHX_ IV initial_size)
7194 {
7195
7196     /* Return a pointer to a newly constructed inversion list, with enough
7197      * space to store 'initial_size' elements.  If that number is negative, a
7198      * system default is used instead */
7199
7200     SV* new_list;
7201
7202     if (initial_size < 0) {
7203         initial_size = INVLIST_INITIAL_LEN;
7204     }
7205
7206     /* Allocate the initial space */
7207     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7208     invlist_set_len(new_list, 0);
7209
7210     /* Force iterinit() to be used to get iteration to work */
7211     *get_invlist_iter_addr(new_list) = UV_MAX;
7212
7213     /* This should force a segfault if a method doesn't initialize this
7214      * properly */
7215     *get_invlist_zero_addr(new_list) = UV_MAX;
7216
7217     *get_invlist_previous_index_addr(new_list) = 0;
7218     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7219 #if HEADER_LENGTH != 5
7220 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7221 #endif
7222
7223     return new_list;
7224 }
7225 #endif
7226
7227 STATIC SV*
7228 S__new_invlist_C_array(pTHX_ UV* list)
7229 {
7230     /* Return a pointer to a newly constructed inversion list, initialized to
7231      * point to <list>, which has to be in the exact correct inversion list
7232      * form, including internal fields.  Thus this is a dangerous routine that
7233      * should not be used in the wrong hands */
7234
7235     SV* invlist = newSV_type(SVt_PV);
7236
7237     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7238
7239     SvPV_set(invlist, (char *) list);
7240     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7241                                shouldn't touch it */
7242     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7243
7244     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7245         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7246     }
7247
7248     return invlist;
7249 }
7250
7251 STATIC void
7252 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7253 {
7254     /* Grow the maximum size of an inversion list */
7255
7256     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7257
7258     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7259 }
7260
7261 PERL_STATIC_INLINE void
7262 S_invlist_trim(pTHX_ SV* const invlist)
7263 {
7264     PERL_ARGS_ASSERT_INVLIST_TRIM;
7265
7266     /* Change the length of the inversion list to how many entries it currently
7267      * has */
7268
7269     SvPV_shrink_to_cur((SV *) invlist);
7270 }
7271
7272 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7273
7274 STATIC void
7275 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7276 {
7277    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7278     * the end of the inversion list.  The range must be above any existing
7279     * ones. */
7280
7281     UV* array;
7282     UV max = invlist_max(invlist);
7283     UV len = _invlist_len(invlist);
7284
7285     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7286
7287     if (len == 0) { /* Empty lists must be initialized */
7288         array = _invlist_array_init(invlist, start == 0);
7289     }
7290     else {
7291         /* Here, the existing list is non-empty. The current max entry in the
7292          * list is generally the first value not in the set, except when the
7293          * set extends to the end of permissible values, in which case it is
7294          * the first entry in that final set, and so this call is an attempt to
7295          * append out-of-order */
7296
7297         UV final_element = len - 1;
7298         array = invlist_array(invlist);
7299         if (array[final_element] > start
7300             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7301         {
7302             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7303                        array[final_element], start,
7304                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7305         }
7306
7307         /* Here, it is a legal append.  If the new range begins with the first
7308          * value not in the set, it is extending the set, so the new first
7309          * value not in the set is one greater than the newly extended range.
7310          * */
7311         if (array[final_element] == start) {
7312             if (end != UV_MAX) {
7313                 array[final_element] = end + 1;
7314             }
7315             else {
7316                 /* But if the end is the maximum representable on the machine,
7317                  * just let the range that this would extend to have no end */
7318                 invlist_set_len(invlist, len - 1);
7319             }
7320             return;
7321         }
7322     }
7323
7324     /* Here the new range doesn't extend any existing set.  Add it */
7325
7326     len += 2;   /* Includes an element each for the start and end of range */
7327
7328     /* If overflows the existing space, extend, which may cause the array to be
7329      * moved */
7330     if (max < len) {
7331         invlist_extend(invlist, len);
7332         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7333                                            failure in invlist_array() */
7334         array = invlist_array(invlist);
7335     }
7336     else {
7337         invlist_set_len(invlist, len);
7338     }
7339
7340     /* The next item on the list starts the range, the one after that is
7341      * one past the new range.  */
7342     array[len - 2] = start;
7343     if (end != UV_MAX) {
7344         array[len - 1] = end + 1;
7345     }
7346     else {
7347         /* But if the end is the maximum representable on the machine, just let
7348          * the range have no end */
7349         invlist_set_len(invlist, len - 1);
7350     }
7351 }
7352
7353 #ifndef PERL_IN_XSUB_RE
7354
7355 IV
7356 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7357 {
7358     /* Searches the inversion list for the entry that contains the input code
7359      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7360      * return value is the index into the list's array of the range that
7361      * contains <cp> */
7362
7363     IV low = 0;
7364     IV mid;
7365     IV high = _invlist_len(invlist);
7366     const IV highest_element = high - 1;
7367     const UV* array;
7368
7369     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7370
7371     /* If list is empty, return failure. */
7372     if (high == 0) {
7373         return -1;
7374     }
7375
7376     /* If the code point is before the first element, return failure.  (We
7377      * can't combine this with the test above, because we can't get the array
7378      * unless we know the list is non-empty) */
7379     array = invlist_array(invlist);
7380
7381     mid = invlist_previous_index(invlist);
7382     assert(mid >=0 && mid <= highest_element);
7383
7384     /* <mid> contains the cache of the result of the previous call to this
7385      * function (0 the first time).  See if this call is for the same result,
7386      * or if it is for mid-1.  This is under the theory that calls to this
7387      * function will often be for related code points that are near each other.
7388      * And benchmarks show that caching gives better results.  We also test
7389      * here if the code point is within the bounds of the list.  These tests
7390      * replace others that would have had to be made anyway to make sure that
7391      * the array bounds were not exceeded, and these give us extra information
7392      * at the same time */
7393     if (cp >= array[mid]) {
7394         if (cp >= array[highest_element]) {
7395             return highest_element;
7396         }
7397
7398         /* Here, array[mid] <= cp < array[highest_element].  This means that
7399          * the final element is not the answer, so can exclude it; it also
7400          * means that <mid> is not the final element, so can refer to 'mid + 1'
7401          * safely */
7402         if (cp < array[mid + 1]) {
7403             return mid;
7404         }
7405         high--;
7406         low = mid + 1;
7407     }
7408     else { /* cp < aray[mid] */
7409         if (cp < array[0]) { /* Fail if outside the array */
7410             return -1;
7411         }
7412         high = mid;
7413         if (cp >= array[mid - 1]) {
7414             goto found_entry;
7415         }
7416     }
7417
7418     /* Binary search.  What we are looking for is <i> such that
7419      *  array[i] <= cp < array[i+1]
7420      * The loop below converges on the i+1.  Note that there may not be an
7421      * (i+1)th element in the array, and things work nonetheless */
7422     while (low < high) {
7423         mid = (low + high) / 2;
7424         assert(mid <= highest_element);
7425         if (array[mid] <= cp) { /* cp >= array[mid] */
7426             low = mid + 1;
7427
7428             /* We could do this extra test to exit the loop early.
7429             if (cp < array[low]) {
7430                 return mid;
7431             }
7432             */
7433         }
7434         else { /* cp < array[mid] */
7435             high = mid;
7436         }
7437     }
7438
7439   found_entry:
7440     high--;
7441     invlist_set_previous_index(invlist, high);
7442     return high;
7443 }
7444
7445 void
7446 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7447 {
7448     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7449      * but is used when the swash has an inversion list.  This makes this much
7450      * faster, as it uses a binary search instead of a linear one.  This is
7451      * intimately tied to that function, and perhaps should be in utf8.c,
7452      * except it is intimately tied to inversion lists as well.  It assumes
7453      * that <swatch> is all 0's on input */
7454
7455     UV current = start;
7456     const IV len = _invlist_len(invlist);
7457     IV i;
7458     const UV * array;
7459
7460     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7461
7462     if (len == 0) { /* Empty inversion list */
7463         return;
7464     }
7465
7466     array = invlist_array(invlist);
7467
7468     /* Find which element it is */
7469     i = _invlist_search(invlist, start);
7470
7471     /* We populate from <start> to <end> */
7472     while (current < end) {
7473         UV upper;
7474
7475         /* The inversion list gives the results for every possible code point
7476          * after the first one in the list.  Only those ranges whose index is
7477          * even are ones that the inversion list matches.  For the odd ones,
7478          * and if the initial code point is not in the list, we have to skip
7479          * forward to the next element */
7480         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7481             i++;
7482             if (i >= len) { /* Finished if beyond the end of the array */
7483                 return;
7484             }
7485             current = array[i];
7486             if (current >= end) {   /* Finished if beyond the end of what we
7487                                        are populating */
7488                 if (LIKELY(end < UV_MAX)) {
7489                     return;
7490                 }
7491
7492                 /* We get here when the upper bound is the maximum
7493                  * representable on the machine, and we are looking for just
7494                  * that code point.  Have to special case it */
7495                 i = len;
7496                 goto join_end_of_list;
7497             }
7498         }
7499         assert(current >= start);
7500
7501         /* The current range ends one below the next one, except don't go past
7502          * <end> */
7503         i++;
7504         upper = (i < len && array[i] < end) ? array[i] : end;
7505
7506         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7507          * for each code point in it */
7508         for (; current < upper; current++) {
7509             const STRLEN offset = (STRLEN)(current - start);
7510             swatch[offset >> 3] |= 1 << (offset & 7);
7511         }
7512
7513     join_end_of_list:
7514
7515         /* Quit if at the end of the list */
7516         if (i >= len) {
7517
7518             /* But first, have to deal with the highest possible code point on
7519              * the platform.  The previous code assumes that <end> is one
7520              * beyond where we want to populate, but that is impossible at the
7521              * platform's infinity, so have to handle it specially */
7522             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7523             {
7524                 const STRLEN offset = (STRLEN)(end - start);
7525                 swatch[offset >> 3] |= 1 << (offset & 7);
7526             }
7527             return;
7528         }
7529
7530         /* Advance to the next range, which will be for code points not in the
7531          * inversion list */
7532         current = array[i];
7533     }
7534
7535     return;
7536 }
7537
7538 void
7539 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7540 {
7541     /* Take the union of two inversion lists and point <output> to it.  *output
7542      * should be defined upon input, and if it points to one of the two lists,
7543      * the reference count to that list will be decremented.  The first list,
7544      * <a>, may be NULL, in which case a copy of the second list is returned.
7545      * If <complement_b> is TRUE, the union is taken of the complement
7546      * (inversion) of <b> instead of b itself.
7547      *
7548      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7549      * Richard Gillam, published by Addison-Wesley, and explained at some
7550      * length there.  The preface says to incorporate its examples into your
7551      * code at your own risk.
7552      *
7553      * The algorithm is like a merge sort.
7554      *
7555      * XXX A potential performance improvement is to keep track as we go along
7556      * if only one of the inputs contributes to the result, meaning the other
7557      * is a subset of that one.  In that case, we can skip the final copy and
7558      * return the larger of the input lists, but then outside code might need
7559      * to keep track of whether to free the input list or not */
7560
7561     UV* array_a;    /* a's array */
7562     UV* array_b;
7563     UV len_a;       /* length of a's array */
7564     UV len_b;
7565
7566     SV* u;                      /* the resulting union */
7567     UV* array_u;
7568     UV len_u;
7569
7570     UV i_a = 0;             /* current index into a's array */
7571     UV i_b = 0;
7572     UV i_u = 0;
7573
7574     /* running count, as explained in the algorithm source book; items are
7575      * stopped accumulating and are output when the count changes to/from 0.
7576      * The count is incremented when we start a range that's in the set, and
7577      * decremented when we start a range that's not in the set.  So its range
7578      * is 0 to 2.  Only when the count is zero is something not in the set.
7579      */
7580     UV count = 0;
7581
7582     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7583     assert(a != b);
7584
7585     /* If either one is empty, the union is the other one */
7586     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7587         if (*output == a) {
7588             if (a != NULL) {
7589                 SvREFCNT_dec(a);
7590             }
7591         }
7592         if (*output != b) {
7593             *output = invlist_clone(b);
7594             if (complement_b) {
7595                 _invlist_invert(*output);
7596             }
7597         } /* else *output already = b; */
7598         return;
7599     }
7600     else if ((len_b = _invlist_len(b)) == 0) {
7601         if (*output == b) {
7602             SvREFCNT_dec(b);
7603         }
7604
7605         /* The complement of an empty list is a list that has everything in it,
7606          * so the union with <a> includes everything too */
7607         if (complement_b) {
7608             if (a == *output) {
7609                 SvREFCNT_dec(a);
7610             }
7611             *output = _new_invlist(1);
7612             _append_range_to_invlist(*output, 0, UV_MAX);
7613         }
7614         else if (*output != a) {
7615             *output = invlist_clone(a);
7616         }
7617         /* else *output already = a; */
7618         return;
7619     }
7620
7621     /* Here both lists exist and are non-empty */
7622     array_a = invlist_array(a);
7623     array_b = invlist_array(b);
7624
7625     /* If are to take the union of 'a' with the complement of b, set it
7626      * up so are looking at b's complement. */
7627     if (complement_b) {
7628
7629         /* To complement, we invert: if the first element is 0, remove it.  To
7630          * do this, we just pretend the array starts one later, and clear the
7631          * flag as we don't have to do anything else later */
7632         if (array_b[0] == 0) {
7633             array_b++;
7634             len_b--;
7635             complement_b = FALSE;
7636         }
7637         else {
7638
7639             /* But if the first element is not zero, we unshift a 0 before the
7640              * array.  The data structure reserves a space for that 0 (which
7641              * should be a '1' right now), so physical shifting is unneeded,
7642              * but temporarily change that element to 0.  Before exiting the
7643              * routine, we must restore the element to '1' */
7644             array_b--;
7645             len_b++;
7646             array_b[0] = 0;
7647         }
7648     }
7649
7650     /* Size the union for the worst case: that the sets are completely
7651      * disjoint */
7652     u = _new_invlist(len_a + len_b);
7653
7654     /* Will contain U+0000 if either component does */
7655     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7656                                       || (len_b > 0 && array_b[0] == 0));
7657
7658     /* Go through each list item by item, stopping when exhausted one of
7659      * them */
7660     while (i_a < len_a && i_b < len_b) {
7661         UV cp;      /* The element to potentially add to the union's array */
7662         bool cp_in_set;   /* is it in the the input list's set or not */
7663
7664         /* We need to take one or the other of the two inputs for the union.
7665          * Since we are merging two sorted lists, we take the smaller of the
7666          * next items.  In case of a tie, we take the one that is in its set
7667          * first.  If we took one not in the set first, it would decrement the
7668          * count, possibly to 0 which would cause it to be output as ending the
7669          * range, and the next time through we would take the same number, and
7670          * output it again as beginning the next range.  By doing it the
7671          * opposite way, there is no possibility that the count will be
7672          * momentarily decremented to 0, and thus the two adjoining ranges will
7673          * be seamlessly merged.  (In a tie and both are in the set or both not
7674          * in the set, it doesn't matter which we take first.) */
7675         if (array_a[i_a] < array_b[i_b]
7676             || (array_a[i_a] == array_b[i_b]
7677                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7678         {
7679             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7680             cp= array_a[i_a++];
7681         }
7682         else {
7683             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7684             cp= array_b[i_b++];
7685         }
7686
7687         /* Here, have chosen which of the two inputs to look at.  Only output
7688          * if the running count changes to/from 0, which marks the
7689          * beginning/end of a range in that's in the set */
7690         if (cp_in_set) {
7691             if (count == 0) {
7692                 array_u[i_u++] = cp;
7693             }
7694             count++;
7695         }
7696         else {
7697             count--;
7698             if (count == 0) {
7699                 array_u[i_u++] = cp;
7700             }
7701         }
7702     }
7703
7704     /* Here, we are finished going through at least one of the lists, which
7705      * means there is something remaining in at most one.  We check if the list
7706      * that hasn't been exhausted is positioned such that we are in the middle
7707      * of a range in its set or not.  (i_a and i_b point to the element beyond
7708      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7709      * is potentially more to output.
7710      * There are four cases:
7711      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7712      *     in the union is entirely from the non-exhausted set.
7713      *  2) Both were in their sets, count is 2.  Nothing further should
7714      *     be output, as everything that remains will be in the exhausted
7715      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7716      *     that
7717      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7718      *     Nothing further should be output because the union includes
7719      *     everything from the exhausted set.  Not decrementing ensures that.
7720      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7721      *     decrementing to 0 insures that we look at the remainder of the
7722      *     non-exhausted set */
7723     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7724         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7725     {
7726         count--;
7727     }
7728
7729     /* The final length is what we've output so far, plus what else is about to
7730      * be output.  (If 'count' is non-zero, then the input list we exhausted
7731      * has everything remaining up to the machine's limit in its set, and hence
7732      * in the union, so there will be no further output. */
7733     len_u = i_u;
7734     if (count == 0) {
7735         /* At most one of the subexpressions will be non-zero */
7736         len_u += (len_a - i_a) + (len_b - i_b);
7737     }
7738
7739     /* Set result to final length, which can change the pointer to array_u, so
7740      * re-find it */
7741     if (len_u != _invlist_len(u)) {
7742         invlist_set_len(u, len_u);
7743         invlist_trim(u);
7744         array_u = invlist_array(u);
7745     }
7746
7747     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7748      * the other) ended with everything above it not in its set.  That means
7749      * that the remaining part of the union is precisely the same as the
7750      * non-exhausted list, so can just copy it unchanged.  (If both list were
7751      * exhausted at the same time, then the operations below will be both 0.)
7752      */
7753     if (count == 0) {
7754         IV copy_count; /* At most one will have a non-zero copy count */
7755         if ((copy_count = len_a - i_a) > 0) {
7756             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7757         }
7758         else if ((copy_count = len_b - i_b) > 0) {
7759             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7760         }
7761     }
7762
7763     /*  We may be removing a reference to one of the inputs */
7764     if (a == *output || b == *output) {
7765         SvREFCNT_dec(*output);
7766     }
7767
7768     /* If we've changed b, restore it */
7769     if (complement_b) {
7770         array_b[0] = 1;
7771     }
7772
7773     *output = u;
7774     return;
7775 }
7776
7777 void
7778 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7779 {
7780     /* Take the intersection of two inversion lists and point <i> to it.  *i
7781      * should be defined upon input, and if it points to one of the two lists,
7782      * the reference count to that list will be decremented.
7783      * If <complement_b> is TRUE, the result will be the intersection of <a>
7784      * and the complement (or inversion) of <b> instead of <b> directly.
7785      *
7786      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7787      * Richard Gillam, published by Addison-Wesley, and explained at some
7788      * length there.  The preface says to incorporate its examples into your
7789      * code at your own risk.  In fact, it had bugs
7790      *
7791      * The algorithm is like a merge sort, and is essentially the same as the
7792      * union above
7793      */
7794
7795     UV* array_a;                /* a's array */
7796     UV* array_b;
7797     UV len_a;   /* length of a's array */
7798     UV len_b;
7799
7800     SV* r;                   /* the resulting intersection */
7801     UV* array_r;
7802     UV len_r;
7803
7804     UV i_a = 0;             /* current index into a's array */
7805     UV i_b = 0;
7806     UV i_r = 0;
7807
7808     /* running count, as explained in the algorithm source book; items are
7809      * stopped accumulating and are output when the count changes to/from 2.
7810      * The count is incremented when we start a range that's in the set, and
7811      * decremented when we start a range that's not in the set.  So its range
7812      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7813      */
7814     UV count = 0;
7815
7816     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7817     assert(a != b);
7818
7819     /* Special case if either one is empty */
7820     len_a = _invlist_len(a);
7821     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7822
7823         if (len_a != 0 && complement_b) {
7824
7825             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7826              * be empty.  Here, also we are using 'b's complement, which hence
7827              * must be every possible code point.  Thus the intersection is
7828              * simply 'a'. */
7829             if (*i != a) {
7830                 *i = invlist_clone(a);
7831
7832                 if (*i == b) {
7833                     SvREFCNT_dec(b);
7834                 }
7835             }
7836             /* else *i is already 'a' */
7837             return;
7838         }
7839
7840         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7841          * intersection must be empty */
7842         if (*i == a) {
7843             SvREFCNT_dec(a);
7844         }
7845         else if (*i == b) {
7846             SvREFCNT_dec(b);
7847         }
7848         *i = _new_invlist(0);
7849         return;
7850     }
7851
7852     /* Here both lists exist and are non-empty */
7853     array_a = invlist_array(a);
7854     array_b = invlist_array(b);
7855
7856     /* If are to take the intersection of 'a' with the complement of b, set it
7857      * up so are looking at b's complement. */
7858     if (complement_b) {
7859
7860         /* To complement, we invert: if the first element is 0, remove it.  To
7861          * do this, we just pretend the array starts one later, and clear the
7862          * flag as we don't have to do anything else later */
7863         if (array_b[0] == 0) {
7864             array_b++;
7865             len_b--;
7866             complement_b = FALSE;
7867         }
7868         else {
7869
7870             /* But if the first element is not zero, we unshift a 0 before the
7871              * array.  The data structure reserves a space for that 0 (which
7872              * should be a '1' right now), so physical shifting is unneeded,
7873              * but temporarily change that element to 0.  Before exiting the
7874              * routine, we must restore the element to '1' */
7875             array_b--;
7876             len_b++;
7877             array_b[0] = 0;
7878         }
7879     }
7880
7881     /* Size the intersection for the worst case: that the intersection ends up
7882      * fragmenting everything to be completely disjoint */
7883     r= _new_invlist(len_a + len_b);
7884
7885     /* Will contain U+0000 iff both components do */
7886     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7887                                      && len_b > 0 && array_b[0] == 0);
7888
7889     /* Go through each list item by item, stopping when exhausted one of
7890      * them */
7891     while (i_a < len_a && i_b < len_b) {
7892         UV cp;      /* The element to potentially add to the intersection's
7893                        array */
7894         bool cp_in_set; /* Is it in the input list's set or not */
7895
7896         /* We need to take one or the other of the two inputs for the
7897          * intersection.  Since we are merging two sorted lists, we take the
7898          * smaller of the next items.  In case of a tie, we take the one that
7899          * is not in its set first (a difference from the union algorithm).  If
7900          * we took one in the set first, it would increment the count, possibly
7901          * to 2 which would cause it to be output as starting a range in the
7902          * intersection, and the next time through we would take that same
7903          * number, and output it again as ending the set.  By doing it the
7904          * opposite of this, there is no possibility that the count will be
7905          * momentarily incremented to 2.  (In a tie and both are in the set or
7906          * both not in the set, it doesn't matter which we take first.) */
7907         if (array_a[i_a] < array_b[i_b]
7908             || (array_a[i_a] == array_b[i_b]
7909                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7910         {
7911             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7912             cp= array_a[i_a++];
7913         }
7914         else {
7915             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7916             cp= array_b[i_b++];
7917         }
7918
7919         /* Here, have chosen which of the two inputs to look at.  Only output
7920          * if the running count changes to/from 2, which marks the
7921          * beginning/end of a range that's in the intersection */
7922         if (cp_in_set) {
7923             count++;
7924             if (count == 2) {
7925                 array_r[i_r++] = cp;
7926             }
7927         }
7928         else {
7929             if (count == 2) {
7930                 array_r[i_r++] = cp;
7931             }
7932             count--;
7933         }
7934     }
7935
7936     /* Here, we are finished going through at least one of the lists, which
7937      * means there is something remaining in at most one.  We check if the list
7938      * that has been exhausted is positioned such that we are in the middle
7939      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7940      * the ones we care about.)  There are four cases:
7941      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7942      *     nothing left in the intersection.
7943      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7944      *     above 2.  What should be output is exactly that which is in the
7945      *     non-exhausted set, as everything it has is also in the intersection
7946      *     set, and everything it doesn't have can't be in the intersection
7947      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7948      *     gets incremented to 2.  Like the previous case, the intersection is
7949      *     everything that remains in the non-exhausted set.
7950      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7951      *     remains 1.  And the intersection has nothing more. */
7952     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7953         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7954     {
7955         count++;
7956     }
7957
7958     /* The final length is what we've output so far plus what else is in the
7959      * intersection.  At most one of the subexpressions below will be non-zero */
7960     len_r = i_r;
7961     if (count >= 2) {
7962         len_r += (len_a - i_a) + (len_b - i_b);
7963     }
7964
7965     /* Set result to final length, which can change the pointer to array_r, so
7966      * re-find it */
7967     if (len_r != _invlist_len(r)) {
7968         invlist_set_len(r, len_r);
7969         invlist_trim(r);
7970         array_r = invlist_array(r);
7971     }
7972
7973     /* Finish outputting any remaining */
7974     if (count >= 2) { /* At most one will have a non-zero copy count */
7975         IV copy_count;
7976         if ((copy_count = len_a - i_a) > 0) {
7977             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7978         }
7979         else if ((copy_count = len_b - i_b) > 0) {
7980             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7981         }
7982     }
7983
7984     /*  We may be removing a reference to one of the inputs */
7985     if (a == *i || b == *i) {
7986         SvREFCNT_dec(*i);
7987     }
7988
7989     /* If we've changed b, restore it */
7990     if (complement_b) {
7991         array_b[0] = 1;
7992     }
7993
7994     *i = r;
7995     return;
7996 }
7997
7998 SV*
7999 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8000 {
8001     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8002      * set.  A pointer to the inversion list is returned.  This may actually be
8003      * a new list, in which case the passed in one has been destroyed.  The
8004      * passed in inversion list can be NULL, in which case a new one is created
8005      * with just the one range in it */
8006
8007     SV* range_invlist;
8008     UV len;
8009
8010     if (invlist == NULL) {
8011         invlist = _new_invlist(2);
8012         len = 0;
8013     }
8014     else {
8015         len = _invlist_len(invlist);
8016     }
8017
8018     /* If comes after the final entry, can just append it to the end */
8019     if (len == 0
8020         || start >= invlist_array(invlist)
8021                                     [_invlist_len(invlist) - 1])
8022     {
8023         _append_range_to_invlist(invlist, start, end);
8024         return invlist;
8025     }
8026
8027     /* Here, can't just append things, create and return a new inversion list
8028      * which is the union of this range and the existing inversion list */
8029     range_invlist = _new_invlist(2);
8030     _append_range_to_invlist(range_invlist, start, end);
8031
8032     _invlist_union(invlist, range_invlist, &invlist);
8033
8034     /* The temporary can be freed */
8035     SvREFCNT_dec(range_invlist);
8036
8037     return invlist;
8038 }
8039
8040 #endif
8041
8042 PERL_STATIC_INLINE SV*
8043 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8044     return _add_range_to_invlist(invlist, cp, cp);
8045 }
8046
8047 #ifndef PERL_IN_XSUB_RE
8048 void
8049 Perl__invlist_invert(pTHX_ SV* const invlist)
8050 {
8051     /* Complement the input inversion list.  This adds a 0 if the list didn't
8052      * have a zero; removes it otherwise.  As described above, the data
8053      * structure is set up so that this is very efficient */
8054
8055     UV* len_pos = _get_invlist_len_addr(invlist);
8056
8057     PERL_ARGS_ASSERT__INVLIST_INVERT;
8058
8059     /* The inverse of matching nothing is matching everything */
8060     if (*len_pos == 0) {
8061         _append_range_to_invlist(invlist, 0, UV_MAX);
8062         return;
8063     }
8064
8065     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8066      * zero element was a 0, so it is being removed, so the length decrements
8067      * by 1; and vice-versa.  SvCUR is unaffected */
8068     if (*get_invlist_zero_addr(invlist) ^= 1) {
8069         (*len_pos)--;
8070     }
8071     else {
8072         (*len_pos)++;
8073     }
8074 }
8075
8076 void
8077 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8078 {
8079     /* Complement the input inversion list (which must be a Unicode property,
8080      * all of which don't match above the Unicode maximum code point.)  And
8081      * Perl has chosen to not have the inversion match above that either.  This
8082      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8083      */
8084
8085     UV len;
8086     UV* array;
8087
8088     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8089
8090     _invlist_invert(invlist);
8091
8092     len = _invlist_len(invlist);
8093
8094     if (len != 0) { /* If empty do nothing */
8095         array = invlist_array(invlist);
8096         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8097             /* Add 0x110000.  First, grow if necessary */
8098             len++;
8099             if (invlist_max(invlist) < len) {
8100                 invlist_extend(invlist, len);
8101                 array = invlist_array(invlist);
8102             }
8103             invlist_set_len(invlist, len);
8104             array[len - 1] = PERL_UNICODE_MAX + 1;
8105         }
8106         else {  /* Remove the 0x110000 */
8107             invlist_set_len(invlist, len - 1);
8108         }
8109     }
8110
8111     return;
8112 }
8113 #endif
8114
8115 PERL_STATIC_INLINE SV*
8116 S_invlist_clone(pTHX_ SV* const invlist)
8117 {
8118
8119     /* Return a new inversion list that is a copy of the input one, which is
8120      * unchanged */
8121
8122     /* Need to allocate extra space to accommodate Perl's addition of a
8123      * trailing NUL to SvPV's, since it thinks they are always strings */
8124     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8125     STRLEN length = SvCUR(invlist);
8126
8127     PERL_ARGS_ASSERT_INVLIST_CLONE;
8128
8129     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8130     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8131
8132     return new_invlist;
8133 }
8134
8135 PERL_STATIC_INLINE UV*
8136 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8137 {
8138     /* Return the address of the UV that contains the current iteration
8139      * position */
8140
8141     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8142
8143     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8144 }
8145
8146 PERL_STATIC_INLINE UV*
8147 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8148 {
8149     /* Return the address of the UV that contains the version id. */
8150
8151     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8152
8153     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8154 }
8155
8156 PERL_STATIC_INLINE void
8157 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8158 {
8159     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8160
8161     *get_invlist_iter_addr(invlist) = 0;
8162 }
8163
8164 STATIC bool
8165 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8166 {
8167     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8168      * This call sets in <*start> and <*end>, the next range in <invlist>.
8169      * Returns <TRUE> if successful and the next call will return the next
8170      * range; <FALSE> if was already at the end of the list.  If the latter,
8171      * <*start> and <*end> are unchanged, and the next call to this function
8172      * will start over at the beginning of the list */
8173
8174     UV* pos = get_invlist_iter_addr(invlist);
8175     UV len = _invlist_len(invlist);
8176     UV *array;
8177
8178     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8179
8180     if (*pos >= len) {
8181         *pos = UV_MAX;  /* Force iternit() to be required next time */
8182         return FALSE;
8183     }
8184
8185     array = invlist_array(invlist);
8186
8187     *start = array[(*pos)++];
8188
8189     if (*pos >= len) {
8190         *end = UV_MAX;
8191     }
8192     else {
8193         *end = array[(*pos)++] - 1;
8194     }
8195
8196     return TRUE;
8197 }
8198
8199 PERL_STATIC_INLINE UV
8200 S_invlist_highest(pTHX_ SV* const invlist)
8201 {
8202     /* Returns the highest code point that matches an inversion list.  This API
8203      * has an ambiguity, as it returns 0 under either the highest is actually
8204      * 0, or if the list is empty.  If this distinction matters to you, check
8205      * for emptiness before calling this function */
8206
8207     UV len = _invlist_len(invlist);
8208     UV *array;
8209
8210     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8211
8212     if (len == 0) {
8213         return 0;
8214     }
8215
8216     array = invlist_array(invlist);
8217
8218     /* The last element in the array in the inversion list always starts a
8219      * range that goes to infinity.  That range may be for code points that are
8220      * matched in the inversion list, or it may be for ones that aren't
8221      * matched.  In the latter case, the highest code point in the set is one
8222      * less than the beginning of this range; otherwise it is the final element
8223      * of this range: infinity */
8224     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8225            ? UV_MAX
8226            : array[len - 1] - 1;
8227 }
8228
8229 #ifndef PERL_IN_XSUB_RE
8230 SV *
8231 Perl__invlist_contents(pTHX_ SV* const invlist)
8232 {
8233     /* Get the contents of an inversion list into a string SV so that they can
8234      * be printed out.  It uses the format traditionally done for debug tracing
8235      */
8236
8237     UV start, end;
8238     SV* output = newSVpvs("\n");
8239
8240     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8241
8242     invlist_iterinit(invlist);
8243     while (invlist_iternext(invlist, &start, &end)) {
8244         if (end == UV_MAX) {
8245             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8246         }
8247         else if (end != start) {
8248             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8249                     start,       end);
8250         }
8251         else {
8252             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8253         }
8254     }
8255
8256     return output;
8257 }
8258 #endif
8259
8260 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8261 void
8262 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8263 {
8264     /* Dumps out the ranges in an inversion list.  The string 'header'
8265      * if present is output on a line before the first range */
8266
8267     UV start, end;
8268
8269     PERL_ARGS_ASSERT__INVLIST_DUMP;
8270
8271     if (header && strlen(header)) {
8272         PerlIO_printf(Perl_debug_log, "%s\n", header);
8273     }
8274     invlist_iterinit(invlist);
8275     while (invlist_iternext(invlist, &start, &end)) {
8276         if (end == UV_MAX) {
8277             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8278         }
8279         else if (end != start) {
8280             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8281                                                  start,         end);
8282         }
8283         else {
8284             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8285         }
8286     }
8287 }
8288 #endif
8289
8290 #if 0
8291 bool
8292 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8293 {
8294     /* Return a boolean as to if the two passed in inversion lists are
8295      * identical.  The final argument, if TRUE, says to take the complement of
8296      * the second inversion list before doing the comparison */
8297
8298     UV* array_a = invlist_array(a);
8299     UV* array_b = invlist_array(b);
8300     UV len_a = _invlist_len(a);
8301     UV len_b = _invlist_len(b);
8302
8303     UV i = 0;               /* current index into the arrays */
8304     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8305
8306     PERL_ARGS_ASSERT__INVLISTEQ;
8307
8308     /* If are to compare 'a' with the complement of b, set it
8309      * up so are looking at b's complement. */
8310     if (complement_b) {
8311
8312         /* The complement of nothing is everything, so <a> would have to have
8313          * just one element, starting at zero (ending at infinity) */
8314         if (len_b == 0) {
8315             return (len_a == 1 && array_a[0] == 0);
8316         }
8317         else if (array_b[0] == 0) {
8318
8319             /* Otherwise, to complement, we invert.  Here, the first element is
8320              * 0, just remove it.  To do this, we just pretend the array starts
8321              * one later, and clear the flag as we don't have to do anything
8322              * else later */
8323
8324             array_b++;
8325             len_b--;
8326             complement_b = FALSE;
8327         }
8328         else {
8329
8330             /* But if the first element is not zero, we unshift a 0 before the
8331              * array.  The data structure reserves a space for that 0 (which
8332              * should be a '1' right now), so physical shifting is unneeded,
8333              * but temporarily change that element to 0.  Before exiting the
8334              * routine, we must restore the element to '1' */
8335             array_b--;
8336             len_b++;
8337             array_b[0] = 0;
8338         }
8339     }
8340
8341     /* Make sure that the lengths are the same, as well as the final element
8342      * before looping through the remainder.  (Thus we test the length, final,
8343      * and first elements right off the bat) */
8344     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8345         retval = FALSE;
8346     }
8347     else for (i = 0; i < len_a - 1; i++) {
8348         if (array_a[i] != array_b[i]) {
8349             retval = FALSE;
8350             break;
8351         }
8352     }
8353
8354     if (complement_b) {
8355         array_b[0] = 1;
8356     }
8357     return retval;
8358 }
8359 #endif
8360
8361 #undef HEADER_LENGTH
8362 #undef INVLIST_INITIAL_LENGTH
8363 #undef TO_INTERNAL_SIZE
8364 #undef FROM_INTERNAL_SIZE
8365 #undef INVLIST_LEN_OFFSET
8366 #undef INVLIST_ZERO_OFFSET
8367 #undef INVLIST_ITER_OFFSET
8368 #undef INVLIST_VERSION_ID
8369
8370 /* End of inversion list object */
8371
8372 /*
8373  - reg - regular expression, i.e. main body or parenthesized thing
8374  *
8375  * Caller must absorb opening parenthesis.
8376  *
8377  * Combining parenthesis handling with the base level of regular expression
8378  * is a trifle forced, but the need to tie the tails of the branches to what
8379  * follows makes it hard to avoid.
8380  */
8381 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8382 #ifdef DEBUGGING
8383 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8384 #else
8385 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8386 #endif
8387
8388 STATIC regnode *
8389 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8390     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8391 {
8392     dVAR;
8393     regnode *ret;               /* Will be the head of the group. */
8394     regnode *br;
8395     regnode *lastbr;
8396     regnode *ender = NULL;
8397     I32 parno = 0;
8398     I32 flags;
8399     U32 oregflags = RExC_flags;
8400     bool have_branch = 0;
8401     bool is_open = 0;
8402     I32 freeze_paren = 0;
8403     I32 after_freeze = 0;
8404
8405     /* for (?g), (?gc), and (?o) warnings; warning
8406        about (?c) will warn about (?g) -- japhy    */
8407
8408 #define WASTED_O  0x01
8409 #define WASTED_G  0x02
8410 #define WASTED_C  0x04
8411 #define WASTED_GC (0x02|0x04)
8412     I32 wastedflags = 0x00;
8413
8414     char * parse_start = RExC_parse; /* MJD */
8415     char * const oregcomp_parse = RExC_parse;
8416
8417     GET_RE_DEBUG_FLAGS_DECL;
8418
8419     PERL_ARGS_ASSERT_REG;
8420     DEBUG_PARSE("reg ");
8421
8422     *flagp = 0;                         /* Tentatively. */
8423
8424
8425     /* Make an OPEN node, if parenthesized. */
8426     if (paren) {
8427         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8428             char *start_verb = RExC_parse;
8429             STRLEN verb_len = 0;
8430             char *start_arg = NULL;
8431             unsigned char op = 0;
8432             int argok = 1;
8433             int internal_argval = 0; /* internal_argval is only useful if !argok */
8434             while ( *RExC_parse && *RExC_parse != ')' ) {
8435                 if ( *RExC_parse == ':' ) {
8436                     start_arg = RExC_parse + 1;
8437                     break;
8438                 }
8439                 RExC_parse++;
8440             }
8441             ++start_verb;
8442             verb_len = RExC_parse - start_verb;
8443             if ( start_arg ) {
8444                 RExC_parse++;
8445                 while ( *RExC_parse && *RExC_parse != ')' )
8446                     RExC_parse++;
8447                 if ( *RExC_parse != ')' )
8448                     vFAIL("Unterminated verb pattern argument");
8449                 if ( RExC_parse == start_arg )
8450                     start_arg = NULL;
8451             } else {
8452                 if ( *RExC_parse != ')' )
8453                     vFAIL("Unterminated verb pattern");
8454             }
8455
8456             switch ( *start_verb ) {
8457             case 'A':  /* (*ACCEPT) */
8458                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8459                     op = ACCEPT;
8460                     internal_argval = RExC_nestroot;
8461                 }
8462                 break;
8463             case 'C':  /* (*COMMIT) */
8464                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8465                     op = COMMIT;
8466                 break;
8467             case 'F':  /* (*FAIL) */
8468                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8469                     op = OPFAIL;
8470                     argok = 0;
8471                 }
8472                 break;
8473             case ':':  /* (*:NAME) */
8474             case 'M':  /* (*MARK:NAME) */
8475                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8476                     op = MARKPOINT;
8477                     argok = -1;
8478                 }
8479                 break;
8480             case 'P':  /* (*PRUNE) */
8481                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8482                     op = PRUNE;
8483                 break;
8484             case 'S':   /* (*SKIP) */
8485                 if ( memEQs(start_verb,verb_len,"SKIP") )
8486                     op = SKIP;
8487                 break;
8488             case 'T':  /* (*THEN) */
8489                 /* [19:06] <TimToady> :: is then */
8490                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8491                     op = CUTGROUP;
8492                     RExC_seen |= REG_SEEN_CUTGROUP;
8493                 }
8494                 break;
8495             }
8496             if ( ! op ) {
8497                 RExC_parse++;
8498                 vFAIL3("Unknown verb pattern '%.*s'",
8499                     verb_len, start_verb);
8500             }
8501             if ( argok ) {
8502                 if ( start_arg && internal_argval ) {
8503                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8504                         verb_len, start_verb);
8505                 } else if ( argok < 0 && !start_arg ) {
8506                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8507                         verb_len, start_verb);
8508                 } else {
8509                     ret = reganode(pRExC_state, op, internal_argval);
8510                     if ( ! internal_argval && ! SIZE_ONLY ) {
8511                         if (start_arg) {
8512                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8513                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8514                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8515                             ret->flags = 0;
8516                         } else {
8517                             ret->flags = 1;
8518                         }
8519                     }
8520                 }
8521                 if (!internal_argval)
8522                     RExC_seen |= REG_SEEN_VERBARG;
8523             } else if ( start_arg ) {
8524                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8525                         verb_len, start_verb);
8526             } else {
8527                 ret = reg_node(pRExC_state, op);
8528             }
8529             nextchar(pRExC_state);
8530             return ret;
8531         } else
8532         if (*RExC_parse == '?') { /* (?...) */
8533             bool is_logical = 0;
8534             const char * const seqstart = RExC_parse;
8535             bool has_use_defaults = FALSE;
8536
8537             RExC_parse++;
8538             paren = *RExC_parse++;
8539             ret = NULL;                 /* For look-ahead/behind. */
8540             switch (paren) {
8541
8542             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8543                 paren = *RExC_parse++;
8544                 if ( paren == '<')         /* (?P<...>) named capture */
8545                     goto named_capture;
8546                 else if (paren == '>') {   /* (?P>name) named recursion */
8547                     goto named_recursion;
8548                 }
8549                 else if (paren == '=') {   /* (?P=...)  named backref */
8550                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8551                        you change this make sure you change that */
8552                     char* name_start = RExC_parse;
8553                     U32 num = 0;
8554                     SV *sv_dat = reg_scan_name(pRExC_state,
8555                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8556                     if (RExC_parse == name_start || *RExC_parse != ')')
8557                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8558
8559                     if (!SIZE_ONLY) {
8560                         num = add_data( pRExC_state, 1, "S" );
8561                         RExC_rxi->data->data[num]=(void*)sv_dat;
8562                         SvREFCNT_inc_simple_void(sv_dat);
8563                     }
8564                     RExC_sawback = 1;
8565                     ret = reganode(pRExC_state,
8566                                    ((! FOLD)
8567                                      ? NREF
8568                                      : (ASCII_FOLD_RESTRICTED)
8569                                        ? NREFFA
8570                                        : (AT_LEAST_UNI_SEMANTICS)
8571                                          ? NREFFU
8572                                          : (LOC)
8573                                            ? NREFFL
8574                                            : NREFF),
8575                                     num);
8576                     *flagp |= HASWIDTH;
8577
8578                     Set_Node_Offset(ret, parse_start+1);
8579                     Set_Node_Cur_Length(ret); /* MJD */
8580
8581                     nextchar(pRExC_state);
8582                     return ret;
8583                 }
8584                 RExC_parse++;
8585                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8586                 /*NOTREACHED*/
8587             case '<':           /* (?<...) */
8588                 if (*RExC_parse == '!')
8589                     paren = ',';
8590                 else if (*RExC_parse != '=')
8591               named_capture:
8592                 {               /* (?<...>) */
8593                     char *name_start;
8594                     SV *svname;
8595                     paren= '>';
8596             case '\'':          /* (?'...') */
8597                     name_start= RExC_parse;
8598                     svname = reg_scan_name(pRExC_state,
8599                         SIZE_ONLY ?  /* reverse test from the others */
8600                         REG_RSN_RETURN_NAME :
8601                         REG_RSN_RETURN_NULL);
8602                     if (RExC_parse == name_start) {
8603                         RExC_parse++;
8604                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8605                         /*NOTREACHED*/
8606                     }
8607                     if (*RExC_parse != paren)
8608                         vFAIL2("Sequence (?%c... not terminated",
8609                             paren=='>' ? '<' : paren);
8610                     if (SIZE_ONLY) {
8611                         HE *he_str;
8612                         SV *sv_dat = NULL;
8613                         if (!svname) /* shouldn't happen */
8614                             Perl_croak(aTHX_
8615                                 "panic: reg_scan_name returned NULL");
8616                         if (!RExC_paren_names) {
8617                             RExC_paren_names= newHV();
8618                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8619 #ifdef DEBUGGING
8620                             RExC_paren_name_list= newAV();
8621                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8622 #endif
8623                         }
8624                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8625                         if ( he_str )
8626                             sv_dat = HeVAL(he_str);
8627                         if ( ! sv_dat ) {
8628                             /* croak baby croak */
8629                             Perl_croak(aTHX_
8630                                 "panic: paren_name hash element allocation failed");
8631                         } else if ( SvPOK(sv_dat) ) {
8632                             /* (?|...) can mean we have dupes so scan to check
8633                                its already been stored. Maybe a flag indicating
8634                                we are inside such a construct would be useful,
8635                                but the arrays are likely to be quite small, so
8636                                for now we punt -- dmq */
8637                             IV count = SvIV(sv_dat);
8638                             I32 *pv = (I32*)SvPVX(sv_dat);
8639                             IV i;
8640                             for ( i = 0 ; i < count ; i++ ) {
8641                                 if ( pv[i] == RExC_npar ) {
8642                                     count = 0;
8643                                     break;
8644                                 }
8645                             }
8646                             if ( count ) {
8647                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8648                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8649                                 pv[count] = RExC_npar;
8650                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8651                             }
8652                         } else {
8653                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8654                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8655                             SvIOK_on(sv_dat);
8656                             SvIV_set(sv_dat, 1);
8657                         }
8658 #ifdef DEBUGGING
8659                         /* Yes this does cause a memory leak in debugging Perls */
8660                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8661                             SvREFCNT_dec(svname);
8662 #endif
8663
8664                         /*sv_dump(sv_dat);*/
8665                     }
8666                     nextchar(pRExC_state);
8667                     paren = 1;
8668                     goto capturing_parens;
8669                 }
8670                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8671                 RExC_in_lookbehind++;
8672                 RExC_parse++;
8673             case '=':           /* (?=...) */
8674                 RExC_seen_zerolen++;
8675                 break;
8676             case '!':           /* (?!...) */
8677                 RExC_seen_zerolen++;
8678                 if (*RExC_parse == ')') {
8679                     ret=reg_node(pRExC_state, OPFAIL);
8680                     nextchar(pRExC_state);
8681                     return ret;
8682                 }
8683                 break;
8684             case '|':           /* (?|...) */
8685                 /* branch reset, behave like a (?:...) except that
8686                    buffers in alternations share the same numbers */
8687                 paren = ':';
8688                 after_freeze = freeze_paren = RExC_npar;
8689                 break;
8690             case ':':           /* (?:...) */
8691             case '>':           /* (?>...) */
8692                 break;
8693             case '$':           /* (?$...) */
8694             case '@':           /* (?@...) */
8695                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8696                 break;
8697             case '#':           /* (?#...) */
8698                 while (*RExC_parse && *RExC_parse != ')')
8699                     RExC_parse++;
8700                 if (*RExC_parse != ')')
8701                     FAIL("Sequence (?#... not terminated");
8702                 nextchar(pRExC_state);
8703                 *flagp = TRYAGAIN;
8704                 return NULL;
8705             case '0' :           /* (?0) */
8706             case 'R' :           /* (?R) */
8707                 if (*RExC_parse != ')')
8708                     FAIL("Sequence (?R) not terminated");
8709                 ret = reg_node(pRExC_state, GOSTART);
8710                 *flagp |= POSTPONED;
8711                 nextchar(pRExC_state);
8712                 return ret;
8713                 /*notreached*/
8714             { /* named and numeric backreferences */
8715                 I32 num;
8716             case '&':            /* (?&NAME) */
8717                 parse_start = RExC_parse - 1;
8718               named_recursion:
8719                 {
8720                     SV *sv_dat = reg_scan_name(pRExC_state,
8721                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8722                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8723                 }
8724                 goto gen_recurse_regop;
8725                 assert(0); /* NOT REACHED */
8726             case '+':
8727                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8728                     RExC_parse++;
8729                     vFAIL("Illegal pattern");
8730                 }
8731                 goto parse_recursion;
8732                 /* NOT REACHED*/
8733             case '-': /* (?-1) */
8734                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8735                     RExC_parse--; /* rewind to let it be handled later */
8736                     goto parse_flags;
8737                 }
8738                 /*FALLTHROUGH */
8739             case '1': case '2': case '3': case '4': /* (?1) */
8740             case '5': case '6': case '7': case '8': case '9':
8741                 RExC_parse--;
8742               parse_recursion:
8743                 num = atoi(RExC_parse);
8744                 parse_start = RExC_parse - 1; /* MJD */
8745                 if (*RExC_parse == '-')
8746                     RExC_parse++;
8747                 while (isDIGIT(*RExC_parse))
8748                         RExC_parse++;
8749                 if (*RExC_parse!=')')
8750                     vFAIL("Expecting close bracket");
8751
8752               gen_recurse_regop:
8753                 if ( paren == '-' ) {
8754                     /*
8755                     Diagram of capture buffer numbering.
8756                     Top line is the normal capture buffer numbers
8757                     Bottom line is the negative indexing as from
8758                     the X (the (?-2))
8759
8760                     +   1 2    3 4 5 X          6 7
8761                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8762                     -   5 4    3 2 1 X          x x
8763
8764                     */
8765                     num = RExC_npar + num;
8766                     if (num < 1)  {
8767                         RExC_parse++;
8768                         vFAIL("Reference to nonexistent group");
8769                     }
8770                 } else if ( paren == '+' ) {
8771                     num = RExC_npar + num - 1;
8772                 }
8773
8774                 ret = reganode(pRExC_state, GOSUB, num);
8775                 if (!SIZE_ONLY) {
8776                     if (num > (I32)RExC_rx->nparens) {
8777                         RExC_parse++;
8778                         vFAIL("Reference to nonexistent group");
8779                     }
8780                     ARG2L_SET( ret, RExC_recurse_count++);
8781                     RExC_emit++;
8782                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8783                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8784                 } else {
8785                     RExC_size++;
8786                 }
8787                 RExC_seen |= REG_SEEN_RECURSE;
8788                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8789                 Set_Node_Offset(ret, parse_start); /* MJD */
8790
8791                 *flagp |= POSTPONED;
8792                 nextchar(pRExC_state);
8793                 return ret;
8794             } /* named and numeric backreferences */
8795             assert(0); /* NOT REACHED */
8796
8797             case '?':           /* (??...) */
8798                 is_logical = 1;
8799                 if (*RExC_parse != '{') {
8800                     RExC_parse++;
8801                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8802                     /*NOTREACHED*/
8803                 }
8804                 *flagp |= POSTPONED;
8805                 paren = *RExC_parse++;
8806                 /* FALL THROUGH */
8807             case '{':           /* (?{...}) */
8808             {
8809                 U32 n = 0;
8810                 struct reg_code_block *cb;
8811
8812                 RExC_seen_zerolen++;
8813
8814                 if (   !pRExC_state->num_code_blocks
8815                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8816                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8817                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8818                             - RExC_start)
8819                 ) {
8820                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8821                         FAIL("panic: Sequence (?{...}): no code block found\n");
8822                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8823                 }
8824                 /* this is a pre-compiled code block (?{...}) */
8825                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8826                 RExC_parse = RExC_start + cb->end;
8827                 if (!SIZE_ONLY) {
8828                     OP *o = cb->block;
8829                     if (cb->src_regex) {
8830                         n = add_data(pRExC_state, 2, "rl");
8831                         RExC_rxi->data->data[n] =
8832                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8833                         RExC_rxi->data->data[n+1] = (void*)o;
8834                     }
8835                     else {
8836                         n = add_data(pRExC_state, 1,
8837                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8838                         RExC_rxi->data->data[n] = (void*)o;
8839                     }
8840                 }
8841                 pRExC_state->code_index++;
8842                 nextchar(pRExC_state);
8843
8844                 if (is_logical) {
8845                     regnode *eval;
8846                     ret = reg_node(pRExC_state, LOGICAL);
8847                     eval = reganode(pRExC_state, EVAL, n);
8848                     if (!SIZE_ONLY) {
8849                         ret->flags = 2;
8850                         /* for later propagation into (??{}) return value */
8851                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8852                     }
8853                     REGTAIL(pRExC_state, ret, eval);
8854                     /* deal with the length of this later - MJD */
8855                     return ret;
8856                 }
8857                 ret = reganode(pRExC_state, EVAL, n);
8858                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8859                 Set_Node_Offset(ret, parse_start);
8860                 return ret;
8861             }
8862             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8863             {
8864                 int is_define= 0;
8865                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8866                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8867                         || RExC_parse[1] == '<'
8868                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8869                         I32 flag;
8870
8871                         ret = reg_node(pRExC_state, LOGICAL);
8872                         if (!SIZE_ONLY)
8873                             ret->flags = 1;
8874                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8875                         goto insert_if;
8876                     }
8877                 }
8878                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8879                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8880                 {
8881                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8882                     char *name_start= RExC_parse++;
8883                     U32 num = 0;
8884                     SV *sv_dat=reg_scan_name(pRExC_state,
8885                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8886                     if (RExC_parse == name_start || *RExC_parse != ch)
8887                         vFAIL2("Sequence (?(%c... not terminated",
8888                             (ch == '>' ? '<' : ch));
8889                     RExC_parse++;
8890                     if (!SIZE_ONLY) {
8891                         num = add_data( pRExC_state, 1, "S" );
8892                         RExC_rxi->data->data[num]=(void*)sv_dat;
8893                         SvREFCNT_inc_simple_void(sv_dat);
8894                     }
8895                     ret = reganode(pRExC_state,NGROUPP,num);
8896                     goto insert_if_check_paren;
8897                 }
8898                 else if (RExC_parse[0] == 'D' &&
8899                          RExC_parse[1] == 'E' &&
8900                          RExC_parse[2] == 'F' &&
8901                          RExC_parse[3] == 'I' &&
8902                          RExC_parse[4] == 'N' &&
8903                          RExC_parse[5] == 'E')
8904                 {
8905                     ret = reganode(pRExC_state,DEFINEP,0);
8906                     RExC_parse +=6 ;
8907                     is_define = 1;
8908                     goto insert_if_check_paren;
8909                 }
8910                 else if (RExC_parse[0] == 'R') {
8911                     RExC_parse++;
8912                     parno = 0;
8913                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8914                         parno = atoi(RExC_parse++);
8915                         while (isDIGIT(*RExC_parse))
8916                             RExC_parse++;
8917                     } else if (RExC_parse[0] == '&') {
8918                         SV *sv_dat;
8919                         RExC_parse++;
8920                         sv_dat = reg_scan_name(pRExC_state,
8921                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8922                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8923                     }
8924                     ret = reganode(pRExC_state,INSUBP,parno);
8925                     goto insert_if_check_paren;
8926                 }
8927                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8928                     /* (?(1)...) */
8929                     char c;
8930                     parno = atoi(RExC_parse++);
8931
8932                     while (isDIGIT(*RExC_parse))
8933                         RExC_parse++;
8934                     ret = reganode(pRExC_state, GROUPP, parno);
8935
8936                  insert_if_check_paren:
8937                     if ((c = *nextchar(pRExC_state)) != ')')
8938                         vFAIL("Switch condition not recognized");
8939                   insert_if:
8940                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8941                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8942                     if (br == NULL)
8943                         br = reganode(pRExC_state, LONGJMP, 0);
8944                     else
8945                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8946                     c = *nextchar(pRExC_state);
8947                     if (flags&HASWIDTH)
8948                         *flagp |= HASWIDTH;
8949                     if (c == '|') {
8950                         if (is_define)
8951                             vFAIL("(?(DEFINE)....) does not allow branches");
8952                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8953                         regbranch(pRExC_state, &flags, 1,depth+1);
8954                         REGTAIL(pRExC_state, ret, lastbr);
8955                         if (flags&HASWIDTH)
8956                             *flagp |= HASWIDTH;
8957                         c = *nextchar(pRExC_state);
8958                     }
8959                     else
8960                         lastbr = NULL;
8961                     if (c != ')')
8962                         vFAIL("Switch (?(condition)... contains too many branches");
8963                     ender = reg_node(pRExC_state, TAIL);
8964                     REGTAIL(pRExC_state, br, ender);
8965                     if (lastbr) {
8966                         REGTAIL(pRExC_state, lastbr, ender);
8967                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8968                     }
8969                     else
8970                         REGTAIL(pRExC_state, ret, ender);
8971                     RExC_size++; /* XXX WHY do we need this?!!
8972                                     For large programs it seems to be required
8973                                     but I can't figure out why. -- dmq*/
8974                     return ret;
8975                 }
8976                 else {
8977                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8978                 }
8979             }
8980             case 0:
8981                 RExC_parse--; /* for vFAIL to print correctly */
8982                 vFAIL("Sequence (? incomplete");
8983                 break;
8984             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8985                                        that follow */
8986                 has_use_defaults = TRUE;
8987                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8988                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8989                                                 ? REGEX_UNICODE_CHARSET
8990                                                 : REGEX_DEPENDS_CHARSET);
8991                 goto parse_flags;
8992             default:
8993                 --RExC_parse;
8994                 parse_flags:      /* (?i) */
8995             {
8996                 U32 posflags = 0, negflags = 0;
8997                 U32 *flagsp = &posflags;
8998                 char has_charset_modifier = '\0';
8999                 regex_charset cs = get_regex_charset(RExC_flags);
9000                 if (cs == REGEX_DEPENDS_CHARSET
9001                     && (RExC_utf8 || RExC_uni_semantics))
9002                 {
9003                     cs = REGEX_UNICODE_CHARSET;
9004                 }
9005
9006                 while (*RExC_parse) {
9007                     /* && strchr("iogcmsx", *RExC_parse) */
9008                     /* (?g), (?gc) and (?o) are useless here
9009                        and must be globally applied -- japhy */
9010                     switch (*RExC_parse) {
9011                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9012                     case LOCALE_PAT_MOD:
9013                         if (has_charset_modifier) {
9014                             goto excess_modifier;
9015                         }
9016                         else if (flagsp == &negflags) {
9017                             goto neg_modifier;
9018                         }
9019                         cs = REGEX_LOCALE_CHARSET;
9020                         has_charset_modifier = LOCALE_PAT_MOD;
9021                         RExC_contains_locale = 1;
9022                         break;
9023                     case UNICODE_PAT_MOD:
9024                         if (has_charset_modifier) {
9025                             goto excess_modifier;
9026                         }
9027                         else if (flagsp == &negflags) {
9028                             goto neg_modifier;
9029                         }
9030                         cs = REGEX_UNICODE_CHARSET;
9031                         has_charset_modifier = UNICODE_PAT_MOD;
9032                         break;
9033                     case ASCII_RESTRICT_PAT_MOD:
9034                         if (flagsp == &negflags) {
9035                             goto neg_modifier;
9036                         }
9037                         if (has_charset_modifier) {
9038                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9039                                 goto excess_modifier;
9040                             }
9041                             /* Doubled modifier implies more restricted */
9042                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9043                         }
9044                         else {
9045                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
9046                         }
9047                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9048                         break;
9049                     case DEPENDS_PAT_MOD:
9050                         if (has_use_defaults) {
9051                             goto fail_modifiers;
9052                         }
9053                         else if (flagsp == &negflags) {
9054                             goto neg_modifier;
9055                         }
9056                         else if (has_charset_modifier) {
9057                             goto excess_modifier;
9058                         }
9059
9060                         /* The dual charset means unicode semantics if the
9061                          * pattern (or target, not known until runtime) are
9062                          * utf8, or something in the pattern indicates unicode
9063                          * semantics */
9064                         cs = (RExC_utf8 || RExC_uni_semantics)
9065                              ? REGEX_UNICODE_CHARSET
9066                              : REGEX_DEPENDS_CHARSET;
9067                         has_charset_modifier = DEPENDS_PAT_MOD;
9068                         break;
9069                     excess_modifier:
9070                         RExC_parse++;
9071                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9072                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9073                         }
9074                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9075                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9076                         }
9077                         else {
9078                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9079                         }
9080                         /*NOTREACHED*/
9081                     neg_modifier:
9082                         RExC_parse++;
9083                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9084                         /*NOTREACHED*/
9085                     case ONCE_PAT_MOD: /* 'o' */
9086                     case GLOBAL_PAT_MOD: /* 'g' */
9087                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9088                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9089                             if (! (wastedflags & wflagbit) ) {
9090                                 wastedflags |= wflagbit;
9091                                 vWARN5(
9092                                     RExC_parse + 1,
9093                                     "Useless (%s%c) - %suse /%c modifier",
9094                                     flagsp == &negflags ? "?-" : "?",
9095                                     *RExC_parse,
9096                                     flagsp == &negflags ? "don't " : "",
9097                                     *RExC_parse
9098                                 );
9099                             }
9100                         }
9101                         break;
9102
9103                     case CONTINUE_PAT_MOD: /* 'c' */
9104                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9105                             if (! (wastedflags & WASTED_C) ) {
9106                                 wastedflags |= WASTED_GC;
9107                                 vWARN3(
9108                                     RExC_parse + 1,
9109                                     "Useless (%sc) - %suse /gc modifier",
9110                                     flagsp == &negflags ? "?-" : "?",
9111                                     flagsp == &negflags ? "don't " : ""
9112                                 );
9113                             }
9114                         }
9115                         break;
9116                     case KEEPCOPY_PAT_MOD: /* 'p' */
9117                         if (flagsp == &negflags) {
9118                             if (SIZE_ONLY)
9119                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9120                         } else {
9121                             *flagsp |= RXf_PMf_KEEPCOPY;
9122                         }
9123                         break;
9124                     case '-':
9125                         /* A flag is a default iff it is following a minus, so
9126                          * if there is a minus, it means will be trying to
9127                          * re-specify a default which is an error */
9128                         if (has_use_defaults || flagsp == &negflags) {
9129             fail_modifiers:
9130                             RExC_parse++;
9131                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9132                             /*NOTREACHED*/
9133                         }
9134                         flagsp = &negflags;
9135                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9136                         break;
9137                     case ':':
9138                         paren = ':';
9139                         /*FALLTHROUGH*/
9140                     case ')':
9141                         RExC_flags |= posflags;
9142                         RExC_flags &= ~negflags;
9143                         set_regex_charset(&RExC_flags, cs);
9144                         if (paren != ':') {
9145                             oregflags |= posflags;
9146                             oregflags &= ~negflags;
9147                             set_regex_charset(&oregflags, cs);
9148                         }
9149                         nextchar(pRExC_state);
9150                         if (paren != ':') {
9151                             *flagp = TRYAGAIN;
9152                             return NULL;
9153                         } else {
9154                             ret = NULL;
9155                             goto parse_rest;
9156                         }
9157                         /*NOTREACHED*/
9158                     default:
9159                         RExC_parse++;
9160                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9161                         /*NOTREACHED*/
9162                     }
9163                     ++RExC_parse;
9164                 }
9165             }} /* one for the default block, one for the switch */
9166         }
9167         else {                  /* (...) */
9168           capturing_parens:
9169             parno = RExC_npar;
9170             RExC_npar++;
9171
9172             ret = reganode(pRExC_state, OPEN, parno);
9173             if (!SIZE_ONLY ){
9174                 if (!RExC_nestroot)
9175                     RExC_nestroot = parno;
9176                 if (RExC_seen & REG_SEEN_RECURSE
9177                     && !RExC_open_parens[parno-1])
9178                 {
9179                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9180                         "Setting open paren #%"IVdf" to %d\n",
9181                         (IV)parno, REG_NODE_NUM(ret)));
9182                     RExC_open_parens[parno-1]= ret;
9183                 }
9184             }
9185             Set_Node_Length(ret, 1); /* MJD */
9186             Set_Node_Offset(ret, RExC_parse); /* MJD */
9187             is_open = 1;
9188         }
9189     }
9190     else                        /* ! paren */
9191         ret = NULL;
9192
9193    parse_rest:
9194     /* Pick up the branches, linking them together. */
9195     parse_start = RExC_parse;   /* MJD */
9196     br = regbranch(pRExC_state, &flags, 1,depth+1);
9197
9198     /*     branch_len = (paren != 0); */
9199
9200     if (br == NULL)
9201         return(NULL);
9202     if (*RExC_parse == '|') {
9203         if (!SIZE_ONLY && RExC_extralen) {
9204             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9205         }
9206         else {                  /* MJD */
9207             reginsert(pRExC_state, BRANCH, br, depth+1);
9208             Set_Node_Length(br, paren != 0);
9209             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9210         }
9211         have_branch = 1;
9212         if (SIZE_ONLY)
9213             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9214     }
9215     else if (paren == ':') {
9216         *flagp |= flags&SIMPLE;
9217     }
9218     if (is_open) {                              /* Starts with OPEN. */
9219         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9220     }
9221     else if (paren != '?')              /* Not Conditional */
9222         ret = br;
9223     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9224     lastbr = br;
9225     while (*RExC_parse == '|') {
9226         if (!SIZE_ONLY && RExC_extralen) {
9227             ender = reganode(pRExC_state, LONGJMP,0);
9228             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9229         }
9230         if (SIZE_ONLY)
9231             RExC_extralen += 2;         /* Account for LONGJMP. */
9232         nextchar(pRExC_state);
9233         if (freeze_paren) {
9234             if (RExC_npar > after_freeze)
9235                 after_freeze = RExC_npar;
9236             RExC_npar = freeze_paren;
9237         }
9238         br = regbranch(pRExC_state, &flags, 0, depth+1);
9239
9240         if (br == NULL)
9241             return(NULL);
9242         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9243         lastbr = br;
9244         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9245     }
9246
9247     if (have_branch || paren != ':') {
9248         /* Make a closing node, and hook it on the end. */
9249         switch (paren) {
9250         case ':':
9251             ender = reg_node(pRExC_state, TAIL);
9252             break;
9253         case 1:
9254             ender = reganode(pRExC_state, CLOSE, parno);
9255             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9256                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9257                         "Setting close paren #%"IVdf" to %d\n",
9258                         (IV)parno, REG_NODE_NUM(ender)));
9259                 RExC_close_parens[parno-1]= ender;
9260                 if (RExC_nestroot == parno)
9261                     RExC_nestroot = 0;
9262             }
9263             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9264             Set_Node_Length(ender,1); /* MJD */
9265             break;
9266         case '<':
9267         case ',':
9268         case '=':
9269         case '!':
9270             *flagp &= ~HASWIDTH;
9271             /* FALL THROUGH */
9272         case '>':
9273             ender = reg_node(pRExC_state, SUCCEED);
9274             break;
9275         case 0:
9276             ender = reg_node(pRExC_state, END);
9277             if (!SIZE_ONLY) {
9278                 assert(!RExC_opend); /* there can only be one! */
9279                 RExC_opend = ender;
9280             }
9281             break;
9282         }
9283         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9284             SV * const mysv_val1=sv_newmortal();
9285             SV * const mysv_val2=sv_newmortal();
9286             DEBUG_PARSE_MSG("lsbr");
9287             regprop(RExC_rx, mysv_val1, lastbr);
9288             regprop(RExC_rx, mysv_val2, ender);
9289             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9290                           SvPV_nolen_const(mysv_val1),
9291                           (IV)REG_NODE_NUM(lastbr),
9292                           SvPV_nolen_const(mysv_val2),
9293                           (IV)REG_NODE_NUM(ender),
9294                           (IV)(ender - lastbr)
9295             );
9296         });
9297         REGTAIL(pRExC_state, lastbr, ender);
9298
9299         if (have_branch && !SIZE_ONLY) {
9300             char is_nothing= 1;
9301             if (depth==1)
9302                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9303
9304             /* Hook the tails of the branches to the closing node. */
9305             for (br = ret; br; br = regnext(br)) {
9306                 const U8 op = PL_regkind[OP(br)];
9307                 if (op == BRANCH) {
9308                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9309                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9310                         is_nothing= 0;
9311                 }
9312                 else if (op == BRANCHJ) {
9313                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9314                     /* for now we always disable this optimisation * /
9315                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9316                     */
9317                         is_nothing= 0;
9318                 }
9319             }
9320             if (is_nothing) {
9321                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9322                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9323                     SV * const mysv_val1=sv_newmortal();
9324                     SV * const mysv_val2=sv_newmortal();
9325                     DEBUG_PARSE_MSG("NADA");
9326                     regprop(RExC_rx, mysv_val1, ret);
9327                     regprop(RExC_rx, mysv_val2, ender);
9328                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9329                                   SvPV_nolen_const(mysv_val1),
9330                                   (IV)REG_NODE_NUM(ret),
9331                                   SvPV_nolen_const(mysv_val2),
9332                                   (IV)REG_NODE_NUM(ender),
9333                                   (IV)(ender - ret)
9334                     );
9335                 });
9336                 OP(br)= NOTHING;
9337                 if (OP(ender) == TAIL) {
9338                     NEXT_OFF(br)= 0;
9339                     RExC_emit= br + 1;
9340                 } else {
9341                     regnode *opt;
9342                     for ( opt= br + 1; opt < ender ; opt++ )
9343                         OP(opt)= OPTIMIZED;
9344                     NEXT_OFF(br)= ender - br;
9345                 }
9346             }
9347         }
9348     }
9349
9350     {
9351         const char *p;
9352         static const char parens[] = "=!<,>";
9353
9354         if (paren && (p = strchr(parens, paren))) {
9355             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9356             int flag = (p - parens) > 1;
9357
9358             if (paren == '>')
9359                 node = SUSPEND, flag = 0;
9360             reginsert(pRExC_state, node,ret, depth+1);
9361             Set_Node_Cur_Length(ret);
9362             Set_Node_Offset(ret, parse_start + 1);
9363             ret->flags = flag;
9364             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9365         }
9366     }
9367
9368     /* Check for proper termination. */
9369     if (paren) {
9370         RExC_flags = oregflags;
9371         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9372             RExC_parse = oregcomp_parse;
9373             vFAIL("Unmatched (");
9374         }
9375     }
9376     else if (!paren && RExC_parse < RExC_end) {
9377         if (*RExC_parse == ')') {
9378             RExC_parse++;
9379             vFAIL("Unmatched )");
9380         }
9381         else
9382             FAIL("Junk on end of regexp");      /* "Can't happen". */
9383         assert(0); /* NOTREACHED */
9384     }
9385
9386     if (RExC_in_lookbehind) {
9387         RExC_in_lookbehind--;
9388     }
9389     if (after_freeze > RExC_npar)
9390         RExC_npar = after_freeze;
9391     return(ret);
9392 }
9393
9394 /*
9395  - regbranch - one alternative of an | operator
9396  *
9397  * Implements the concatenation operator.
9398  */
9399 STATIC regnode *
9400 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9401 {
9402     dVAR;
9403     regnode *ret;
9404     regnode *chain = NULL;
9405     regnode *latest;
9406     I32 flags = 0, c = 0;
9407     GET_RE_DEBUG_FLAGS_DECL;
9408
9409     PERL_ARGS_ASSERT_REGBRANCH;
9410
9411     DEBUG_PARSE("brnc");
9412
9413     if (first)
9414         ret = NULL;
9415     else {
9416         if (!SIZE_ONLY && RExC_extralen)
9417             ret = reganode(pRExC_state, BRANCHJ,0);
9418         else {
9419             ret = reg_node(pRExC_state, BRANCH);
9420             Set_Node_Length(ret, 1);
9421         }
9422     }
9423
9424     if (!first && SIZE_ONLY)
9425         RExC_extralen += 1;                     /* BRANCHJ */
9426
9427     *flagp = WORST;                     /* Tentatively. */
9428
9429     RExC_parse--;
9430     nextchar(pRExC_state);
9431     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9432         flags &= ~TRYAGAIN;
9433         latest = regpiece(pRExC_state, &flags,depth+1);
9434         if (latest == NULL) {
9435             if (flags & TRYAGAIN)
9436                 continue;
9437             return(NULL);
9438         }
9439         else if (ret == NULL)
9440             ret = latest;
9441         *flagp |= flags&(HASWIDTH|POSTPONED);
9442         if (chain == NULL)      /* First piece. */
9443             *flagp |= flags&SPSTART;
9444         else {
9445             RExC_naughty++;
9446             REGTAIL(pRExC_state, chain, latest);
9447         }
9448         chain = latest;
9449         c++;
9450     }
9451     if (chain == NULL) {        /* Loop ran zero times. */
9452         chain = reg_node(pRExC_state, NOTHING);
9453         if (ret == NULL)
9454             ret = chain;
9455     }
9456     if (c == 1) {
9457         *flagp |= flags&SIMPLE;
9458     }
9459
9460     return ret;
9461 }
9462
9463 /*
9464  - regpiece - something followed by possible [*+?]
9465  *
9466  * Note that the branching code sequences used for ? and the general cases
9467  * of * and + are somewhat optimized:  they use the same NOTHING node as
9468  * both the endmarker for their branch list and the body of the last branch.
9469  * It might seem that this node could be dispensed with entirely, but the
9470  * endmarker role is not redundant.
9471  */
9472 STATIC regnode *
9473 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9474 {
9475     dVAR;
9476     regnode *ret;
9477     char op;
9478     char *next;
9479     I32 flags;
9480     const char * const origparse = RExC_parse;
9481     I32 min;
9482     I32 max = REG_INFTY;
9483 #ifdef RE_TRACK_PATTERN_OFFSETS
9484     char *parse_start;
9485 #endif
9486     const char *maxpos = NULL;
9487
9488     /* Save the original in case we change the emitted regop to a FAIL. */
9489     regnode * const orig_emit = RExC_emit;
9490
9491     GET_RE_DEBUG_FLAGS_DECL;
9492
9493     PERL_ARGS_ASSERT_REGPIECE;
9494
9495     DEBUG_PARSE("piec");
9496
9497     ret = regatom(pRExC_state, &flags,depth+1);
9498     if (ret == NULL) {
9499         if (flags & TRYAGAIN)
9500             *flagp |= TRYAGAIN;
9501         return(NULL);
9502     }
9503
9504     op = *RExC_parse;
9505
9506     if (op == '{' && regcurly(RExC_parse)) {
9507         maxpos = NULL;
9508 #ifdef RE_TRACK_PATTERN_OFFSETS
9509         parse_start = RExC_parse; /* MJD */
9510 #endif
9511         next = RExC_parse + 1;
9512         while (isDIGIT(*next) || *next == ',') {
9513             if (*next == ',') {
9514                 if (maxpos)
9515                     break;
9516                 else
9517                     maxpos = next;
9518             }
9519             next++;
9520         }
9521         if (*next == '}') {             /* got one */
9522             if (!maxpos)
9523                 maxpos = next;
9524             RExC_parse++;
9525             min = atoi(RExC_parse);
9526             if (*maxpos == ',')
9527                 maxpos++;
9528             else
9529                 maxpos = RExC_parse;
9530             max = atoi(maxpos);
9531             if (!max && *maxpos != '0')
9532                 max = REG_INFTY;                /* meaning "infinity" */
9533             else if (max >= REG_INFTY)
9534                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9535             RExC_parse = next;
9536             nextchar(pRExC_state);
9537             if (max < min) {    /* If can't match, warn and optimize to fail
9538                                    unconditionally */
9539                 if (SIZE_ONLY) {
9540                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9541
9542                     /* We can't back off the size because we have to reserve
9543                      * enough space for all the things we are about to throw
9544                      * away, but we can shrink it by the ammount we are about
9545                      * to re-use here */
9546                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9547                 }
9548                 else {
9549                     RExC_emit = orig_emit;
9550                 }
9551                 ret = reg_node(pRExC_state, OPFAIL);
9552                 return ret;
9553             }
9554
9555         do_curly:
9556             if ((flags&SIMPLE)) {
9557                 RExC_naughty += 2 + RExC_naughty / 2;
9558                 reginsert(pRExC_state, CURLY, ret, depth+1);
9559                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9560                 Set_Node_Cur_Length(ret);
9561             }
9562             else {
9563                 regnode * const w = reg_node(pRExC_state, WHILEM);
9564
9565                 w->flags = 0;
9566                 REGTAIL(pRExC_state, ret, w);
9567                 if (!SIZE_ONLY && RExC_extralen) {
9568                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9569                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9570                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9571                 }
9572                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9573                                 /* MJD hk */
9574                 Set_Node_Offset(ret, parse_start+1);
9575                 Set_Node_Length(ret,
9576                                 op == '{' ? (RExC_parse - parse_start) : 1);
9577
9578                 if (!SIZE_ONLY && RExC_extralen)
9579                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9580                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9581                 if (SIZE_ONLY)
9582                     RExC_whilem_seen++, RExC_extralen += 3;
9583                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9584             }
9585             ret->flags = 0;
9586
9587             if (min > 0)
9588                 *flagp = WORST;
9589             if (max > 0)
9590                 *flagp |= HASWIDTH;
9591             if (!SIZE_ONLY) {
9592                 ARG1_SET(ret, (U16)min);
9593                 ARG2_SET(ret, (U16)max);
9594             }
9595
9596             goto nest_check;
9597         }
9598     }
9599
9600     if (!ISMULT1(op)) {
9601         *flagp = flags;
9602         return(ret);
9603     }
9604
9605 #if 0                           /* Now runtime fix should be reliable. */
9606
9607     /* if this is reinstated, don't forget to put this back into perldiag:
9608
9609             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9610
9611            (F) The part of the regexp subject to either the * or + quantifier
9612            could match an empty string. The {#} shows in the regular
9613            expression about where the problem was discovered.
9614
9615     */
9616
9617     if (!(flags&HASWIDTH) && op != '?')
9618       vFAIL("Regexp *+ operand could be empty");
9619 #endif
9620
9621 #ifdef RE_TRACK_PATTERN_OFFSETS
9622     parse_start = RExC_parse;
9623 #endif
9624     nextchar(pRExC_state);
9625
9626     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9627
9628     if (op == '*' && (flags&SIMPLE)) {
9629         reginsert(pRExC_state, STAR, ret, depth+1);
9630         ret->flags = 0;
9631         RExC_naughty += 4;
9632     }
9633     else if (op == '*') {
9634         min = 0;
9635         goto do_curly;
9636     }
9637     else if (op == '+' && (flags&SIMPLE)) {
9638         reginsert(pRExC_state, PLUS, ret, depth+1);
9639         ret->flags = 0;
9640         RExC_naughty += 3;
9641     }
9642     else if (op == '+') {
9643         min = 1;
9644         goto do_curly;
9645     }
9646     else if (op == '?') {
9647         min = 0; max = 1;
9648         goto do_curly;
9649     }
9650   nest_check:
9651     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9652         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9653         ckWARN3reg(RExC_parse,
9654                    "%.*s matches null string many times",
9655                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9656                    origparse);
9657         ReREFCNT_inc(RExC_rx_sv);
9658     }
9659
9660     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9661         nextchar(pRExC_state);
9662         reginsert(pRExC_state, MINMOD, ret, depth+1);
9663         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9664     }
9665 #ifndef REG_ALLOW_MINMOD_SUSPEND
9666     else
9667 #endif
9668     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9669         regnode *ender;
9670         nextchar(pRExC_state);
9671         ender = reg_node(pRExC_state, SUCCEED);
9672         REGTAIL(pRExC_state, ret, ender);
9673         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9674         ret->flags = 0;
9675         ender = reg_node(pRExC_state, TAIL);
9676         REGTAIL(pRExC_state, ret, ender);
9677         /*ret= ender;*/
9678     }
9679
9680     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9681         RExC_parse++;
9682         vFAIL("Nested quantifiers");
9683     }
9684
9685     return(ret);
9686 }
9687
9688 STATIC bool
9689 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9690 {
9691
9692  /* This is expected to be called by a parser routine that has recognized '\N'
9693    and needs to handle the rest. RExC_parse is expected to point at the first
9694    char following the N at the time of the call.  On successful return,
9695    RExC_parse has been updated to point to just after the sequence identified
9696    by this routine, and <*flagp> has been updated.
9697
9698    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9699    character class.
9700
9701    \N may begin either a named sequence, or if outside a character class, mean
9702    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9703    attempted to decide which, and in the case of a named sequence, converted it
9704    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9705    where c1... are the characters in the sequence.  For single-quoted regexes,
9706    the tokenizer passes the \N sequence through unchanged; this code will not
9707    attempt to determine this nor expand those, instead raising a syntax error.
9708    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9709    or there is no '}', it signals that this \N occurrence means to match a
9710    non-newline.
9711
9712    Only the \N{U+...} form should occur in a character class, for the same
9713    reason that '.' inside a character class means to just match a period: it
9714    just doesn't make sense.
9715
9716    The function raises an error (via vFAIL), and doesn't return for various
9717    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9718    success; it returns FALSE otherwise.
9719
9720    If <valuep> is non-null, it means the caller can accept an input sequence
9721    consisting of a just a single code point; <*valuep> is set to that value
9722    if the input is such.
9723
9724    If <node_p> is non-null it signifies that the caller can accept any other
9725    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9726    is set as follows:
9727     1) \N means not-a-NL: points to a newly created REG_ANY node;
9728     2) \N{}:              points to a new NOTHING node;
9729     3) otherwise:         points to a new EXACT node containing the resolved
9730                           string.
9731    Note that FALSE is returned for single code point sequences if <valuep> is
9732    null.
9733  */
9734
9735     char * endbrace;    /* '}' following the name */
9736     char* p;
9737     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9738                            stream */
9739     bool has_multiple_chars; /* true if the input stream contains a sequence of
9740                                 more than one character */
9741
9742     GET_RE_DEBUG_FLAGS_DECL;
9743
9744     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9745
9746     GET_RE_DEBUG_FLAGS;
9747
9748     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9749
9750     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9751      * modifier.  The other meaning does not */
9752     p = (RExC_flags & RXf_PMf_EXTENDED)
9753         ? regwhite( pRExC_state, RExC_parse )
9754         : RExC_parse;
9755
9756     /* Disambiguate between \N meaning a named character versus \N meaning
9757      * [^\n].  The former is assumed when it can't be the latter. */
9758     if (*p != '{' || regcurly(p)) {
9759         RExC_parse = p;
9760         if (! node_p) {
9761             /* no bare \N in a charclass */
9762             if (in_char_class) {
9763                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9764             }
9765             return FALSE;
9766         }
9767         nextchar(pRExC_state);
9768         *node_p = reg_node(pRExC_state, REG_ANY);
9769         *flagp |= HASWIDTH|SIMPLE;
9770         RExC_naughty++;
9771         RExC_parse--;
9772         Set_Node_Length(*node_p, 1); /* MJD */
9773         return TRUE;
9774     }
9775
9776     /* Here, we have decided it should be a named character or sequence */
9777
9778     /* The test above made sure that the next real character is a '{', but
9779      * under the /x modifier, it could be separated by space (or a comment and
9780      * \n) and this is not allowed (for consistency with \x{...} and the
9781      * tokenizer handling of \N{NAME}). */
9782     if (*RExC_parse != '{') {
9783         vFAIL("Missing braces on \\N{}");
9784     }
9785
9786     RExC_parse++;       /* Skip past the '{' */
9787
9788     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9789         || ! (endbrace == RExC_parse            /* nothing between the {} */
9790               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9791                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9792     {
9793         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9794         vFAIL("\\N{NAME} must be resolved by the lexer");
9795     }
9796
9797     if (endbrace == RExC_parse) {   /* empty: \N{} */
9798         bool ret = TRUE;
9799         if (node_p) {
9800             *node_p = reg_node(pRExC_state,NOTHING);
9801         }
9802         else if (in_char_class) {
9803             if (SIZE_ONLY && in_char_class) {
9804                 ckWARNreg(RExC_parse,
9805                         "Ignoring zero length \\N{} in character class"
9806                 );
9807             }
9808             ret = FALSE;
9809         }
9810         else {
9811             return FALSE;
9812         }
9813         nextchar(pRExC_state);
9814         return ret;
9815     }
9816
9817     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9818     RExC_parse += 2;    /* Skip past the 'U+' */
9819
9820     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9821
9822     /* Code points are separated by dots.  If none, there is only one code
9823      * point, and is terminated by the brace */
9824     has_multiple_chars = (endchar < endbrace);
9825
9826     if (valuep && (! has_multiple_chars || in_char_class)) {
9827         /* We only pay attention to the first char of
9828         multichar strings being returned in char classes. I kinda wonder
9829         if this makes sense as it does change the behaviour
9830         from earlier versions, OTOH that behaviour was broken
9831         as well. XXX Solution is to recharacterize as
9832         [rest-of-class]|multi1|multi2... */
9833
9834         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9835         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9836             | PERL_SCAN_DISALLOW_PREFIX
9837             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9838
9839         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9840
9841         /* The tokenizer should have guaranteed validity, but it's possible to
9842          * bypass it by using single quoting, so check */
9843         if (length_of_hex == 0
9844             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9845         {
9846             RExC_parse += length_of_hex;        /* Includes all the valid */
9847             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9848                             ? UTF8SKIP(RExC_parse)
9849                             : 1;
9850             /* Guard against malformed utf8 */
9851             if (RExC_parse >= endchar) {
9852                 RExC_parse = endchar;
9853             }
9854             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9855         }
9856
9857         if (in_char_class && has_multiple_chars) {
9858             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9859         }
9860
9861         RExC_parse = endbrace + 1;
9862     }
9863     else if (! node_p || ! has_multiple_chars) {
9864
9865         /* Here, the input is legal, but not according to the caller's
9866          * options.  We fail without advancing the parse, so that the
9867          * caller can try again */
9868         RExC_parse = p;
9869         return FALSE;
9870     }
9871     else {
9872
9873         /* What is done here is to convert this to a sub-pattern of the form
9874          * (?:\x{char1}\x{char2}...)
9875          * and then call reg recursively.  That way, it retains its atomicness,
9876          * while not having to worry about special handling that some code
9877          * points may have.  toke.c has converted the original Unicode values
9878          * to native, so that we can just pass on the hex values unchanged.  We
9879          * do have to set a flag to keep recoding from happening in the
9880          * recursion */
9881
9882         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9883         STRLEN len;
9884         char *orig_end = RExC_end;
9885         I32 flags;
9886
9887         while (RExC_parse < endbrace) {
9888
9889             /* Convert to notation the rest of the code understands */
9890             sv_catpv(substitute_parse, "\\x{");
9891             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9892             sv_catpv(substitute_parse, "}");
9893
9894             /* Point to the beginning of the next character in the sequence. */
9895             RExC_parse = endchar + 1;
9896             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9897         }
9898         sv_catpv(substitute_parse, ")");
9899
9900         RExC_parse = SvPV(substitute_parse, len);
9901
9902         /* Don't allow empty number */
9903         if (len < 8) {
9904             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9905         }
9906         RExC_end = RExC_parse + len;
9907
9908         /* The values are Unicode, and therefore not subject to recoding */
9909         RExC_override_recoding = 1;
9910
9911         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9912         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9913
9914         RExC_parse = endbrace;
9915         RExC_end = orig_end;
9916         RExC_override_recoding = 0;
9917
9918         nextchar(pRExC_state);
9919     }
9920
9921     return TRUE;
9922 }
9923
9924
9925 /*
9926  * reg_recode
9927  *
9928  * It returns the code point in utf8 for the value in *encp.
9929  *    value: a code value in the source encoding
9930  *    encp:  a pointer to an Encode object
9931  *
9932  * If the result from Encode is not a single character,
9933  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9934  */
9935 STATIC UV
9936 S_reg_recode(pTHX_ const char value, SV **encp)
9937 {
9938     STRLEN numlen = 1;
9939     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9940     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9941     const STRLEN newlen = SvCUR(sv);
9942     UV uv = UNICODE_REPLACEMENT;
9943
9944     PERL_ARGS_ASSERT_REG_RECODE;
9945
9946     if (newlen)
9947         uv = SvUTF8(sv)
9948              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9949              : *(U8*)s;
9950
9951     if (!newlen || numlen != newlen) {
9952         uv = UNICODE_REPLACEMENT;
9953         *encp = NULL;
9954     }
9955     return uv;
9956 }
9957
9958 PERL_STATIC_INLINE U8
9959 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9960 {
9961     U8 op;
9962
9963     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9964
9965     if (! FOLD) {
9966         return EXACT;
9967     }
9968
9969     op = get_regex_charset(RExC_flags);
9970     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9971         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9972                  been, so there is no hole */
9973     }
9974
9975     return op + EXACTF;
9976 }
9977
9978 PERL_STATIC_INLINE void
9979 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9980 {
9981     /* This knows the details about sizing an EXACTish node, setting flags for
9982      * it (by setting <*flagp>, and potentially populating it with a single
9983      * character.
9984      *
9985      * If <len> (the length in bytes) is non-zero, this function assumes that
9986      * the node has already been populated, and just does the sizing.  In this
9987      * case <code_point> should be the final code point that has already been
9988      * placed into the node.  This value will be ignored except that under some
9989      * circumstances <*flagp> is set based on it.
9990      *
9991      * If <len> is zero, the function assumes that the node is to contain only
9992      * the single character given by <code_point> and calculates what <len>
9993      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9994      * additionally will populate the node's STRING with <code_point>, if <len>
9995      * is 0.  In both cases <*flagp> is appropriately set
9996      *
9997      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9998      * folded (the latter only when the rules indicate it can match 'ss') */
9999
10000     bool len_passed_in = cBOOL(len != 0);
10001     U8 character[UTF8_MAXBYTES_CASE+1];
10002
10003     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10004
10005     if (! len_passed_in) {
10006         if (UTF) {
10007             if (FOLD) {
10008                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10009             }
10010             else {
10011                 uvchr_to_utf8( character, code_point);
10012                 len = UTF8SKIP(character);
10013             }
10014         }
10015         else if (! FOLD
10016                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10017                  || ASCII_FOLD_RESTRICTED
10018                  || ! AT_LEAST_UNI_SEMANTICS)
10019         {
10020             *character = (U8) code_point;
10021             len = 1;
10022         }
10023         else {
10024             *character = 's';
10025             *(character + 1) = 's';
10026             len = 2;
10027         }
10028     }
10029
10030     if (SIZE_ONLY) {
10031         RExC_size += STR_SZ(len);
10032     }
10033     else {
10034         RExC_emit += STR_SZ(len);
10035         STR_LEN(node) = len;
10036         if (! len_passed_in) {
10037             Copy((char *) character, STRING(node), len, char);
10038         }
10039     }
10040
10041     *flagp |= HASWIDTH;
10042
10043     /* A single character node is SIMPLE, except for the special-cased SHARP S
10044      * under /di. */
10045     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10046         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10047             || ! FOLD || ! DEPENDS_SEMANTICS))
10048     {
10049         *flagp |= SIMPLE;
10050     }
10051 }
10052
10053 /*
10054  - regatom - the lowest level
10055
10056    Try to identify anything special at the start of the pattern. If there
10057    is, then handle it as required. This may involve generating a single regop,
10058    such as for an assertion; or it may involve recursing, such as to
10059    handle a () structure.
10060
10061    If the string doesn't start with something special then we gobble up
10062    as much literal text as we can.
10063
10064    Once we have been able to handle whatever type of thing started the
10065    sequence, we return.
10066
10067    Note: we have to be careful with escapes, as they can be both literal
10068    and special, and in the case of \10 and friends, context determines which.
10069
10070    A summary of the code structure is:
10071
10072    switch (first_byte) {
10073         cases for each special:
10074             handle this special;
10075             break;
10076         case '\\':
10077             switch (2nd byte) {
10078                 cases for each unambiguous special:
10079                     handle this special;
10080                     break;
10081                 cases for each ambigous special/literal:
10082                     disambiguate;
10083                     if (special)  handle here
10084                     else goto defchar;
10085                 default: // unambiguously literal:
10086                     goto defchar;
10087             }
10088         default:  // is a literal char
10089             // FALL THROUGH
10090         defchar:
10091             create EXACTish node for literal;
10092             while (more input and node isn't full) {
10093                 switch (input_byte) {
10094                    cases for each special;
10095                        make sure parse pointer is set so that the next call to
10096                            regatom will see this special first
10097                        goto loopdone; // EXACTish node terminated by prev. char
10098                    default:
10099                        append char to EXACTISH node;
10100                 }
10101                 get next input byte;
10102             }
10103         loopdone:
10104    }
10105    return the generated node;
10106
10107    Specifically there are two separate switches for handling
10108    escape sequences, with the one for handling literal escapes requiring
10109    a dummy entry for all of the special escapes that are actually handled
10110    by the other.
10111 */
10112
10113 STATIC regnode *
10114 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10115 {
10116     dVAR;
10117     regnode *ret = NULL;
10118     I32 flags;
10119     char *parse_start = RExC_parse;
10120     U8 op;
10121     GET_RE_DEBUG_FLAGS_DECL;
10122     DEBUG_PARSE("atom");
10123     *flagp = WORST;             /* Tentatively. */
10124
10125     PERL_ARGS_ASSERT_REGATOM;
10126
10127 tryagain:
10128     switch ((U8)*RExC_parse) {
10129     case '^':
10130         RExC_seen_zerolen++;
10131         nextchar(pRExC_state);
10132         if (RExC_flags & RXf_PMf_MULTILINE)
10133             ret = reg_node(pRExC_state, MBOL);
10134         else if (RExC_flags & RXf_PMf_SINGLELINE)
10135             ret = reg_node(pRExC_state, SBOL);
10136         else
10137             ret = reg_node(pRExC_state, BOL);
10138         Set_Node_Length(ret, 1); /* MJD */
10139         break;
10140     case '$':
10141         nextchar(pRExC_state);
10142         if (*RExC_parse)
10143             RExC_seen_zerolen++;
10144         if (RExC_flags & RXf_PMf_MULTILINE)
10145             ret = reg_node(pRExC_state, MEOL);
10146         else if (RExC_flags & RXf_PMf_SINGLELINE)
10147             ret = reg_node(pRExC_state, SEOL);
10148         else
10149             ret = reg_node(pRExC_state, EOL);
10150         Set_Node_Length(ret, 1); /* MJD */
10151         break;
10152     case '.':
10153         nextchar(pRExC_state);
10154         if (RExC_flags & RXf_PMf_SINGLELINE)
10155             ret = reg_node(pRExC_state, SANY);
10156         else
10157             ret = reg_node(pRExC_state, REG_ANY);
10158         *flagp |= HASWIDTH|SIMPLE;
10159         RExC_naughty++;
10160         Set_Node_Length(ret, 1); /* MJD */
10161         break;
10162     case '[':
10163     {
10164         char * const oregcomp_parse = ++RExC_parse;
10165         ret = regclass(pRExC_state, flagp,depth+1);
10166         if (*RExC_parse != ']') {
10167             RExC_parse = oregcomp_parse;
10168             vFAIL("Unmatched [");
10169         }
10170         nextchar(pRExC_state);
10171         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10172         break;
10173     }
10174     case '(':
10175         nextchar(pRExC_state);
10176         ret = reg(pRExC_state, 1, &flags,depth+1);
10177         if (ret == NULL) {
10178                 if (flags & TRYAGAIN) {
10179                     if (RExC_parse == RExC_end) {
10180                          /* Make parent create an empty node if needed. */
10181                         *flagp |= TRYAGAIN;
10182                         return(NULL);
10183                     }
10184                     goto tryagain;
10185                 }
10186                 return(NULL);
10187         }
10188         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10189         break;
10190     case '|':
10191     case ')':
10192         if (flags & TRYAGAIN) {
10193             *flagp |= TRYAGAIN;
10194             return NULL;
10195         }
10196         vFAIL("Internal urp");
10197                                 /* Supposed to be caught earlier. */
10198         break;
10199     case '?':
10200     case '+':
10201     case '*':
10202         RExC_parse++;
10203         vFAIL("Quantifier follows nothing");
10204         break;
10205     case '\\':
10206         /* Special Escapes
10207
10208            This switch handles escape sequences that resolve to some kind
10209            of special regop and not to literal text. Escape sequnces that
10210            resolve to literal text are handled below in the switch marked
10211            "Literal Escapes".
10212
10213            Every entry in this switch *must* have a corresponding entry
10214            in the literal escape switch. However, the opposite is not
10215            required, as the default for this switch is to jump to the
10216            literal text handling code.
10217         */
10218         switch ((U8)*++RExC_parse) {
10219         /* Special Escapes */
10220         case 'A':
10221             RExC_seen_zerolen++;
10222             ret = reg_node(pRExC_state, SBOL);
10223             *flagp |= SIMPLE;
10224             goto finish_meta_pat;
10225         case 'G':
10226             ret = reg_node(pRExC_state, GPOS);
10227             RExC_seen |= REG_SEEN_GPOS;
10228             *flagp |= SIMPLE;
10229             goto finish_meta_pat;
10230         case 'K':
10231             RExC_seen_zerolen++;
10232             ret = reg_node(pRExC_state, KEEPS);
10233             *flagp |= SIMPLE;
10234             /* XXX:dmq : disabling in-place substitution seems to
10235              * be necessary here to avoid cases of memory corruption, as
10236              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10237              */
10238             RExC_seen |= REG_SEEN_LOOKBEHIND;
10239             goto finish_meta_pat;
10240         case 'Z':
10241             ret = reg_node(pRExC_state, SEOL);
10242             *flagp |= SIMPLE;
10243             RExC_seen_zerolen++;                /* Do not optimize RE away */
10244             goto finish_meta_pat;
10245         case 'z':
10246             ret = reg_node(pRExC_state, EOS);
10247             *flagp |= SIMPLE;
10248             RExC_seen_zerolen++;                /* Do not optimize RE away */
10249             goto finish_meta_pat;
10250         case 'C':
10251             ret = reg_node(pRExC_state, CANY);
10252             RExC_seen |= REG_SEEN_CANY;
10253             *flagp |= HASWIDTH|SIMPLE;
10254             goto finish_meta_pat;
10255         case 'X':
10256             ret = reg_node(pRExC_state, CLUMP);
10257             *flagp |= HASWIDTH;
10258             goto finish_meta_pat;
10259         case 'w':
10260             op = ALNUM + get_regex_charset(RExC_flags);
10261             if (op > ALNUMA) {  /* /aa is same as /a */
10262                 op = ALNUMA;
10263             }
10264             ret = reg_node(pRExC_state, op);
10265             *flagp |= HASWIDTH|SIMPLE;
10266             goto finish_meta_pat;
10267         case 'W':
10268             op = NALNUM + get_regex_charset(RExC_flags);
10269             if (op > NALNUMA) { /* /aa is same as /a */
10270                 op = NALNUMA;
10271             }
10272             ret = reg_node(pRExC_state, op);
10273             *flagp |= HASWIDTH|SIMPLE;
10274             goto finish_meta_pat;
10275         case 'b':
10276             RExC_seen_zerolen++;
10277             RExC_seen |= REG_SEEN_LOOKBEHIND;
10278             op = BOUND + get_regex_charset(RExC_flags);
10279             if (op > BOUNDA) {  /* /aa is same as /a */
10280                 op = BOUNDA;
10281             }
10282             ret = reg_node(pRExC_state, op);
10283             FLAGS(ret) = get_regex_charset(RExC_flags);
10284             *flagp |= SIMPLE;
10285             goto finish_meta_pat;
10286         case 'B':
10287             RExC_seen_zerolen++;
10288             RExC_seen |= REG_SEEN_LOOKBEHIND;
10289             op = NBOUND + get_regex_charset(RExC_flags);
10290             if (op > NBOUNDA) { /* /aa is same as /a */
10291                 op = NBOUNDA;
10292             }
10293             ret = reg_node(pRExC_state, op);
10294             FLAGS(ret) = get_regex_charset(RExC_flags);
10295             *flagp |= SIMPLE;
10296             goto finish_meta_pat;
10297         case 's':
10298             op = SPACE + get_regex_charset(RExC_flags);
10299             if (op > SPACEA) {  /* /aa is same as /a */
10300                 op = SPACEA;
10301             }
10302             ret = reg_node(pRExC_state, op);
10303             *flagp |= HASWIDTH|SIMPLE;
10304             goto finish_meta_pat;
10305         case 'S':
10306             op = NSPACE + get_regex_charset(RExC_flags);
10307             if (op > NSPACEA) { /* /aa is same as /a */
10308                 op = NSPACEA;
10309             }
10310             ret = reg_node(pRExC_state, op);
10311             *flagp |= HASWIDTH|SIMPLE;
10312             goto finish_meta_pat;
10313         case 'D':
10314             op = NDIGIT;
10315             goto join_D_and_d;
10316         case 'd':
10317             op = DIGIT;
10318         join_D_and_d:
10319             {
10320                 U8 offset = get_regex_charset(RExC_flags);
10321                 if (offset == REGEX_UNICODE_CHARSET) {
10322                     offset = REGEX_DEPENDS_CHARSET;
10323                 }
10324                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10325                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10326                 }
10327                 op += offset;
10328             }
10329             ret = reg_node(pRExC_state, op);
10330             *flagp |= HASWIDTH|SIMPLE;
10331             goto finish_meta_pat;
10332         case 'R':
10333             ret = reg_node(pRExC_state, LNBREAK);
10334             *flagp |= HASWIDTH|SIMPLE;
10335             goto finish_meta_pat;
10336         case 'h':
10337             ret = reg_node(pRExC_state, HORIZWS);
10338             *flagp |= HASWIDTH|SIMPLE;
10339             goto finish_meta_pat;
10340         case 'H':
10341             ret = reg_node(pRExC_state, NHORIZWS);
10342             *flagp |= HASWIDTH|SIMPLE;
10343             goto finish_meta_pat;
10344         case 'v':
10345             ret = reg_node(pRExC_state, VERTWS);
10346             *flagp |= HASWIDTH|SIMPLE;
10347             goto finish_meta_pat;
10348         case 'V':
10349             ret = reg_node(pRExC_state, NVERTWS);
10350             *flagp |= HASWIDTH|SIMPLE;
10351          finish_meta_pat:
10352             nextchar(pRExC_state);
10353             Set_Node_Length(ret, 2); /* MJD */
10354             break;
10355         case 'p':
10356         case 'P':
10357             {
10358                 char* const oldregxend = RExC_end;
10359 #ifdef DEBUGGING
10360                 char* parse_start = RExC_parse - 2;
10361 #endif
10362
10363                 if (RExC_parse[1] == '{') {
10364                   /* a lovely hack--pretend we saw [\pX] instead */
10365                     RExC_end = strchr(RExC_parse, '}');
10366                     if (!RExC_end) {
10367                         const U8 c = (U8)*RExC_parse;
10368                         RExC_parse += 2;
10369                         RExC_end = oldregxend;
10370                         vFAIL2("Missing right brace on \\%c{}", c);
10371                     }
10372                     RExC_end++;
10373                 }
10374                 else {
10375                     RExC_end = RExC_parse + 2;
10376                     if (RExC_end > oldregxend)
10377                         RExC_end = oldregxend;
10378                 }
10379                 RExC_parse--;
10380
10381                 ret = regclass(pRExC_state, flagp,depth+1);
10382
10383                 RExC_end = oldregxend;
10384                 RExC_parse--;
10385
10386                 Set_Node_Offset(ret, parse_start + 2);
10387                 Set_Node_Cur_Length(ret);
10388                 nextchar(pRExC_state);
10389             }
10390             break;
10391         case 'N':
10392             /* Handle \N and \N{NAME} with multiple code points here and not
10393              * below because it can be multicharacter. join_exact() will join
10394              * them up later on.  Also this makes sure that things like
10395              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10396              * The options to the grok function call causes it to fail if the
10397              * sequence is just a single code point.  We then go treat it as
10398              * just another character in the current EXACT node, and hence it
10399              * gets uniform treatment with all the other characters.  The
10400              * special treatment for quantifiers is not needed for such single
10401              * character sequences */
10402             ++RExC_parse;
10403             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10404                 RExC_parse--;
10405                 goto defchar;
10406             }
10407             break;
10408         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10409         parse_named_seq:
10410         {
10411             char ch= RExC_parse[1];
10412             if (ch != '<' && ch != '\'' && ch != '{') {
10413                 RExC_parse++;
10414                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10415             } else {
10416                 /* this pretty much dupes the code for (?P=...) in reg(), if
10417                    you change this make sure you change that */
10418                 char* name_start = (RExC_parse += 2);
10419                 U32 num = 0;
10420                 SV *sv_dat = reg_scan_name(pRExC_state,
10421                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10422                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10423                 if (RExC_parse == name_start || *RExC_parse != ch)
10424                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10425
10426                 if (!SIZE_ONLY) {
10427                     num = add_data( pRExC_state, 1, "S" );
10428                     RExC_rxi->data->data[num]=(void*)sv_dat;
10429                     SvREFCNT_inc_simple_void(sv_dat);
10430                 }
10431
10432                 RExC_sawback = 1;
10433                 ret = reganode(pRExC_state,
10434                                ((! FOLD)
10435                                  ? NREF
10436                                  : (ASCII_FOLD_RESTRICTED)
10437                                    ? NREFFA
10438                                    : (AT_LEAST_UNI_SEMANTICS)
10439                                      ? NREFFU
10440                                      : (LOC)
10441                                        ? NREFFL
10442                                        : NREFF),
10443                                 num);
10444                 *flagp |= HASWIDTH;
10445
10446                 /* override incorrect value set in reganode MJD */
10447                 Set_Node_Offset(ret, parse_start+1);
10448                 Set_Node_Cur_Length(ret); /* MJD */
10449                 nextchar(pRExC_state);
10450
10451             }
10452             break;
10453         }
10454         case 'g':
10455         case '1': case '2': case '3': case '4':
10456         case '5': case '6': case '7': case '8': case '9':
10457             {
10458                 I32 num;
10459                 bool isg = *RExC_parse == 'g';
10460                 bool isrel = 0;
10461                 bool hasbrace = 0;
10462                 if (isg) {
10463                     RExC_parse++;
10464                     if (*RExC_parse == '{') {
10465                         RExC_parse++;
10466                         hasbrace = 1;
10467                     }
10468                     if (*RExC_parse == '-') {
10469                         RExC_parse++;
10470                         isrel = 1;
10471                     }
10472                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10473                         if (isrel) RExC_parse--;
10474                         RExC_parse -= 2;
10475                         goto parse_named_seq;
10476                 }   }
10477                 num = atoi(RExC_parse);
10478                 if (isg && num == 0)
10479                     vFAIL("Reference to invalid group 0");
10480                 if (isrel) {
10481                     num = RExC_npar - num;
10482                     if (num < 1)
10483                         vFAIL("Reference to nonexistent or unclosed group");
10484                 }
10485                 if (!isg && num > 9 && num >= RExC_npar)
10486                     /* Probably a character specified in octal, e.g. \35 */
10487                     goto defchar;
10488                 else {
10489                     char * const parse_start = RExC_parse - 1; /* MJD */
10490                     while (isDIGIT(*RExC_parse))
10491                         RExC_parse++;
10492                     if (parse_start == RExC_parse - 1)
10493                         vFAIL("Unterminated \\g... pattern");
10494                     if (hasbrace) {
10495                         if (*RExC_parse != '}')
10496                             vFAIL("Unterminated \\g{...} pattern");
10497                         RExC_parse++;
10498                     }
10499                     if (!SIZE_ONLY) {
10500                         if (num > (I32)RExC_rx->nparens)
10501                             vFAIL("Reference to nonexistent group");
10502                     }
10503                     RExC_sawback = 1;
10504                     ret = reganode(pRExC_state,
10505                                    ((! FOLD)
10506                                      ? REF
10507                                      : (ASCII_FOLD_RESTRICTED)
10508                                        ? REFFA
10509                                        : (AT_LEAST_UNI_SEMANTICS)
10510                                          ? REFFU
10511                                          : (LOC)
10512                                            ? REFFL
10513                                            : REFF),
10514                                     num);
10515                     *flagp |= HASWIDTH;
10516
10517                     /* override incorrect value set in reganode MJD */
10518                     Set_Node_Offset(ret, parse_start+1);
10519                     Set_Node_Cur_Length(ret); /* MJD */
10520                     RExC_parse--;
10521                     nextchar(pRExC_state);
10522                 }
10523             }
10524             break;
10525         case '\0':
10526             if (RExC_parse >= RExC_end)
10527                 FAIL("Trailing \\");
10528             /* FALL THROUGH */
10529         default:
10530             /* Do not generate "unrecognized" warnings here, we fall
10531                back into the quick-grab loop below */
10532             parse_start--;
10533             goto defchar;
10534         }
10535         break;
10536
10537     case '#':
10538         if (RExC_flags & RXf_PMf_EXTENDED) {
10539             if ( reg_skipcomment( pRExC_state ) )
10540                 goto tryagain;
10541         }
10542         /* FALL THROUGH */
10543
10544     default:
10545
10546             parse_start = RExC_parse - 1;
10547
10548             RExC_parse++;
10549
10550         defchar: {
10551             STRLEN len = 0;
10552             UV ender;
10553             char *p;
10554             char *s;
10555 #define MAX_NODE_STRING_SIZE 127
10556             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10557             char *s0;
10558             U8 upper_parse = MAX_NODE_STRING_SIZE;
10559             STRLEN foldlen;
10560             U8 node_type;
10561             bool next_is_quantifier;
10562             char * oldp = NULL;
10563
10564             /* If a folding node contains only code points that don't
10565              * participate in folds, it can be changed into an EXACT node,
10566              * which allows the optimizer more things to look for */
10567             bool maybe_exact;
10568
10569             ender = 0;
10570             node_type = compute_EXACTish(pRExC_state);
10571             ret = reg_node(pRExC_state, node_type);
10572
10573             /* In pass1, folded, we use a temporary buffer instead of the
10574              * actual node, as the node doesn't exist yet */
10575             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10576
10577             s0 = s;
10578
10579         reparse:
10580
10581             /* We do the EXACTFish to EXACT node only if folding, and not if in
10582              * locale, as whether a character folds or not isn't known until
10583              * runtime */
10584             maybe_exact = FOLD && ! LOC;
10585
10586             /* XXX The node can hold up to 255 bytes, yet this only goes to
10587              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10588              * 255 allows us to not have to worry about overflow due to
10589              * converting to utf8 and fold expansion, but that value is
10590              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10591              * split up by this limit into a single one using the real max of
10592              * 255.  Even at 127, this breaks under rare circumstances.  If
10593              * folding, we do not want to split a node at a character that is a
10594              * non-final in a multi-char fold, as an input string could just
10595              * happen to want to match across the node boundary.  The join
10596              * would solve that problem if the join actually happens.  But a
10597              * series of more than two nodes in a row each of 127 would cause
10598              * the first join to succeed to get to 254, but then there wouldn't
10599              * be room for the next one, which could at be one of those split
10600              * multi-char folds.  I don't know of any fool-proof solution.  One
10601              * could back off to end with only a code point that isn't such a
10602              * non-final, but it is possible for there not to be any in the
10603              * entire node. */
10604             for (p = RExC_parse - 1;
10605                  len < upper_parse && p < RExC_end;
10606                  len++)
10607             {
10608                 oldp = p;
10609
10610                 if (RExC_flags & RXf_PMf_EXTENDED)
10611                     p = regwhite( pRExC_state, p );
10612                 switch ((U8)*p) {
10613                 case '^':
10614                 case '$':
10615                 case '.':
10616                 case '[':
10617                 case '(':
10618                 case ')':
10619                 case '|':
10620                     goto loopdone;
10621                 case '\\':
10622                     /* Literal Escapes Switch
10623
10624                        This switch is meant to handle escape sequences that
10625                        resolve to a literal character.
10626
10627                        Every escape sequence that represents something
10628                        else, like an assertion or a char class, is handled
10629                        in the switch marked 'Special Escapes' above in this
10630                        routine, but also has an entry here as anything that
10631                        isn't explicitly mentioned here will be treated as
10632                        an unescaped equivalent literal.
10633                     */
10634
10635                     switch ((U8)*++p) {
10636                     /* These are all the special escapes. */
10637                     case 'A':             /* Start assertion */
10638                     case 'b': case 'B':   /* Word-boundary assertion*/
10639                     case 'C':             /* Single char !DANGEROUS! */
10640                     case 'd': case 'D':   /* digit class */
10641                     case 'g': case 'G':   /* generic-backref, pos assertion */
10642                     case 'h': case 'H':   /* HORIZWS */
10643                     case 'k': case 'K':   /* named backref, keep marker */
10644                     case 'p': case 'P':   /* Unicode property */
10645                               case 'R':   /* LNBREAK */
10646                     case 's': case 'S':   /* space class */
10647                     case 'v': case 'V':   /* VERTWS */
10648                     case 'w': case 'W':   /* word class */
10649                     case 'X':             /* eXtended Unicode "combining character sequence" */
10650                     case 'z': case 'Z':   /* End of line/string assertion */
10651                         --p;
10652                         goto loopdone;
10653
10654                     /* Anything after here is an escape that resolves to a
10655                        literal. (Except digits, which may or may not)
10656                      */
10657                     case 'n':
10658                         ender = '\n';
10659                         p++;
10660                         break;
10661                     case 'N': /* Handle a single-code point named character. */
10662                         /* The options cause it to fail if a multiple code
10663                          * point sequence.  Handle those in the switch() above
10664                          * */
10665                         RExC_parse = p + 1;
10666                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10667                                             flagp, depth, FALSE))
10668                         {
10669                             RExC_parse = p = oldp;
10670                             goto loopdone;
10671                         }
10672                         p = RExC_parse;
10673                         if (ender > 0xff) {
10674                             REQUIRE_UTF8;
10675                         }
10676                         break;
10677                     case 'r':
10678                         ender = '\r';
10679                         p++;
10680                         break;
10681                     case 't':
10682                         ender = '\t';
10683                         p++;
10684                         break;
10685                     case 'f':
10686                         ender = '\f';
10687                         p++;
10688                         break;
10689                     case 'e':
10690                           ender = ASCII_TO_NATIVE('\033');
10691                         p++;
10692                         break;
10693                     case 'a':
10694                           ender = ASCII_TO_NATIVE('\007');
10695                         p++;
10696                         break;
10697                     case 'o':
10698                         {
10699                             STRLEN brace_len = len;
10700                             UV result;
10701                             const char* error_msg;
10702
10703                             bool valid = grok_bslash_o(p,
10704                                                        &result,
10705                                                        &brace_len,
10706                                                        &error_msg,
10707                                                        1);
10708                             p += brace_len;
10709                             if (! valid) {
10710                                 RExC_parse = p; /* going to die anyway; point
10711                                                    to exact spot of failure */
10712                                 vFAIL(error_msg);
10713                             }
10714                             else
10715                             {
10716                                 ender = result;
10717                             }
10718                             if (PL_encoding && ender < 0x100) {
10719                                 goto recode_encoding;
10720                             }
10721                             if (ender > 0xff) {
10722                                 REQUIRE_UTF8;
10723                             }
10724                             break;
10725                         }
10726                     case 'x':
10727                         {
10728                             STRLEN brace_len = len;
10729                             UV result;
10730                             const char* error_msg;
10731
10732                             bool valid = grok_bslash_x(p,
10733                                                        &result,
10734                                                        &brace_len,
10735                                                        &error_msg,
10736                                                        1);
10737                             p += brace_len;
10738                             if (! valid) {
10739                                 RExC_parse = p; /* going to die anyway; point
10740                                                    to exact spot of failure */
10741                                 vFAIL(error_msg);
10742                             }
10743                             else {
10744                                 ender = result;
10745                             }
10746                             if (PL_encoding && ender < 0x100) {
10747                                 goto recode_encoding;
10748                             }
10749                             if (ender > 0xff) {
10750                                 REQUIRE_UTF8;
10751                             }
10752                             break;
10753                         }
10754                     case 'c':
10755                         p++;
10756                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10757                         break;
10758                     case '0': case '1': case '2': case '3':case '4':
10759                     case '5': case '6': case '7':
10760                         if (*p == '0' ||
10761                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10762                         {
10763                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10764                             STRLEN numlen = 3;
10765                             ender = grok_oct(p, &numlen, &flags, NULL);
10766                             if (ender > 0xff) {
10767                                 REQUIRE_UTF8;
10768                             }
10769                             p += numlen;
10770                         }
10771                         else {
10772                             --p;
10773                             goto loopdone;
10774                         }
10775                         if (PL_encoding && ender < 0x100)
10776                             goto recode_encoding;
10777                         break;
10778                     recode_encoding:
10779                         if (! RExC_override_recoding) {
10780                             SV* enc = PL_encoding;
10781                             ender = reg_recode((const char)(U8)ender, &enc);
10782                             if (!enc && SIZE_ONLY)
10783                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10784                             REQUIRE_UTF8;
10785                         }
10786                         break;
10787                     case '\0':
10788                         if (p >= RExC_end)
10789                             FAIL("Trailing \\");
10790                         /* FALL THROUGH */
10791                     default:
10792                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10793                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10794                         }
10795                         goto normal_default;
10796                     }
10797                     break;
10798                 case '{':
10799                     /* Currently we don't warn when the lbrace is at the start
10800                      * of a construct.  This catches it in the middle of a
10801                      * literal string, or when its the first thing after
10802                      * something like "\b" */
10803                     if (! SIZE_ONLY
10804                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10805                     {
10806                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10807                     }
10808                     /*FALLTHROUGH*/
10809                 default:
10810                   normal_default:
10811                     if (UTF8_IS_START(*p) && UTF) {
10812                         STRLEN numlen;
10813                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10814                                                &numlen, UTF8_ALLOW_DEFAULT);
10815                         p += numlen;
10816                     }
10817                     else
10818                         ender = (U8) *p++;
10819                     break;
10820                 } /* End of switch on the literal */
10821
10822                 /* Here, have looked at the literal character and <ender>
10823                  * contains its ordinal, <p> points to the character after it
10824                  */
10825
10826                 if ( RExC_flags & RXf_PMf_EXTENDED)
10827                     p = regwhite( pRExC_state, p );
10828
10829                 /* If the next thing is a quantifier, it applies to this
10830                  * character only, which means that this character has to be in
10831                  * its own node and can't just be appended to the string in an
10832                  * existing node, so if there are already other characters in
10833                  * the node, close the node with just them, and set up to do
10834                  * this character again next time through, when it will be the
10835                  * only thing in its new node */
10836                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10837                 {
10838                     p = oldp;
10839                     goto loopdone;
10840                 }
10841
10842                 if (FOLD) {
10843                     if (UTF
10844                             /* See comments for join_exact() as to why we fold
10845                              * this non-UTF at compile time */
10846                         || (node_type == EXACTFU
10847                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10848                     {
10849
10850
10851                         /* Prime the casefolded buffer.  Locale rules, which
10852                          * apply only to code points < 256, aren't known until
10853                          * execution, so for them, just output the original
10854                          * character using utf8.  If we start to fold non-UTF
10855                          * patterns, be sure to update join_exact() */
10856                         if (LOC && ender < 256) {
10857                             if (UNI_IS_INVARIANT(ender)) {
10858                                 *s = (U8) ender;
10859                                 foldlen = 1;
10860                             } else {
10861                                 *s = UTF8_TWO_BYTE_HI(ender);
10862                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10863                                 foldlen = 2;
10864                             }
10865                         }
10866                         else {
10867                             UV folded = _to_uni_fold_flags(
10868                                            ender,
10869                                            (U8 *) s,
10870                                            &foldlen,
10871                                            FOLD_FLAGS_FULL
10872                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10873                                                     : (ASCII_FOLD_RESTRICTED)
10874                                                       ? FOLD_FLAGS_NOMIX_ASCII
10875                                                       : 0)
10876                                             );
10877
10878                             /* If this node only contains non-folding code
10879                              * points so far, see if this new one is also
10880                              * non-folding */
10881                             if (maybe_exact) {
10882                                 if (folded != ender) {
10883                                     maybe_exact = FALSE;
10884                                 }
10885                                 else {
10886                                     /* Here the fold is the original; we have
10887                                      * to check further to see if anything
10888                                      * folds to it */
10889                                     if (! PL_utf8_foldable) {
10890                                         SV* swash = swash_init("utf8",
10891                                                            "_Perl_Any_Folds",
10892                                                            &PL_sv_undef, 1, 0);
10893                                         PL_utf8_foldable =
10894                                                     _get_swash_invlist(swash);
10895                                         SvREFCNT_dec(swash);
10896                                     }
10897                                     if (_invlist_contains_cp(PL_utf8_foldable,
10898                                                              ender))
10899                                     {
10900                                         maybe_exact = FALSE;
10901                                     }
10902                                 }
10903                             }
10904                             ender = folded;
10905                         }
10906                         s += foldlen;
10907
10908                         /* The loop increments <len> each time, as all but this
10909                          * path (and the one just below for UTF) through it add
10910                          * a single byte to the EXACTish node.  But this one
10911                          * has changed len to be the correct final value, so
10912                          * subtract one to cancel out the increment that
10913                          * follows */
10914                         len += foldlen - 1;
10915                     }
10916                     else {
10917                         *(s++) = ender;
10918                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10919                     }
10920                 }
10921                 else if (UTF) {
10922                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10923                     if (unilen > 0) {
10924                        s   += unilen;
10925                        len += unilen;
10926                     }
10927
10928                     /* See comment just above for - 1 */
10929                     len--;
10930                 }
10931                 else {
10932                     REGC((char)ender, s++);
10933                 }
10934
10935                 if (next_is_quantifier) {
10936
10937                     /* Here, the next input is a quantifier, and to get here,
10938                      * the current character is the only one in the node.
10939                      * Also, here <len> doesn't include the final byte for this
10940                      * character */
10941                     len++;
10942                     goto loopdone;
10943                 }
10944
10945             } /* End of loop through literal characters */
10946
10947             /* Here we have either exhausted the input or ran out of room in
10948              * the node.  (If we encountered a character that can't be in the
10949              * node, transfer is made directly to <loopdone>, and so we
10950              * wouldn't have fallen off the end of the loop.)  In the latter
10951              * case, we artificially have to split the node into two, because
10952              * we just don't have enough space to hold everything.  This
10953              * creates a problem if the final character participates in a
10954              * multi-character fold in the non-final position, as a match that
10955              * should have occurred won't, due to the way nodes are matched,
10956              * and our artificial boundary.  So back off until we find a non-
10957              * problematic character -- one that isn't at the beginning or
10958              * middle of such a fold.  (Either it doesn't participate in any
10959              * folds, or appears only in the final position of all the folds it
10960              * does participate in.)  A better solution with far fewer false
10961              * positives, and that would fill the nodes more completely, would
10962              * be to actually have available all the multi-character folds to
10963              * test against, and to back-off only far enough to be sure that
10964              * this node isn't ending with a partial one.  <upper_parse> is set
10965              * further below (if we need to reparse the node) to include just
10966              * up through that final non-problematic character that this code
10967              * identifies, so when it is set to less than the full node, we can
10968              * skip the rest of this */
10969             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10970
10971                 const STRLEN full_len = len;
10972
10973                 assert(len >= MAX_NODE_STRING_SIZE);
10974
10975                 /* Here, <s> points to the final byte of the final character.
10976                  * Look backwards through the string until find a non-
10977                  * problematic character */
10978
10979                 if (! UTF) {
10980
10981                     /* These two have no multi-char folds to non-UTF characters
10982                      */
10983                     if (ASCII_FOLD_RESTRICTED || LOC) {
10984                         goto loopdone;
10985                     }
10986
10987                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10988                     len = s - s0 + 1;
10989                 }
10990                 else {
10991                     if (!  PL_NonL1NonFinalFold) {
10992                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10993                                         NonL1_Perl_Non_Final_Folds_invlist);
10994                     }
10995
10996                     /* Point to the first byte of the final character */
10997                     s = (char *) utf8_hop((U8 *) s, -1);
10998
10999                     while (s >= s0) {   /* Search backwards until find
11000                                            non-problematic char */
11001                         if (UTF8_IS_INVARIANT(*s)) {
11002
11003                             /* There are no ascii characters that participate
11004                              * in multi-char folds under /aa.  In EBCDIC, the
11005                              * non-ascii invariants are all control characters,
11006                              * so don't ever participate in any folds. */
11007                             if (ASCII_FOLD_RESTRICTED
11008                                 || ! IS_NON_FINAL_FOLD(*s))
11009                             {
11010                                 break;
11011                             }
11012                         }
11013                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11014
11015                             /* No Latin1 characters participate in multi-char
11016                              * folds under /l */
11017                             if (LOC
11018                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11019                                                                 *s, *(s+1))))
11020                             {
11021                                 break;
11022                             }
11023                         }
11024                         else if (! _invlist_contains_cp(
11025                                         PL_NonL1NonFinalFold,
11026                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11027                         {
11028                             break;
11029                         }
11030
11031                         /* Here, the current character is problematic in that
11032                          * it does occur in the non-final position of some
11033                          * fold, so try the character before it, but have to
11034                          * special case the very first byte in the string, so
11035                          * we don't read outside the string */
11036                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11037                     } /* End of loop backwards through the string */
11038
11039                     /* If there were only problematic characters in the string,
11040                      * <s> will point to before s0, in which case the length
11041                      * should be 0, otherwise include the length of the
11042                      * non-problematic character just found */
11043                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11044                 }
11045
11046                 /* Here, have found the final character, if any, that is
11047                  * non-problematic as far as ending the node without splitting
11048                  * it across a potential multi-char fold.  <len> contains the
11049                  * number of bytes in the node up-to and including that
11050                  * character, or is 0 if there is no such character, meaning
11051                  * the whole node contains only problematic characters.  In
11052                  * this case, give up and just take the node as-is.  We can't
11053                  * do any better */
11054                 if (len == 0) {
11055                     len = full_len;
11056                 } else {
11057
11058                     /* Here, the node does contain some characters that aren't
11059                      * problematic.  If one such is the final character in the
11060                      * node, we are done */
11061                     if (len == full_len) {
11062                         goto loopdone;
11063                     }
11064                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11065
11066                         /* If the final character is problematic, but the
11067                          * penultimate is not, back-off that last character to
11068                          * later start a new node with it */
11069                         p = oldp;
11070                         goto loopdone;
11071                     }
11072
11073                     /* Here, the final non-problematic character is earlier
11074                      * in the input than the penultimate character.  What we do
11075                      * is reparse from the beginning, going up only as far as
11076                      * this final ok one, thus guaranteeing that the node ends
11077                      * in an acceptable character.  The reason we reparse is
11078                      * that we know how far in the character is, but we don't
11079                      * know how to correlate its position with the input parse.
11080                      * An alternate implementation would be to build that
11081                      * correlation as we go along during the original parse,
11082                      * but that would entail extra work for every node, whereas
11083                      * this code gets executed only when the string is too
11084                      * large for the node, and the final two characters are
11085                      * problematic, an infrequent occurrence.  Yet another
11086                      * possible strategy would be to save the tail of the
11087                      * string, and the next time regatom is called, initialize
11088                      * with that.  The problem with this is that unless you
11089                      * back off one more character, you won't be guaranteed
11090                      * regatom will get called again, unless regbranch,
11091                      * regpiece ... are also changed.  If you do back off that
11092                      * extra character, so that there is input guaranteed to
11093                      * force calling regatom, you can't handle the case where
11094                      * just the first character in the node is acceptable.  I
11095                      * (khw) decided to try this method which doesn't have that
11096                      * pitfall; if performance issues are found, we can do a
11097                      * combination of the current approach plus that one */
11098                     upper_parse = len;
11099                     len = 0;
11100                     s = s0;
11101                     goto reparse;
11102                 }
11103             }   /* End of verifying node ends with an appropriate char */
11104
11105         loopdone:   /* Jumped to when encounters something that shouldn't be in
11106                        the node */
11107
11108             /* If 'maybe_exact' is still set here, means there are no
11109              * code points in the node that participate in folds */
11110             if (FOLD && maybe_exact) {
11111                 OP(ret) = EXACT;
11112             }
11113
11114             /* I (khw) don't know if you can get here with zero length, but the
11115              * old code handled this situation by creating a zero-length EXACT
11116              * node.  Might as well be NOTHING instead */
11117             if (len == 0) {
11118                 OP(ret) = NOTHING;
11119             }
11120             else{
11121                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11122             }
11123
11124             RExC_parse = p - 1;
11125             Set_Node_Cur_Length(ret); /* MJD */
11126             nextchar(pRExC_state);
11127             {
11128                 /* len is STRLEN which is unsigned, need to copy to signed */
11129                 IV iv = len;
11130                 if (iv < 0)
11131                     vFAIL("Internal disaster");
11132             }
11133
11134         } /* End of label 'defchar:' */
11135         break;
11136     } /* End of giant switch on input character */
11137
11138     return(ret);
11139 }
11140
11141 STATIC char *
11142 S_regwhite( RExC_state_t *pRExC_state, char *p )
11143 {
11144     const char *e = RExC_end;
11145
11146     PERL_ARGS_ASSERT_REGWHITE;
11147
11148     while (p < e) {
11149         if (isSPACE(*p))
11150             ++p;
11151         else if (*p == '#') {
11152             bool ended = 0;
11153             do {
11154                 if (*p++ == '\n') {
11155                     ended = 1;
11156                     break;
11157                 }
11158             } while (p < e);
11159             if (!ended)
11160                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11161         }
11162         else
11163             break;
11164     }
11165     return p;
11166 }
11167
11168 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11169    Character classes ([:foo:]) can also be negated ([:^foo:]).
11170    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11171    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11172    but trigger failures because they are currently unimplemented. */
11173
11174 #define POSIXCC_DONE(c)   ((c) == ':')
11175 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11176 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11177
11178 PERL_STATIC_INLINE I32
11179 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11180 {
11181     dVAR;
11182     I32 namedclass = OOB_NAMEDCLASS;
11183
11184     PERL_ARGS_ASSERT_REGPPOSIXCC;
11185
11186     if (value == '[' && RExC_parse + 1 < RExC_end &&
11187         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11188         POSIXCC(UCHARAT(RExC_parse))) {
11189         const char c = UCHARAT(RExC_parse);
11190         char* const s = RExC_parse++;
11191
11192         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11193             RExC_parse++;
11194         if (RExC_parse == RExC_end)
11195             /* Grandfather lone [:, [=, [. */
11196             RExC_parse = s;
11197         else {
11198             const char* const t = RExC_parse++; /* skip over the c */
11199             assert(*t == c);
11200
11201             if (UCHARAT(RExC_parse) == ']') {
11202                 const char *posixcc = s + 1;
11203                 RExC_parse++; /* skip over the ending ] */
11204
11205                 if (*s == ':') {
11206                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11207                     const I32 skip = t - posixcc;
11208
11209                     /* Initially switch on the length of the name.  */
11210                     switch (skip) {
11211                     case 4:
11212                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11213                             namedclass = ANYOF_WORDCHAR;
11214                         break;
11215                     case 5:
11216                         /* Names all of length 5.  */
11217                         /* alnum alpha ascii blank cntrl digit graph lower
11218                            print punct space upper  */
11219                         /* Offset 4 gives the best switch position.  */
11220                         switch (posixcc[4]) {
11221                         case 'a':
11222                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11223                                 namedclass = ANYOF_ALPHA;
11224                             break;
11225                         case 'e':
11226                             if (memEQ(posixcc, "spac", 4)) /* space */
11227                                 namedclass = ANYOF_PSXSPC;
11228                             break;
11229                         case 'h':
11230                             if (memEQ(posixcc, "grap", 4)) /* graph */
11231                                 namedclass = ANYOF_GRAPH;
11232                             break;
11233                         case 'i':
11234                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11235                                 namedclass = ANYOF_ASCII;
11236                             break;
11237                         case 'k':
11238                             if (memEQ(posixcc, "blan", 4)) /* blank */
11239                                 namedclass = ANYOF_BLANK;
11240                             break;
11241                         case 'l':
11242                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11243                                 namedclass = ANYOF_CNTRL;
11244                             break;
11245                         case 'm':
11246                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11247                                 namedclass = ANYOF_ALNUMC;
11248                             break;
11249                         case 'r':
11250                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11251                                 namedclass = ANYOF_LOWER;
11252                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11253                                 namedclass = ANYOF_UPPER;
11254                             break;
11255                         case 't':
11256                             if (memEQ(posixcc, "digi", 4)) /* digit */
11257                                 namedclass = ANYOF_DIGIT;
11258                             else if (memEQ(posixcc, "prin", 4)) /* print */
11259                                 namedclass = ANYOF_PRINT;
11260                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11261                                 namedclass = ANYOF_PUNCT;
11262                             break;
11263                         }
11264                         break;
11265                     case 6:
11266                         if (memEQ(posixcc, "xdigit", 6))
11267                             namedclass = ANYOF_XDIGIT;
11268                         break;
11269                     }
11270
11271                     if (namedclass == OOB_NAMEDCLASS)
11272                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11273                                       t - s - 1, s + 1);
11274
11275                     /* The #defines are structured so each complement is +1 to
11276                      * the normal one */
11277                     if (complement) {
11278                         namedclass++;
11279                     }
11280                     assert (posixcc[skip] == ':');
11281                     assert (posixcc[skip+1] == ']');
11282                 } else if (!SIZE_ONLY) {
11283                     /* [[=foo=]] and [[.foo.]] are still future. */
11284
11285                     /* adjust RExC_parse so the warning shows after
11286                        the class closes */
11287                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11288                         RExC_parse++;
11289                     SvREFCNT_dec(free_me);
11290                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11291                 }
11292             } else {
11293                 /* Maternal grandfather:
11294                  * "[:" ending in ":" but not in ":]" */
11295                 RExC_parse = s;
11296             }
11297         }
11298     }
11299
11300     return namedclass;
11301 }
11302
11303 /* Generate the code to add a full posix character <class> to the bracketed
11304  * character class given by <node>.  (<node> is needed only under locale rules)
11305  * destlist     is the inversion list for non-locale rules that this class is
11306  *              to be added to
11307  * sourcelist   is the ASCII-range inversion list to add under /a rules
11308  * Xsourcelist  is the full Unicode range list to use otherwise. */
11309 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11310     if (LOC) {                                                             \
11311         SV* scratch_list = NULL;                                           \
11312                                                                            \
11313         /* Set this class in the node for runtime matching */              \
11314         ANYOF_CLASS_SET(node, class);                                      \
11315                                                                            \
11316         /* For above Latin1 code points, we use the full Unicode range */  \
11317         _invlist_intersection(PL_AboveLatin1,                              \
11318                               Xsourcelist,                                 \
11319                               &scratch_list);                              \
11320         /* And set the output to it, adding instead if there already is an \
11321          * output.  Checking if <destlist> is NULL first saves an extra    \
11322          * clone.  Its reference count will be decremented at the next     \
11323          * union, etc, or if this is the only instance, at the end of the  \
11324          * routine */                                                      \
11325         if (! destlist) {                                                  \
11326             destlist = scratch_list;                                       \
11327         }                                                                  \
11328         else {                                                             \
11329             _invlist_union(destlist, scratch_list, &destlist);             \
11330             SvREFCNT_dec(scratch_list);                                    \
11331         }                                                                  \
11332     }                                                                      \
11333     else {                                                                 \
11334         /* For non-locale, just add it to any existing list */             \
11335         _invlist_union(destlist,                                           \
11336                        (AT_LEAST_ASCII_RESTRICTED)                         \
11337                            ? sourcelist                                    \
11338                            : Xsourcelist,                                  \
11339                        &destlist);                                         \
11340     }
11341
11342 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11343  */
11344 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11345     if (LOC) {                                                             \
11346         SV* scratch_list = NULL;                                           \
11347         ANYOF_CLASS_SET(node, class);                                      \
11348         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11349         if (! destlist) {                                                  \
11350             destlist = scratch_list;                                       \
11351         }                                                                  \
11352         else {                                                             \
11353             _invlist_union(destlist, scratch_list, &destlist);             \
11354             SvREFCNT_dec(scratch_list);                                    \
11355         }                                                                  \
11356     }                                                                      \
11357     else {                                                                 \
11358         _invlist_union_complement_2nd(destlist,                            \
11359                                     (AT_LEAST_ASCII_RESTRICTED)            \
11360                                         ? sourcelist                       \
11361                                         : Xsourcelist,                     \
11362                                     &destlist);                            \
11363         /* Under /d, everything in the upper half of the Latin1 range      \
11364          * matches this complement */                                      \
11365         if (DEPENDS_SEMANTICS) {                                           \
11366             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11367         }                                                                  \
11368     }
11369
11370 /* Generate the code to add a posix character <class> to the bracketed
11371  * character class given by <node>.  (<node> is needed only under locale rules)
11372  * destlist       is the inversion list for non-locale rules that this class is
11373  *                to be added to
11374  * sourcelist     is the ASCII-range inversion list to add under /a rules
11375  * l1_sourcelist  is the Latin1 range list to use otherwise.
11376  * Xpropertyname  is the name to add to <run_time_list> of the property to
11377  *                specify the code points above Latin1 that will have to be
11378  *                determined at run-time
11379  * run_time_list  is a SV* that contains text names of properties that are to
11380  *                be computed at run time.  This concatenates <Xpropertyname>
11381  *                to it, appropriately
11382  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11383  * time */
11384 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11385                               l1_sourcelist, Xpropertyname, run_time_list) \
11386         /* First, resolve whether to use the ASCII-only list or the L1     \
11387          * list */                                                         \
11388         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11389                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11390                 Xpropertyname, run_time_list)
11391
11392 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11393                 Xpropertyname, run_time_list)                              \
11394     /* If not /a matching, there are going to be code points we will have  \
11395      * to defer to runtime to look-up */                                   \
11396     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11397         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11398     }                                                                      \
11399     if (LOC) {                                                             \
11400         ANYOF_CLASS_SET(node, class);                                      \
11401     }                                                                      \
11402     else {                                                                 \
11403         _invlist_union(destlist, sourcelist, &destlist);                   \
11404     }
11405
11406 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11407  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11408  * otherwise */
11409 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11410        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11411     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11412         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11413     }                                                                      \
11414     else {                                                                 \
11415         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11416         matches_above_unicode = TRUE;                                      \
11417         if (LOC) {                                                         \
11418             ANYOF_CLASS_SET(node, namedclass);                             \
11419         }                                                                  \
11420         else {                                                             \
11421             SV* scratch_list = NULL;                                       \
11422             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11423             if (! destlist) {                                              \
11424                 destlist = scratch_list;                                   \
11425             }                                                              \
11426             else {                                                         \
11427                 _invlist_union(destlist, scratch_list, &destlist);         \
11428                 SvREFCNT_dec(scratch_list);                                \
11429             }                                                              \
11430             if (DEPENDS_SEMANTICS) {                                       \
11431                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11432             }                                                              \
11433         }                                                                  \
11434     }
11435
11436 /* The names of properties whose definitions are not known at compile time are
11437  * stored in this SV, after a constant heading.  So if the length has been
11438  * changed since initialization, then there is a run-time definition. */
11439 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11440
11441 /* This converts the named class defined in regcomp.h to its equivalent class
11442  * number defined in handy.h. */
11443 #define namedclass_to_classnum(class)  ((class) / 2)
11444
11445 STATIC regnode *
11446 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11447 {
11448     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
11449      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11450      * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
11451      * multi-character folds: it will be rewritten following the paradigm of
11452      * this example, where the <multi-fold>s are characters which fold to
11453      * multiple character sequences:
11454      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11455      * gets effectively rewritten as:
11456      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11457      * reg() gets called (recursively) on the rewritten version, and this
11458      * function will return what it constructs.  (Actually the <multi-fold>s
11459      * aren't physically removed from the [abcdefghi], it's just that they are
11460      * ignored in the recursion by means of a flag:
11461      * <RExC_in_multi_char_class>.)
11462      *
11463      * ANYOF nodes contain a bit map for the first 256 characters, with the
11464      * corresponding bit set if that character is in the list.  For characters
11465      * above 255, a range list or swash is used.  There are extra bits for \w,
11466      * etc. in locale ANYOFs, as what these match is not determinable at
11467      * compile time */
11468
11469     dVAR;
11470     UV nextvalue;
11471     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11472     IV range = 0;
11473     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11474     regnode *ret;
11475     STRLEN numlen;
11476     IV namedclass = OOB_NAMEDCLASS;
11477     char *rangebegin = NULL;
11478     bool need_class = 0;
11479     SV *listsv = NULL;
11480     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11481                                       than just initialized.  */
11482     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11483     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11484                                extended beyond the Latin1 range */
11485     UV element_count = 0;   /* Number of distinct elements in the class.
11486                                Optimizations may be possible if this is tiny */
11487     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11488                                        character; used under /i */
11489     UV n;
11490
11491     /* Unicode properties are stored in a swash; this holds the current one
11492      * being parsed.  If this swash is the only above-latin1 component of the
11493      * character class, an optimization is to pass it directly on to the
11494      * execution engine.  Otherwise, it is set to NULL to indicate that there
11495      * are other things in the class that have to be dealt with at execution
11496      * time */
11497     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11498
11499     /* Set if a component of this character class is user-defined; just passed
11500      * on to the engine */
11501     bool has_user_defined_property = FALSE;
11502
11503     /* inversion list of code points this node matches only when the target
11504      * string is in UTF-8.  (Because is under /d) */
11505     SV* depends_list = NULL;
11506
11507     /* inversion list of code points this node matches.  For much of the
11508      * function, it includes only those that match regardless of the utf8ness
11509      * of the target string */
11510     SV* cp_list = NULL;
11511
11512 #ifdef EBCDIC
11513     /* In a range, counts how many 0-2 of the ends of it came from literals,
11514      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11515     UV literal_endpoint = 0;
11516 #endif
11517     bool invert = FALSE;    /* Is this class to be complemented */
11518
11519     /* Is there any thing like \W or [:^digit:] that matches above the legal
11520      * Unicode range? */
11521     bool runtime_posix_matches_above_Unicode = FALSE;
11522
11523     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11524         case we need to change the emitted regop to an EXACT. */
11525     const char * orig_parse = RExC_parse;
11526     const I32 orig_size = RExC_size;
11527     GET_RE_DEBUG_FLAGS_DECL;
11528
11529     PERL_ARGS_ASSERT_REGCLASS;
11530 #ifndef DEBUGGING
11531     PERL_UNUSED_ARG(depth);
11532 #endif
11533
11534     DEBUG_PARSE("clas");
11535
11536     /* Assume we are going to generate an ANYOF node. */
11537     ret = reganode(pRExC_state, ANYOF, 0);
11538
11539     if (!SIZE_ONLY) {
11540         ANYOF_FLAGS(ret) = 0;
11541     }
11542
11543     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11544         RExC_parse++;
11545         invert = TRUE;
11546         RExC_naughty++;
11547     }
11548
11549     if (SIZE_ONLY) {
11550         RExC_size += ANYOF_SKIP;
11551         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11552     }
11553     else {
11554         RExC_emit += ANYOF_SKIP;
11555         if (LOC) {
11556             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11557         }
11558         listsv = newSVpvs("# comment\n");
11559         initial_listsv_len = SvCUR(listsv);
11560     }
11561
11562     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11563
11564     if (!SIZE_ONLY && POSIXCC(nextvalue))
11565     {
11566         const char *s = RExC_parse;
11567         const char  c = *s++;
11568
11569         while (isALNUM(*s))
11570             s++;
11571         if (*s && c == *s && s[1] == ']') {
11572             SAVEFREESV(RExC_rx_sv);
11573             SAVEFREESV(listsv);
11574             ckWARN3reg(s+2,
11575                        "POSIX syntax [%c %c] belongs inside character classes",
11576                        c, c);
11577             ReREFCNT_inc(RExC_rx_sv);
11578             SvREFCNT_inc_simple_void_NN(listsv);
11579         }
11580     }
11581
11582     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11583     if (UCHARAT(RExC_parse) == ']')
11584         goto charclassloop;
11585
11586 parseit:
11587     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11588
11589     charclassloop:
11590
11591         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11592         save_value = value;
11593         save_prevvalue = prevvalue;
11594
11595         if (!range) {
11596             rangebegin = RExC_parse;
11597             element_count++;
11598         }
11599         if (UTF) {
11600             value = utf8n_to_uvchr((U8*)RExC_parse,
11601                                    RExC_end - RExC_parse,
11602                                    &numlen, UTF8_ALLOW_DEFAULT);
11603             RExC_parse += numlen;
11604         }
11605         else
11606             value = UCHARAT(RExC_parse++);
11607
11608         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11609         if (value == '[' && POSIXCC(nextvalue))
11610             namedclass = regpposixcc(pRExC_state, value, listsv);
11611         else if (value == '\\') {
11612             if (UTF) {
11613                 value = utf8n_to_uvchr((U8*)RExC_parse,
11614                                    RExC_end - RExC_parse,
11615                                    &numlen, UTF8_ALLOW_DEFAULT);
11616                 RExC_parse += numlen;
11617             }
11618             else
11619                 value = UCHARAT(RExC_parse++);
11620             /* Some compilers cannot handle switching on 64-bit integer
11621              * values, therefore value cannot be an UV.  Yes, this will
11622              * be a problem later if we want switch on Unicode.
11623              * A similar issue a little bit later when switching on
11624              * namedclass. --jhi */
11625             switch ((I32)value) {
11626             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11627             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11628             case 's':   namedclass = ANYOF_SPACE;       break;
11629             case 'S':   namedclass = ANYOF_NSPACE;      break;
11630             case 'd':   namedclass = ANYOF_DIGIT;       break;
11631             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11632             case 'v':   namedclass = ANYOF_VERTWS;      break;
11633             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11634             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11635             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11636             case 'N':  /* Handle \N{NAME} in class */
11637                 {
11638                     /* We only pay attention to the first char of
11639                     multichar strings being returned. I kinda wonder
11640                     if this makes sense as it does change the behaviour
11641                     from earlier versions, OTOH that behaviour was broken
11642                     as well. */
11643                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11644                                       TRUE /* => charclass */))
11645                     {
11646                         goto parseit;
11647                     }
11648                 }
11649                 break;
11650             case 'p':
11651             case 'P':
11652                 {
11653                 char *e;
11654
11655                 /* This routine will handle any undefined properties */
11656                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11657
11658                 if (RExC_parse >= RExC_end)
11659                     vFAIL2("Empty \\%c{}", (U8)value);
11660                 if (*RExC_parse == '{') {
11661                     const U8 c = (U8)value;
11662                     e = strchr(RExC_parse++, '}');
11663                     if (!e)
11664                         vFAIL2("Missing right brace on \\%c{}", c);
11665                     while (isSPACE(UCHARAT(RExC_parse)))
11666                         RExC_parse++;
11667                     if (e == RExC_parse)
11668                         vFAIL2("Empty \\%c{}", c);
11669                     n = e - RExC_parse;
11670                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11671                         n--;
11672                 }
11673                 else {
11674                     e = RExC_parse;
11675                     n = 1;
11676                 }
11677                 if (!SIZE_ONLY) {
11678                     SV* invlist;
11679                     char* name;
11680
11681                     if (UCHARAT(RExC_parse) == '^') {
11682                          RExC_parse++;
11683                          n--;
11684                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11685                          while (isSPACE(UCHARAT(RExC_parse))) {
11686                               RExC_parse++;
11687                               n--;
11688                          }
11689                     }
11690                     /* Try to get the definition of the property into
11691                      * <invlist>.  If /i is in effect, the effective property
11692                      * will have its name be <__NAME_i>.  The design is
11693                      * discussed in commit
11694                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11695                     Newx(name, n + sizeof("_i__\n"), char);
11696
11697                     sprintf(name, "%s%.*s%s\n",
11698                                     (FOLD) ? "__" : "",
11699                                     (int)n,
11700                                     RExC_parse,
11701                                     (FOLD) ? "_i" : ""
11702                     );
11703
11704                     /* Look up the property name, and get its swash and
11705                      * inversion list, if the property is found  */
11706                     if (swash) {
11707                         SvREFCNT_dec(swash);
11708                     }
11709                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11710                                              1, /* binary */
11711                                              0, /* not tr/// */
11712                                              NULL, /* No inversion list */
11713                                              &swash_init_flags
11714                                             );
11715                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11716                         if (swash) {
11717                             SvREFCNT_dec(swash);
11718                             swash = NULL;
11719                         }
11720
11721                         /* Here didn't find it.  It could be a user-defined
11722                          * property that will be available at run-time.  Add it
11723                          * to the list to look up then */
11724                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11725                                         (value == 'p' ? '+' : '!'),
11726                                         name);
11727                         has_user_defined_property = TRUE;
11728
11729                         /* We don't know yet, so have to assume that the
11730                          * property could match something in the Latin1 range,
11731                          * hence something that isn't utf8.  Note that this
11732                          * would cause things in <depends_list> to match
11733                          * inappropriately, except that any \p{}, including
11734                          * this one forces Unicode semantics, which means there
11735                          * is <no depends_list> */
11736                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11737                     }
11738                     else {
11739
11740                         /* Here, did get the swash and its inversion list.  If
11741                          * the swash is from a user-defined property, then this
11742                          * whole character class should be regarded as such */
11743                         has_user_defined_property =
11744                                     (swash_init_flags
11745                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11746
11747                         /* Invert if asking for the complement */
11748                         if (value == 'P') {
11749                             _invlist_union_complement_2nd(properties,
11750                                                           invlist,
11751                                                           &properties);
11752
11753                             /* The swash can't be used as-is, because we've
11754                              * inverted things; delay removing it to here after
11755                              * have copied its invlist above */
11756                             SvREFCNT_dec(swash);
11757                             swash = NULL;
11758                         }
11759                         else {
11760                             _invlist_union(properties, invlist, &properties);
11761                         }
11762                     }
11763                     Safefree(name);
11764                 }
11765                 RExC_parse = e + 1;
11766                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's named */
11767
11768                 /* \p means they want Unicode semantics */
11769                 RExC_uni_semantics = 1;
11770                 }
11771                 break;
11772             case 'n':   value = '\n';                   break;
11773             case 'r':   value = '\r';                   break;
11774             case 't':   value = '\t';                   break;
11775             case 'f':   value = '\f';                   break;
11776             case 'b':   value = '\b';                   break;
11777             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11778             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11779             case 'o':
11780                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11781                 {
11782                     const char* error_msg;
11783                     bool valid = grok_bslash_o(RExC_parse,
11784                                                &value,
11785                                                &numlen,
11786                                                &error_msg,
11787                                                SIZE_ONLY);
11788                     RExC_parse += numlen;
11789                     if (! valid) {
11790                         vFAIL(error_msg);
11791                     }
11792                 }
11793                 if (PL_encoding && value < 0x100) {
11794                     goto recode_encoding;
11795                 }
11796                 break;
11797             case 'x':
11798                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11799                 {
11800                     const char* error_msg;
11801                     bool valid = grok_bslash_x(RExC_parse,
11802                                                &value,
11803                                                &numlen,
11804                                                &error_msg,
11805                                                1);
11806                     RExC_parse += numlen;
11807                     if (! valid) {
11808                         vFAIL(error_msg);
11809                     }
11810                 }
11811                 if (PL_encoding && value < 0x100)
11812                     goto recode_encoding;
11813                 break;
11814             case 'c':
11815                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11816                 break;
11817             case '0': case '1': case '2': case '3': case '4':
11818             case '5': case '6': case '7':
11819                 {
11820                     /* Take 1-3 octal digits */
11821                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11822                     numlen = 3;
11823                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11824                     RExC_parse += numlen;
11825                     if (PL_encoding && value < 0x100)
11826                         goto recode_encoding;
11827                     break;
11828                 }
11829             recode_encoding:
11830                 if (! RExC_override_recoding) {
11831                     SV* enc = PL_encoding;
11832                     value = reg_recode((const char)(U8)value, &enc);
11833                     if (!enc && SIZE_ONLY)
11834                         ckWARNreg(RExC_parse,
11835                                   "Invalid escape in the specified encoding");
11836                     break;
11837                 }
11838             default:
11839                 /* Allow \_ to not give an error */
11840                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11841                     SAVEFREESV(RExC_rx_sv);
11842                     SAVEFREESV(listsv);
11843                     ckWARN2reg(RExC_parse,
11844                                "Unrecognized escape \\%c in character class passed through",
11845                                (int)value);
11846                     ReREFCNT_inc(RExC_rx_sv);
11847                     SvREFCNT_inc_simple_void_NN(listsv);
11848                 }
11849                 break;
11850             }
11851         } /* end of \blah */
11852 #ifdef EBCDIC
11853         else
11854             literal_endpoint++;
11855 #endif
11856
11857             /* What matches in a locale is not known until runtime.  This
11858              * includes what the Posix classes (like \w, [:space:]) match.
11859              * Room must be reserved (one time per class) to store such
11860              * classes, either if Perl is compiled so that locale nodes always
11861              * should have this space, or if there is such class info to be
11862              * stored.  The space will contain a bit for each named class that
11863              * is to be matched against.  This isn't needed for \p{} and
11864              * pseudo-classes, as they are not affected by locale, and hence
11865              * are dealt with separately */
11866             if (LOC
11867                 && ! need_class
11868                 && (ANYOF_LOCALE == ANYOF_CLASS
11869                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11870             {
11871                 need_class = 1;
11872                 if (SIZE_ONLY) {
11873                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11874                 }
11875                 else {
11876                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11877                     ANYOF_CLASS_ZERO(ret);
11878                 }
11879                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11880             }
11881
11882         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11883
11884             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11885              * literal, as is the character that began the false range, i.e.
11886              * the 'a' in the examples */
11887             if (range) {
11888                 if (!SIZE_ONLY) {
11889                     const int w =
11890                         RExC_parse >= rangebegin ?
11891                         RExC_parse - rangebegin : 0;
11892                     SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11893                     SAVEFREESV(listsv);
11894                     ckWARN4reg(RExC_parse,
11895                                "False [] range \"%*.*s\"",
11896                                w, w, rangebegin);
11897                     ReREFCNT_inc(RExC_rx_sv);
11898                     SvREFCNT_inc_simple_void_NN(listsv);
11899                     cp_list = add_cp_to_invlist(cp_list, '-');
11900                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11901                 }
11902
11903                 range = 0; /* this was not a true range */
11904                 element_count += 2; /* So counts for three values */
11905             }
11906
11907             if (! SIZE_ONLY) {
11908                 switch ((I32)namedclass) {
11909
11910                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11911                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11912                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11913                     break;
11914                 case ANYOF_NALNUMC:
11915                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11916                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11917                         runtime_posix_matches_above_Unicode);
11918                     break;
11919                 case ANYOF_ALPHA:
11920                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11921                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11922                     break;
11923                 case ANYOF_NALPHA:
11924                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11925                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11926                         runtime_posix_matches_above_Unicode);
11927                     break;
11928                 case ANYOF_ASCII:
11929 #ifdef HAS_ISASCII
11930                     if (LOC) {
11931                         ANYOF_CLASS_SET(ret, namedclass);
11932                     }
11933                     else
11934 #endif  /* Not isascii(); just use the hard-coded definition for it */
11935                         _invlist_union(posixes, PL_ASCII, &posixes);
11936                     break;
11937                 case ANYOF_NASCII:
11938 #ifdef HAS_ISASCII
11939                     if (LOC) {
11940                         ANYOF_CLASS_SET(ret, namedclass);
11941                     }
11942                     else {
11943 #endif
11944                         _invlist_union_complement_2nd(posixes,
11945                                                     PL_ASCII, &posixes);
11946                         if (DEPENDS_SEMANTICS) {
11947                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11948                         }
11949 #ifdef HAS_ISASCII
11950                     }
11951 #endif
11952                     break;
11953                 case ANYOF_BLANK:
11954                     if (hasISBLANK || ! LOC) {
11955                         DO_POSIX(ret, namedclass, posixes,
11956                                             PL_PosixBlank, PL_XPosixBlank);
11957                     }
11958                     else { /* There is no isblank() and we are in locale:  We
11959                               use the ASCII range and the above-Latin1 range
11960                               code points */
11961                         SV* scratch_list = NULL;
11962
11963                         /* Include all above-Latin1 blanks */
11964                         _invlist_intersection(PL_AboveLatin1,
11965                                               PL_XPosixBlank,
11966                                               &scratch_list);
11967                         /* Add it to the running total of posix classes */
11968                         if (! posixes) {
11969                             posixes = scratch_list;
11970                         }
11971                         else {
11972                             _invlist_union(posixes, scratch_list, &posixes);
11973                             SvREFCNT_dec(scratch_list);
11974                         }
11975                         /* Add the ASCII-range blanks to the running total. */
11976                         _invlist_union(posixes, PL_PosixBlank, &posixes);
11977                     }
11978                     break;
11979                 case ANYOF_NBLANK:
11980                     if (hasISBLANK || ! LOC) {
11981                         DO_N_POSIX(ret, namedclass, posixes,
11982                                                 PL_PosixBlank, PL_XPosixBlank);
11983                     }
11984                     else { /* There is no isblank() and we are in locale */
11985                         SV* scratch_list = NULL;
11986
11987                         /* Include all above-Latin1 non-blanks */
11988                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11989                                           &scratch_list);
11990
11991                         /* Add them to the running total of posix classes */
11992                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11993                                           &scratch_list);
11994                         if (! posixes) {
11995                             posixes = scratch_list;
11996                         }
11997                         else {
11998                             _invlist_union(posixes, scratch_list, &posixes);
11999                             SvREFCNT_dec(scratch_list);
12000                         }
12001
12002                         /* Get the list of all non-ASCII-blanks in Latin 1, and
12003                          * add them to the running total */
12004                         _invlist_subtract(PL_Latin1, PL_PosixBlank,
12005                                           &scratch_list);
12006                         _invlist_union(posixes, scratch_list, &posixes);
12007                         SvREFCNT_dec(scratch_list);
12008                     }
12009                     break;
12010                 case ANYOF_CNTRL:
12011                     DO_POSIX(ret, namedclass, posixes,
12012                                             PL_PosixCntrl, PL_XPosixCntrl);
12013                     break;
12014                 case ANYOF_NCNTRL:
12015                     DO_N_POSIX(ret, namedclass, posixes,
12016                                             PL_PosixCntrl, PL_XPosixCntrl);
12017                     break;
12018                 case ANYOF_DIGIT:
12019                     /* There are no digits in the Latin1 range outside of
12020                      * ASCII, so call the macro that doesn't have to resolve
12021                      * them */
12022                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
12023                         PL_PosixDigit, "XPosixDigit", listsv);
12024                     break;
12025                 case ANYOF_NDIGIT:
12026                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12027                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
12028                         runtime_posix_matches_above_Unicode);
12029                     break;
12030                 case ANYOF_GRAPH:
12031                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12032                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
12033                     break;
12034                 case ANYOF_NGRAPH:
12035                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12036                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12037                         runtime_posix_matches_above_Unicode);
12038                     break;
12039                 case ANYOF_HORIZWS:
12040                     /* For these, we use the cp_list, as /d doesn't make a
12041                      * difference in what these match.  There would be problems
12042                      * if these characters had folds other than themselves, as
12043                      * cp_list is subject to folding.  It turns out that \h
12044                      * is just a synonym for XPosixBlank */
12045                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12046                     break;
12047                 case ANYOF_NHORIZWS:
12048                     _invlist_union_complement_2nd(cp_list,
12049                                                  PL_XPosixBlank, &cp_list);
12050                     break;
12051                 case ANYOF_LOWER:
12052                 case ANYOF_NLOWER:
12053                 {   /* These require special handling, as they differ under
12054                        folding, matching Cased there (which in the ASCII range
12055                        is the same as Alpha */
12056
12057                     SV* ascii_source;
12058                     SV* l1_source;
12059                     const char *Xname;
12060
12061                     if (FOLD && ! LOC) {
12062                         ascii_source = PL_PosixAlpha;
12063                         l1_source = PL_L1Cased;
12064                         Xname = "Cased";
12065                     }
12066                     else {
12067                         ascii_source = PL_PosixLower;
12068                         l1_source = PL_L1PosixLower;
12069                         Xname = "XPosixLower";
12070                     }
12071                     if (namedclass == ANYOF_LOWER) {
12072                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12073                                     ascii_source, l1_source, Xname, listsv);
12074                     }
12075                     else {
12076                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12077                             posixes, ascii_source, l1_source, Xname, listsv,
12078                             runtime_posix_matches_above_Unicode);
12079                     }
12080                     break;
12081                 }
12082                 case ANYOF_PRINT:
12083                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12084                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12085                     break;
12086                 case ANYOF_NPRINT:
12087                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12088                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12089                         runtime_posix_matches_above_Unicode);
12090                     break;
12091                 case ANYOF_PUNCT:
12092                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12093                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12094                     break;
12095                 case ANYOF_NPUNCT:
12096                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12097                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12098                         runtime_posix_matches_above_Unicode);
12099                     break;
12100                 case ANYOF_PSXSPC:
12101                     DO_POSIX(ret, namedclass, posixes,
12102                                             PL_PosixSpace, PL_XPosixSpace);
12103                     break;
12104                 case ANYOF_NPSXSPC:
12105                     DO_N_POSIX(ret, namedclass, posixes,
12106                                             PL_PosixSpace, PL_XPosixSpace);
12107                     break;
12108                 case ANYOF_SPACE:
12109                     DO_POSIX(ret, namedclass, posixes,
12110                                             PL_PerlSpace, PL_XPerlSpace);
12111                     break;
12112                 case ANYOF_NSPACE:
12113                     DO_N_POSIX(ret, namedclass, posixes,
12114                                             PL_PerlSpace, PL_XPerlSpace);
12115                     break;
12116                 case ANYOF_UPPER:   /* Same as LOWER, above */
12117                 case ANYOF_NUPPER:
12118                 {
12119                     SV* ascii_source;
12120                     SV* l1_source;
12121                     const char *Xname;
12122
12123                     if (FOLD && ! LOC) {
12124                         ascii_source = PL_PosixAlpha;
12125                         l1_source = PL_L1Cased;
12126                         Xname = "Cased";
12127                     }
12128                     else {
12129                         ascii_source = PL_PosixUpper;
12130                         l1_source = PL_L1PosixUpper;
12131                         Xname = "XPosixUpper";
12132                     }
12133                     if (namedclass == ANYOF_UPPER) {
12134                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12135                                     ascii_source, l1_source, Xname, listsv);
12136                     }
12137                     else {
12138                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12139                         posixes, ascii_source, l1_source, Xname, listsv,
12140                         runtime_posix_matches_above_Unicode);
12141                     }
12142                     break;
12143                 }
12144                 case ANYOF_WORDCHAR:
12145                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12146                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12147                     break;
12148                 case ANYOF_NWORDCHAR:
12149                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12150                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12151                             runtime_posix_matches_above_Unicode);
12152                     break;
12153                 case ANYOF_VERTWS:
12154                     /* For these, we use the cp_list, as /d doesn't make a
12155                      * difference in what these match.  There would be problems
12156                      * if these characters had folds other than themselves, as
12157                      * cp_list is subject to folding */
12158                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
12159                     break;
12160                 case ANYOF_NVERTWS:
12161                     _invlist_union_complement_2nd(cp_list,
12162                                                     PL_VertSpace, &cp_list);
12163                     break;
12164                 case ANYOF_XDIGIT:
12165                     DO_POSIX(ret, namedclass, posixes,
12166                                             PL_PosixXDigit, PL_XPosixXDigit);
12167                     break;
12168                 case ANYOF_NXDIGIT:
12169                     DO_N_POSIX(ret, namedclass, posixes,
12170                                             PL_PosixXDigit, PL_XPosixXDigit);
12171                     break;
12172                 case ANYOF_UNIPROP: /* this is to handle \p and \P */
12173                     break;
12174                 default:
12175                     vFAIL("Invalid [::] class");
12176                     break;
12177                 }
12178
12179                 continue;   /* Go get next character */
12180             }
12181         } /* end of namedclass \blah */
12182
12183         if (range) {
12184             if (prevvalue > value) /* b-a */ {
12185                 const int w = RExC_parse - rangebegin;
12186                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12187                 range = 0; /* not a valid range */
12188             }
12189         }
12190         else {
12191             prevvalue = value; /* save the beginning of the potential range */
12192             if (RExC_parse+1 < RExC_end
12193                 && *RExC_parse == '-'
12194                 && RExC_parse[1] != ']')
12195             {
12196                 RExC_parse++;
12197
12198                 /* a bad range like \w-, [:word:]- ? */
12199                 if (namedclass > OOB_NAMEDCLASS) {
12200                     if (ckWARN(WARN_REGEXP)) {
12201                         const int w =
12202                             RExC_parse >= rangebegin ?
12203                             RExC_parse - rangebegin : 0;
12204                         vWARN4(RExC_parse,
12205                                "False [] range \"%*.*s\"",
12206                                w, w, rangebegin);
12207                     }
12208                     if (!SIZE_ONLY) {
12209                         cp_list = add_cp_to_invlist(cp_list, '-');
12210                     }
12211                     element_count++;
12212                 } else
12213                     range = 1;  /* yeah, it's a range! */
12214                 continue;       /* but do it the next time */
12215             }
12216         }
12217
12218         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12219          * if not */
12220
12221         /* non-Latin1 code point implies unicode semantics.  Must be set in
12222          * pass1 so is there for the whole of pass 2 */
12223         if (value > 255) {
12224             RExC_uni_semantics = 1;
12225         }
12226
12227         /* Ready to process either the single value, or the completed range.
12228          * For single-valued non-inverted ranges, we consider the possibility
12229          * of multi-char folds.  (We made a conscious decision to not do this
12230          * for the other cases because it can often lead to non-intuitive
12231          * results.  For example, you have the peculiar case that:
12232          *  "s s" =~ /^[^\xDF]+$/i => Y
12233          *  "ss"  =~ /^[^\xDF]+$/i => N
12234          *
12235          * See [perl #89750] */
12236         if (FOLD && ! invert && value == prevvalue) {
12237             if (value == LATIN_SMALL_LETTER_SHARP_S
12238                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12239                                                         value)))
12240             {
12241                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12242
12243                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12244                 STRLEN foldlen;
12245
12246                 UV folded = _to_uni_fold_flags(
12247                                 value,
12248                                 foldbuf,
12249                                 &foldlen,
12250                                 FOLD_FLAGS_FULL
12251                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12252                                             : (ASCII_FOLD_RESTRICTED)
12253                                               ? FOLD_FLAGS_NOMIX_ASCII
12254                                               : 0)
12255                                 );
12256
12257                 /* Here, <folded> should be the first character of the
12258                  * multi-char fold of <value>, with <foldbuf> containing the
12259                  * whole thing.  But, if this fold is not allowed (because of
12260                  * the flags), <fold> will be the same as <value>, and should
12261                  * be processed like any other character, so skip the special
12262                  * handling */
12263                 if (folded != value) {
12264
12265                     /* Skip if we are recursed, currently parsing the class
12266                      * again.  Otherwise add this character to the list of
12267                      * multi-char folds. */
12268                     if (! RExC_in_multi_char_class) {
12269                         AV** this_array_ptr;
12270                         AV* this_array;
12271                         STRLEN cp_count = utf8_length(foldbuf,
12272                                                       foldbuf + foldlen);
12273                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12274
12275                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12276
12277
12278                         if (! multi_char_matches) {
12279                             multi_char_matches = newAV();
12280                         }
12281
12282                         /* <multi_char_matches> is actually an array of arrays.
12283                          * There will be one or two top-level elements: [2],
12284                          * and/or [3].  The [2] element is an array, each
12285                          * element thereof is a character which folds to two
12286                          * characters; likewise for [3].  (Unicode guarantees a
12287                          * maximum of 3 characters in any fold.)  When we
12288                          * rewrite the character class below, we will do so
12289                          * such that the longest folds are written first, so
12290                          * that it prefers the longest matching strings first.
12291                          * This is done even if it turns out that any
12292                          * quantifier is non-greedy, out of programmer
12293                          * laziness.  Tom Christiansen has agreed that this is
12294                          * ok.  This makes the test for the ligature 'ffi' come
12295                          * before the test for 'ff' */
12296                         if (av_exists(multi_char_matches, cp_count)) {
12297                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12298                                                              cp_count, FALSE);
12299                             this_array = *this_array_ptr;
12300                         }
12301                         else {
12302                             this_array = newAV();
12303                             av_store(multi_char_matches, cp_count,
12304                                      (SV*) this_array);
12305                         }
12306                         av_push(this_array, multi_fold);
12307                     }
12308
12309                     /* This element should not be processed further in this
12310                      * class */
12311                     element_count--;
12312                     value = save_value;
12313                     prevvalue = save_prevvalue;
12314                     continue;
12315                 }
12316             }
12317         }
12318
12319         /* Deal with this element of the class */
12320         if (! SIZE_ONLY) {
12321 #ifndef EBCDIC
12322             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12323 #else
12324             UV* this_range = _new_invlist(1);
12325             _append_range_to_invlist(this_range, prevvalue, value);
12326
12327             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12328              * If this range was specified using something like 'i-j', we want
12329              * to include only the 'i' and the 'j', and not anything in
12330              * between, so exclude non-ASCII, non-alphabetics from it.
12331              * However, if the range was specified with something like
12332              * [\x89-\x91] or [\x89-j], all code points within it should be
12333              * included.  literal_endpoint==2 means both ends of the range used
12334              * a literal character, not \x{foo} */
12335             if (literal_endpoint == 2
12336                 && (prevvalue >= 'a' && value <= 'z')
12337                     || (prevvalue >= 'A' && value <= 'Z'))
12338             {
12339                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12340                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12341             }
12342             _invlist_union(cp_list, this_range, &cp_list);
12343             literal_endpoint = 0;
12344 #endif
12345         }
12346
12347         range = 0; /* this range (if it was one) is done now */
12348     } /* End of loop through all the text within the brackets */
12349
12350     /* If anything in the class expands to more than one character, we have to
12351      * deal with them by building up a substitute parse string, and recursively
12352      * calling reg() on it, instead of proceeding */
12353     if (multi_char_matches) {
12354         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12355         I32 cp_count;
12356         STRLEN len;
12357         char *save_end = RExC_end;
12358         char *save_parse = RExC_parse;
12359         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12360                                        a "|" */
12361         I32 reg_flags;
12362
12363         assert(! invert);
12364 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12365            because too confusing */
12366         if (invert) {
12367             sv_catpv(substitute_parse, "(?:");
12368         }
12369 #endif
12370
12371         /* Look at the longest folds first */
12372         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12373
12374             if (av_exists(multi_char_matches, cp_count)) {
12375                 AV** this_array_ptr;
12376                 SV* this_sequence;
12377
12378                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12379                                                  cp_count, FALSE);
12380                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12381                                                                 &PL_sv_undef)
12382                 {
12383                     if (! first_time) {
12384                         sv_catpv(substitute_parse, "|");
12385                     }
12386                     first_time = FALSE;
12387
12388                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12389                 }
12390             }
12391         }
12392
12393         /* If the character class contains anything else besides these
12394          * multi-character folds, have to include it in recursive parsing */
12395         if (element_count) {
12396             sv_catpv(substitute_parse, "|[");
12397             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12398             sv_catpv(substitute_parse, "]");
12399         }
12400
12401         sv_catpv(substitute_parse, ")");
12402 #if 0
12403         if (invert) {
12404             /* This is a way to get the parse to skip forward a whole named
12405              * sequence instead of matching the 2nd character when it fails the
12406              * first */
12407             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12408         }
12409 #endif
12410
12411         RExC_parse = SvPV(substitute_parse, len);
12412         RExC_end = RExC_parse + len;
12413         RExC_in_multi_char_class = 1;
12414         RExC_emit = (regnode *)orig_emit;
12415
12416         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12417
12418         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12419
12420         RExC_parse = save_parse;
12421         RExC_end = save_end;
12422         RExC_in_multi_char_class = 0;
12423         SvREFCNT_dec(multi_char_matches);
12424         SvREFCNT_dec(listsv);
12425         return ret;
12426     }
12427
12428     /* If the character class contains only a single element, it may be
12429      * optimizable into another node type which is smaller and runs faster.
12430      * Check if this is the case for this class */
12431     if (element_count == 1) {
12432         U8 op = END;
12433         U8 arg = 0;
12434
12435         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12436                                               [:digit:] or \p{foo} */
12437
12438             /* Certain named classes have equivalents that can appear outside a
12439              * character class, e.g. \w, \H.  We use these instead of a
12440              * character class. */
12441             switch ((I32)namedclass) {
12442                 U8 offset;
12443
12444                 /* The first group is for node types that depend on the charset
12445                  * modifier to the regex.  We first calculate the base node
12446                  * type, and if it should be inverted */
12447
12448                 case ANYOF_NWORDCHAR:
12449                     invert = ! invert;
12450                     /* FALLTHROUGH */
12451                 case ANYOF_WORDCHAR:
12452                     op = ALNUM;
12453                     goto join_charset_classes;
12454
12455                 case ANYOF_NSPACE:
12456                     invert = ! invert;
12457                     /* FALLTHROUGH */
12458                 case ANYOF_SPACE:
12459                     op = SPACE;
12460                     goto join_charset_classes;
12461
12462                 case ANYOF_NDIGIT:
12463                     invert = ! invert;
12464                     /* FALLTHROUGH */
12465                 case ANYOF_DIGIT:
12466                     op = DIGIT;
12467
12468                   join_charset_classes:
12469
12470                     /* Now that we have the base node type, we take advantage
12471                      * of the enum ordering of the charset modifiers to get the
12472                      * exact node type,  For example the base SPACE also has
12473                      * SPACEL, SPACEU, and SPACEA */
12474
12475                     offset = get_regex_charset(RExC_flags);
12476
12477                     /* /aa is the same as /a for these */
12478                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12479                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12480                     }
12481                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12482                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12483                     }
12484
12485                     op += offset;
12486
12487                     /* The number of varieties of each of these is the same,
12488                      * hence, so is the delta between the normal and
12489                      * complemented nodes */
12490                     if (invert) {
12491                         op += NALNUM - ALNUM;
12492                     }
12493                     *flagp |= HASWIDTH|SIMPLE;
12494                     break;
12495
12496                 /* The second group doesn't depend of the charset modifiers.
12497                  * We just have normal and complemented */
12498                 case ANYOF_NHORIZWS:
12499                     invert = ! invert;
12500                     /* FALLTHROUGH */
12501                 case ANYOF_HORIZWS:
12502                   is_horizws:
12503                     op = (invert) ? NHORIZWS : HORIZWS;
12504                     *flagp |= HASWIDTH|SIMPLE;
12505                     break;
12506
12507                 case ANYOF_NVERTWS:
12508                     invert = ! invert;
12509                     /* FALLTHROUGH */
12510                 case ANYOF_VERTWS:
12511                     op = (invert) ? NVERTWS : VERTWS;
12512                     *flagp |= HASWIDTH|SIMPLE;
12513                     break;
12514
12515                 case ANYOF_UNIPROP:
12516                     break;
12517
12518                 case ANYOF_NBLANK:
12519                     invert = ! invert;
12520                     /* FALLTHROUGH */
12521                 case ANYOF_BLANK:
12522                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12523                         goto is_horizws;
12524                     }
12525                     /* FALLTHROUGH */
12526                 default:
12527                     /* A generic posix class.  All the /a ones can be handled
12528                      * by the POSIXA opcode.  And all are closed under folding
12529                      * in the ASCII range, so FOLD doesn't matter */
12530                     if (AT_LEAST_ASCII_RESTRICTED
12531                         || (! LOC && namedclass == ANYOF_ASCII))
12532                     {
12533                         /* The odd numbered ones are the complements of the
12534                          * next-lower even number one */
12535                         if (namedclass % 2 == 1) {
12536                             invert = ! invert;
12537                             namedclass--;
12538                         }
12539                         arg = namedclass_to_classnum(namedclass);
12540                         op = (invert) ? NPOSIXA : POSIXA;
12541                     }
12542                     break;
12543             }
12544         }
12545         else if (value == prevvalue) {
12546
12547             /* Here, the class consists of just a single code point */
12548
12549             if (invert) {
12550                 if (! LOC && value == '\n') {
12551                     op = REG_ANY; /* Optimize [^\n] */
12552                     *flagp |= HASWIDTH|SIMPLE;
12553                     RExC_naughty++;
12554                 }
12555             }
12556             else if (value < 256 || UTF) {
12557
12558                 /* Optimize a single value into an EXACTish node, but not if it
12559                  * would require converting the pattern to UTF-8. */
12560                 op = compute_EXACTish(pRExC_state);
12561             }
12562         } /* Otherwise is a range */
12563         else if (! LOC) {   /* locale could vary these */
12564             if (prevvalue == '0') {
12565                 if (value == '9') {
12566                     op = (invert) ? NDIGITA : DIGITA;
12567                     *flagp |= HASWIDTH|SIMPLE;
12568                 }
12569             }
12570         }
12571
12572         /* Here, we have changed <op> away from its initial value iff we found
12573          * an optimization */
12574         if (op != END) {
12575
12576             /* Throw away this ANYOF regnode, and emit the calculated one,
12577              * which should correspond to the beginning, not current, state of
12578              * the parse */
12579             const char * cur_parse = RExC_parse;
12580             RExC_parse = (char *)orig_parse;
12581             if ( SIZE_ONLY) {
12582                 if (! LOC) {
12583
12584                     /* To get locale nodes to not use the full ANYOF size would
12585                      * require moving the code above that writes the portions
12586                      * of it that aren't in other nodes to after this point.
12587                      * e.g.  ANYOF_CLASS_SET */
12588                     RExC_size = orig_size;
12589                 }
12590             }
12591             else {
12592                 RExC_emit = (regnode *)orig_emit;
12593             }
12594
12595             ret = reg_node(pRExC_state, op);
12596
12597             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12598                 if (! SIZE_ONLY) {
12599                     FLAGS(ret) = arg;
12600                 }
12601                 *flagp |= HASWIDTH|SIMPLE;
12602             }
12603             else if (PL_regkind[op] == EXACT) {
12604                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12605             }
12606
12607             RExC_parse = (char *) cur_parse;
12608
12609             SvREFCNT_dec(posixes);
12610             SvREFCNT_dec(listsv);
12611             SvREFCNT_dec(cp_list);
12612             return ret;
12613         }
12614     }
12615
12616     if (SIZE_ONLY)
12617         return ret;
12618     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12619
12620     /* If folding, we calculate all characters that could fold to or from the
12621      * ones already on the list */
12622     if (FOLD && cp_list) {
12623         UV start, end;  /* End points of code point ranges */
12624
12625         SV* fold_intersection = NULL;
12626
12627         /* If the highest code point is within Latin1, we can use the
12628          * compiled-in Alphas list, and not have to go out to disk.  This
12629          * yields two false positives, the masculine and feminine ordinal
12630          * indicators, which are weeded out below using the
12631          * IS_IN_SOME_FOLD_L1() macro */
12632         if (invlist_highest(cp_list) < 256) {
12633             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12634         }
12635         else {
12636
12637             /* Here, there are non-Latin1 code points, so we will have to go
12638              * fetch the list of all the characters that participate in folds
12639              */
12640             if (! PL_utf8_foldable) {
12641                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12642                                        &PL_sv_undef, 1, 0);
12643                 PL_utf8_foldable = _get_swash_invlist(swash);
12644                 SvREFCNT_dec(swash);
12645             }
12646
12647             /* This is a hash that for a particular fold gives all characters
12648              * that are involved in it */
12649             if (! PL_utf8_foldclosures) {
12650
12651                 /* If we were unable to find any folds, then we likely won't be
12652                  * able to find the closures.  So just create an empty list.
12653                  * Folding will effectively be restricted to the non-Unicode
12654                  * rules hard-coded into Perl.  (This case happens legitimately
12655                  * during compilation of Perl itself before the Unicode tables
12656                  * are generated) */
12657                 if (_invlist_len(PL_utf8_foldable) == 0) {
12658                     PL_utf8_foldclosures = newHV();
12659                 }
12660                 else {
12661                     /* If the folds haven't been read in, call a fold function
12662                      * to force that */
12663                     if (! PL_utf8_tofold) {
12664                         U8 dummy[UTF8_MAXBYTES+1];
12665
12666                         /* This string is just a short named one above \xff */
12667                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12668                         assert(PL_utf8_tofold); /* Verify that worked */
12669                     }
12670                     PL_utf8_foldclosures =
12671                                     _swash_inversion_hash(PL_utf8_tofold);
12672                 }
12673             }
12674
12675             /* Only the characters in this class that participate in folds need
12676              * be checked.  Get the intersection of this class and all the
12677              * possible characters that are foldable.  This can quickly narrow
12678              * down a large class */
12679             _invlist_intersection(PL_utf8_foldable, cp_list,
12680                                   &fold_intersection);
12681         }
12682
12683         /* Now look at the foldable characters in this class individually */
12684         invlist_iterinit(fold_intersection);
12685         while (invlist_iternext(fold_intersection, &start, &end)) {
12686             UV j;
12687
12688             /* Locale folding for Latin1 characters is deferred until runtime */
12689             if (LOC && start < 256) {
12690                 start = 256;
12691             }
12692
12693             /* Look at every character in the range */
12694             for (j = start; j <= end; j++) {
12695
12696                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12697                 STRLEN foldlen;
12698                 SV** listp;
12699
12700                 if (j < 256) {
12701
12702                     /* We have the latin1 folding rules hard-coded here so that
12703                      * an innocent-looking character class, like /[ks]/i won't
12704                      * have to go out to disk to find the possible matches.
12705                      * XXX It would be better to generate these via regen, in
12706                      * case a new version of the Unicode standard adds new
12707                      * mappings, though that is not really likely, and may be
12708                      * caught by the default: case of the switch below. */
12709
12710                     if (IS_IN_SOME_FOLD_L1(j)) {
12711
12712                         /* ASCII is always matched; non-ASCII is matched only
12713                          * under Unicode rules */
12714                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12715                             cp_list =
12716                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12717                         }
12718                         else {
12719                             depends_list =
12720                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12721                         }
12722                     }
12723
12724                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12725                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12726                     {
12727                         /* Certain Latin1 characters have matches outside
12728                          * Latin1.  To get here, <j> is one of those
12729                          * characters.   None of these matches is valid for
12730                          * ASCII characters under /aa, which is why the 'if'
12731                          * just above excludes those.  These matches only
12732                          * happen when the target string is utf8.  The code
12733                          * below adds the single fold closures for <j> to the
12734                          * inversion list. */
12735                         switch (j) {
12736                             case 'k':
12737                             case 'K':
12738                                 cp_list =
12739                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12740                                 break;
12741                             case 's':
12742                             case 'S':
12743                                 cp_list = add_cp_to_invlist(cp_list,
12744                                                     LATIN_SMALL_LETTER_LONG_S);
12745                                 break;
12746                             case MICRO_SIGN:
12747                                 cp_list = add_cp_to_invlist(cp_list,
12748                                                     GREEK_CAPITAL_LETTER_MU);
12749                                 cp_list = add_cp_to_invlist(cp_list,
12750                                                     GREEK_SMALL_LETTER_MU);
12751                                 break;
12752                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12753                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12754                                 cp_list =
12755                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12756                                 break;
12757                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12758                                 cp_list = add_cp_to_invlist(cp_list,
12759                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12760                                 break;
12761                             case LATIN_SMALL_LETTER_SHARP_S:
12762                                 cp_list = add_cp_to_invlist(cp_list,
12763                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12764                                 break;
12765                             case 'F': case 'f':
12766                             case 'I': case 'i':
12767                             case 'L': case 'l':
12768                             case 'T': case 't':
12769                             case 'A': case 'a':
12770                             case 'H': case 'h':
12771                             case 'J': case 'j':
12772                             case 'N': case 'n':
12773                             case 'W': case 'w':
12774                             case 'Y': case 'y':
12775                                 /* These all are targets of multi-character
12776                                  * folds from code points that require UTF8 to
12777                                  * express, so they can't match unless the
12778                                  * target string is in UTF-8, so no action here
12779                                  * is necessary, as regexec.c properly handles
12780                                  * the general case for UTF-8 matching and
12781                                  * multi-char folds */
12782                                 break;
12783                             default:
12784                                 /* Use deprecated warning to increase the
12785                                  * chances of this being output */
12786                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12787                                 break;
12788                         }
12789                     }
12790                     continue;
12791                 }
12792
12793                 /* Here is an above Latin1 character.  We don't have the rules
12794                  * hard-coded for it.  First, get its fold.  This is the simple
12795                  * fold, as the multi-character folds have been handled earlier
12796                  * and separated out */
12797                 _to_uni_fold_flags(j, foldbuf, &foldlen,
12798                                                ((LOC)
12799                                                ? FOLD_FLAGS_LOCALE
12800                                                : (ASCII_FOLD_RESTRICTED)
12801                                                   ? FOLD_FLAGS_NOMIX_ASCII
12802                                                   : 0));
12803
12804                 /* Single character fold of above Latin1.  Add everything in
12805                  * its fold closure to the list that this node should match.
12806                  * The fold closures data structure is a hash with the keys
12807                  * being the UTF-8 of every character that is folded to, like
12808                  * 'k', and the values each an array of all code points that
12809                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
12810                  * Multi-character folds are not included */
12811                 if ((listp = hv_fetch(PL_utf8_foldclosures,
12812                                       (char *) foldbuf, foldlen, FALSE)))
12813                 {
12814                     AV* list = (AV*) *listp;
12815                     IV k;
12816                     for (k = 0; k <= av_len(list); k++) {
12817                         SV** c_p = av_fetch(list, k, FALSE);
12818                         UV c;
12819                         if (c_p == NULL) {
12820                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12821                         }
12822                         c = SvUV(*c_p);
12823
12824                         /* /aa doesn't allow folds between ASCII and non-; /l
12825                          * doesn't allow them between above and below 256 */
12826                         if ((ASCII_FOLD_RESTRICTED
12827                                   && (isASCII(c) != isASCII(j)))
12828                             || (LOC && ((c < 256) != (j < 256))))
12829                         {
12830                             continue;
12831                         }
12832
12833                         /* Folds involving non-ascii Latin1 characters
12834                          * under /d are added to a separate list */
12835                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12836                         {
12837                             cp_list = add_cp_to_invlist(cp_list, c);
12838                         }
12839                         else {
12840                           depends_list = add_cp_to_invlist(depends_list, c);
12841                         }
12842                     }
12843                 }
12844             }
12845         }
12846         SvREFCNT_dec(fold_intersection);
12847     }
12848
12849     /* And combine the result (if any) with any inversion list from posix
12850      * classes.  The lists are kept separate up to now because we don't want to
12851      * fold the classes (folding of those is automatically handled by the swash
12852      * fetching code) */
12853     if (posixes) {
12854         if (! DEPENDS_SEMANTICS) {
12855             if (cp_list) {
12856                 _invlist_union(cp_list, posixes, &cp_list);
12857                 SvREFCNT_dec(posixes);
12858             }
12859             else {
12860                 cp_list = posixes;
12861             }
12862         }
12863         else {
12864             /* Under /d, we put into a separate list the Latin1 things that
12865              * match only when the target string is utf8 */
12866             SV* nonascii_but_latin1_properties = NULL;
12867             _invlist_intersection(posixes, PL_Latin1,
12868                                   &nonascii_but_latin1_properties);
12869             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12870                               &nonascii_but_latin1_properties);
12871             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12872                               &posixes);
12873             if (cp_list) {
12874                 _invlist_union(cp_list, posixes, &cp_list);
12875                 SvREFCNT_dec(posixes);
12876             }
12877             else {
12878                 cp_list = posixes;
12879             }
12880
12881             if (depends_list) {
12882                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12883                                &depends_list);
12884                 SvREFCNT_dec(nonascii_but_latin1_properties);
12885             }
12886             else {
12887                 depends_list = nonascii_but_latin1_properties;
12888             }
12889         }
12890     }
12891
12892     /* And combine the result (if any) with any inversion list from properties.
12893      * The lists are kept separate up to now so that we can distinguish the two
12894      * in regards to matching above-Unicode.  A run-time warning is generated
12895      * if a Unicode property is matched against a non-Unicode code point. But,
12896      * we allow user-defined properties to match anything, without any warning,
12897      * and we also suppress the warning if there is a portion of the character
12898      * class that isn't a Unicode property, and which matches above Unicode, \W
12899      * or [\x{110000}] for example.
12900      * (Note that in this case, unlike the Posix one above, there is no
12901      * <depends_list>, because having a Unicode property forces Unicode
12902      * semantics */
12903     if (properties) {
12904         bool warn_super = ! has_user_defined_property;
12905         if (cp_list) {
12906
12907             /* If it matters to the final outcome, see if a non-property
12908              * component of the class matches above Unicode.  If so, the
12909              * warning gets suppressed.  This is true even if just a single
12910              * such code point is specified, as though not strictly correct if
12911              * another such code point is matched against, the fact that they
12912              * are using above-Unicode code points indicates they should know
12913              * the issues involved */
12914             if (warn_super) {
12915                 bool non_prop_matches_above_Unicode =
12916                             runtime_posix_matches_above_Unicode
12917                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12918                 if (invert) {
12919                     non_prop_matches_above_Unicode =
12920                                             !  non_prop_matches_above_Unicode;
12921                 }
12922                 warn_super = ! non_prop_matches_above_Unicode;
12923             }
12924
12925             _invlist_union(properties, cp_list, &cp_list);
12926             SvREFCNT_dec(properties);
12927         }
12928         else {
12929             cp_list = properties;
12930         }
12931
12932         if (warn_super) {
12933             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12934         }
12935     }
12936
12937     /* Here, we have calculated what code points should be in the character
12938      * class.
12939      *
12940      * Now we can see about various optimizations.  Fold calculation (which we
12941      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12942      * would invert to include K, which under /i would match k, which it
12943      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12944      * folded until runtime */
12945
12946     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12947      * at compile time.  Besides not inverting folded locale now, we can't
12948      * invert if there are things such as \w, which aren't known until runtime
12949      * */
12950     if (invert
12951         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12952         && ! depends_list
12953         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12954     {
12955         _invlist_invert(cp_list);
12956
12957         /* Any swash can't be used as-is, because we've inverted things */
12958         if (swash) {
12959             SvREFCNT_dec(swash);
12960             swash = NULL;
12961         }
12962
12963         /* Clear the invert flag since have just done it here */
12964         invert = FALSE;
12965     }
12966
12967     /* If we didn't do folding, it's because some information isn't available
12968      * until runtime; set the run-time fold flag for these.  (We don't have to
12969      * worry about properties folding, as that is taken care of by the swash
12970      * fetching) */
12971     if (FOLD && LOC)
12972     {
12973        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12974     }
12975
12976     /* Some character classes are equivalent to other nodes.  Such nodes take
12977      * up less room and generally fewer operations to execute than ANYOF nodes.
12978      * Above, we checked for and optimized into some such equivalents for
12979      * certain common classes that are easy to test.  Getting to this point in
12980      * the code means that the class didn't get optimized there.  Since this
12981      * code is only executed in Pass 2, it is too late to save space--it has
12982      * been allocated in Pass 1, and currently isn't given back.  But turning
12983      * things into an EXACTish node can allow the optimizer to join it to any
12984      * adjacent such nodes.  And if the class is equivalent to things like /./,
12985      * expensive run-time swashes can be avoided.  Now that we have more
12986      * complete information, we can find things necessarily missed by the
12987      * earlier code.  I (khw) am not sure how much to look for here.  It would
12988      * be easy, but perhaps too slow, to check any candidates against all the
12989      * node types they could possibly match using _invlistEQ(). */
12990
12991     if (cp_list
12992         && ! invert
12993         && ! depends_list
12994         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12995         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12996     {
12997        UV start, end;
12998        U8 op = END;  /* The optimzation node-type */
12999         const char * cur_parse= RExC_parse;
13000
13001        invlist_iterinit(cp_list);
13002        if (! invlist_iternext(cp_list, &start, &end)) {
13003
13004             /* Here, the list is empty.  This happens, for example, when a
13005              * Unicode property is the only thing in the character class, and
13006              * it doesn't match anything.  (perluniprops.pod notes such
13007              * properties) */
13008             op = OPFAIL;
13009             *flagp |= HASWIDTH|SIMPLE;
13010         }
13011         else if (start == end) {    /* The range is a single code point */
13012             if (! invlist_iternext(cp_list, &start, &end)
13013
13014                     /* Don't do this optimization if it would require changing
13015                      * the pattern to UTF-8 */
13016                 && (start < 256 || UTF))
13017             {
13018                 /* Here, the list contains a single code point.  Can optimize
13019                  * into an EXACT node */
13020
13021                 value = start;
13022
13023                 if (! FOLD) {
13024                     op = EXACT;
13025                 }
13026                 else if (LOC) {
13027
13028                     /* A locale node under folding with one code point can be
13029                      * an EXACTFL, as its fold won't be calculated until
13030                      * runtime */
13031                     op = EXACTFL;
13032                 }
13033                 else {
13034
13035                     /* Here, we are generally folding, but there is only one
13036                      * code point to match.  If we have to, we use an EXACT
13037                      * node, but it would be better for joining with adjacent
13038                      * nodes in the optimization pass if we used the same
13039                      * EXACTFish node that any such are likely to be.  We can
13040                      * do this iff the code point doesn't participate in any
13041                      * folds.  For example, an EXACTF of a colon is the same as
13042                      * an EXACT one, since nothing folds to or from a colon. */
13043                     if (value < 256) {
13044                         if (IS_IN_SOME_FOLD_L1(value)) {
13045                             op = EXACT;
13046                         }
13047                     }
13048                     else {
13049                         if (! PL_utf8_foldable) {
13050                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13051                                                 &PL_sv_undef, 1, 0);
13052                             PL_utf8_foldable = _get_swash_invlist(swash);
13053                             SvREFCNT_dec(swash);
13054                         }
13055                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13056                             op = EXACT;
13057                         }
13058                     }
13059
13060                     /* If we haven't found the node type, above, it means we
13061                      * can use the prevailing one */
13062                     if (op == END) {
13063                         op = compute_EXACTish(pRExC_state);
13064                     }
13065                 }
13066             }
13067         }
13068         else if (start == 0) {
13069             if (end == UV_MAX) {
13070                 op = SANY;
13071                 *flagp |= HASWIDTH|SIMPLE;
13072                 RExC_naughty++;
13073             }
13074             else if (end == '\n' - 1
13075                     && invlist_iternext(cp_list, &start, &end)
13076                     && start == '\n' + 1 && end == UV_MAX)
13077             {
13078                 op = REG_ANY;
13079                 *flagp |= HASWIDTH|SIMPLE;
13080                 RExC_naughty++;
13081             }
13082         }
13083
13084         if (op != END) {
13085             RExC_parse = (char *)orig_parse;
13086             RExC_emit = (regnode *)orig_emit;
13087
13088             ret = reg_node(pRExC_state, op);
13089
13090             RExC_parse = (char *)cur_parse;
13091
13092             if (PL_regkind[op] == EXACT) {
13093                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13094             }
13095
13096             SvREFCNT_dec(cp_list);
13097             SvREFCNT_dec(listsv);
13098             return ret;
13099         }
13100     }
13101
13102     /* Here, <cp_list> contains all the code points we can determine at
13103      * compile time that match under all conditions.  Go through it, and
13104      * for things that belong in the bitmap, put them there, and delete from
13105      * <cp_list>.  While we are at it, see if everything above 255 is in the
13106      * list, and if so, set a flag to speed up execution */
13107     ANYOF_BITMAP_ZERO(ret);
13108     if (cp_list) {
13109
13110         /* This gets set if we actually need to modify things */
13111         bool change_invlist = FALSE;
13112
13113         UV start, end;
13114
13115         /* Start looking through <cp_list> */
13116         invlist_iterinit(cp_list);
13117         while (invlist_iternext(cp_list, &start, &end)) {
13118             UV high;
13119             int i;
13120
13121             if (end == UV_MAX && start <= 256) {
13122                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13123             }
13124
13125             /* Quit if are above what we should change */
13126             if (start > 255) {
13127                 break;
13128             }
13129
13130             change_invlist = TRUE;
13131
13132             /* Set all the bits in the range, up to the max that we are doing */
13133             high = (end < 255) ? end : 255;
13134             for (i = start; i <= (int) high; i++) {
13135                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13136                     ANYOF_BITMAP_SET(ret, i);
13137                     prevvalue = value;
13138                     value = i;
13139                 }
13140             }
13141         }
13142
13143         /* Done with loop; remove any code points that are in the bitmap from
13144          * <cp_list> */
13145         if (change_invlist) {
13146             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13147         }
13148
13149         /* If have completely emptied it, remove it completely */
13150         if (_invlist_len(cp_list) == 0) {
13151             SvREFCNT_dec(cp_list);
13152             cp_list = NULL;
13153         }
13154     }
13155
13156     if (invert) {
13157         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13158     }
13159
13160     /* Here, the bitmap has been populated with all the Latin1 code points that
13161      * always match.  Can now add to the overall list those that match only
13162      * when the target string is UTF-8 (<depends_list>). */
13163     if (depends_list) {
13164         if (cp_list) {
13165             _invlist_union(cp_list, depends_list, &cp_list);
13166             SvREFCNT_dec(depends_list);
13167         }
13168         else {
13169             cp_list = depends_list;
13170         }
13171     }
13172
13173     /* If there is a swash and more than one element, we can't use the swash in
13174      * the optimization below. */
13175     if (swash && element_count > 1) {
13176         SvREFCNT_dec(swash);
13177         swash = NULL;
13178     }
13179
13180     if (! cp_list
13181         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13182     {
13183         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13184         SvREFCNT_dec(listsv);
13185     }
13186     else {
13187         /* av[0] stores the character class description in its textual form:
13188          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13189          *       appropriate swash, and is also useful for dumping the regnode.
13190          * av[1] if NULL, is a placeholder to later contain the swash computed
13191          *       from av[0].  But if no further computation need be done, the
13192          *       swash is stored there now.
13193          * av[2] stores the cp_list inversion list for use in addition or
13194          *       instead of av[0]; used only if av[1] is NULL
13195          * av[3] is set if any component of the class is from a user-defined
13196          *       property; used only if av[1] is NULL */
13197         AV * const av = newAV();
13198         SV *rv;
13199
13200         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13201                         ? listsv
13202                         : (SvREFCNT_dec(listsv), &PL_sv_undef));
13203         if (swash) {
13204             av_store(av, 1, swash);
13205             SvREFCNT_dec(cp_list);
13206         }
13207         else {
13208             av_store(av, 1, NULL);
13209             if (cp_list) {
13210                 av_store(av, 2, cp_list);
13211                 av_store(av, 3, newSVuv(has_user_defined_property));
13212             }
13213         }
13214
13215         rv = newRV_noinc(MUTABLE_SV(av));
13216         n = add_data(pRExC_state, 1, "s");
13217         RExC_rxi->data->data[n] = (void*)rv;
13218         ARG_SET(ret, n);
13219     }
13220
13221     *flagp |= HASWIDTH|SIMPLE;
13222     return ret;
13223 }
13224 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13225
13226
13227 /* reg_skipcomment()
13228
13229    Absorbs an /x style # comments from the input stream.
13230    Returns true if there is more text remaining in the stream.
13231    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13232    terminates the pattern without including a newline.
13233
13234    Note its the callers responsibility to ensure that we are
13235    actually in /x mode
13236
13237 */
13238
13239 STATIC bool
13240 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13241 {
13242     bool ended = 0;
13243
13244     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13245
13246     while (RExC_parse < RExC_end)
13247         if (*RExC_parse++ == '\n') {
13248             ended = 1;
13249             break;
13250         }
13251     if (!ended) {
13252         /* we ran off the end of the pattern without ending
13253            the comment, so we have to add an \n when wrapping */
13254         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13255         return 0;
13256     } else
13257         return 1;
13258 }
13259
13260 /* nextchar()
13261
13262    Advances the parse position, and optionally absorbs
13263    "whitespace" from the inputstream.
13264
13265    Without /x "whitespace" means (?#...) style comments only,
13266    with /x this means (?#...) and # comments and whitespace proper.
13267
13268    Returns the RExC_parse point from BEFORE the scan occurs.
13269
13270    This is the /x friendly way of saying RExC_parse++.
13271 */
13272
13273 STATIC char*
13274 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13275 {
13276     char* const retval = RExC_parse++;
13277
13278     PERL_ARGS_ASSERT_NEXTCHAR;
13279
13280     for (;;) {
13281         if (RExC_end - RExC_parse >= 3
13282             && *RExC_parse == '('
13283             && RExC_parse[1] == '?'
13284             && RExC_parse[2] == '#')
13285         {
13286             while (*RExC_parse != ')') {
13287                 if (RExC_parse == RExC_end)
13288                     FAIL("Sequence (?#... not terminated");
13289                 RExC_parse++;
13290             }
13291             RExC_parse++;
13292             continue;
13293         }
13294         if (RExC_flags & RXf_PMf_EXTENDED) {
13295             if (isSPACE(*RExC_parse)) {
13296                 RExC_parse++;
13297                 continue;
13298             }
13299             else if (*RExC_parse == '#') {
13300                 if ( reg_skipcomment( pRExC_state ) )
13301                     continue;
13302             }
13303         }
13304         return retval;
13305     }
13306 }
13307
13308 /*
13309 - reg_node - emit a node
13310 */
13311 STATIC regnode *                        /* Location. */
13312 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13313 {
13314     dVAR;
13315     regnode *ptr;
13316     regnode * const ret = RExC_emit;
13317     GET_RE_DEBUG_FLAGS_DECL;
13318
13319     PERL_ARGS_ASSERT_REG_NODE;
13320
13321     if (SIZE_ONLY) {
13322         SIZE_ALIGN(RExC_size);
13323         RExC_size += 1;
13324         return(ret);
13325     }
13326     if (RExC_emit >= RExC_emit_bound)
13327         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13328                    op, RExC_emit, RExC_emit_bound);
13329
13330     NODE_ALIGN_FILL(ret);
13331     ptr = ret;
13332     FILL_ADVANCE_NODE(ptr, op);
13333 #ifdef RE_TRACK_PATTERN_OFFSETS
13334     if (RExC_offsets) {         /* MJD */
13335         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13336               "reg_node", __LINE__,
13337               PL_reg_name[op],
13338               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13339                 ? "Overwriting end of array!\n" : "OK",
13340               (UV)(RExC_emit - RExC_emit_start),
13341               (UV)(RExC_parse - RExC_start),
13342               (UV)RExC_offsets[0]));
13343         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13344     }
13345 #endif
13346     RExC_emit = ptr;
13347     return(ret);
13348 }
13349
13350 /*
13351 - reganode - emit a node with an argument
13352 */
13353 STATIC regnode *                        /* Location. */
13354 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13355 {
13356     dVAR;
13357     regnode *ptr;
13358     regnode * const ret = RExC_emit;
13359     GET_RE_DEBUG_FLAGS_DECL;
13360
13361     PERL_ARGS_ASSERT_REGANODE;
13362
13363     if (SIZE_ONLY) {
13364         SIZE_ALIGN(RExC_size);
13365         RExC_size += 2;
13366         /*
13367            We can't do this:
13368
13369            assert(2==regarglen[op]+1);
13370
13371            Anything larger than this has to allocate the extra amount.
13372            If we changed this to be:
13373
13374            RExC_size += (1 + regarglen[op]);
13375
13376            then it wouldn't matter. Its not clear what side effect
13377            might come from that so its not done so far.
13378            -- dmq
13379         */
13380         return(ret);
13381     }
13382     if (RExC_emit >= RExC_emit_bound)
13383         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13384                    op, RExC_emit, RExC_emit_bound);
13385
13386     NODE_ALIGN_FILL(ret);
13387     ptr = ret;
13388     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13389 #ifdef RE_TRACK_PATTERN_OFFSETS
13390     if (RExC_offsets) {         /* MJD */
13391         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13392               "reganode",
13393               __LINE__,
13394               PL_reg_name[op],
13395               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13396               "Overwriting end of array!\n" : "OK",
13397               (UV)(RExC_emit - RExC_emit_start),
13398               (UV)(RExC_parse - RExC_start),
13399               (UV)RExC_offsets[0]));
13400         Set_Cur_Node_Offset;
13401     }
13402 #endif
13403     RExC_emit = ptr;
13404     return(ret);
13405 }
13406
13407 /*
13408 - reguni - emit (if appropriate) a Unicode character
13409 */
13410 STATIC STRLEN
13411 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13412 {
13413     dVAR;
13414
13415     PERL_ARGS_ASSERT_REGUNI;
13416
13417     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13418 }
13419
13420 /*
13421 - reginsert - insert an operator in front of already-emitted operand
13422 *
13423 * Means relocating the operand.
13424 */
13425 STATIC void
13426 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13427 {
13428     dVAR;
13429     regnode *src;
13430     regnode *dst;
13431     regnode *place;
13432     const int offset = regarglen[(U8)op];
13433     const int size = NODE_STEP_REGNODE + offset;
13434     GET_RE_DEBUG_FLAGS_DECL;
13435
13436     PERL_ARGS_ASSERT_REGINSERT;
13437     PERL_UNUSED_ARG(depth);
13438 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13439     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13440     if (SIZE_ONLY) {
13441         RExC_size += size;
13442         return;
13443     }
13444
13445     src = RExC_emit;
13446     RExC_emit += size;
13447     dst = RExC_emit;
13448     if (RExC_open_parens) {
13449         int paren;
13450         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13451         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13452             if ( RExC_open_parens[paren] >= opnd ) {
13453                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13454                 RExC_open_parens[paren] += size;
13455             } else {
13456                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13457             }
13458             if ( RExC_close_parens[paren] >= opnd ) {
13459                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13460                 RExC_close_parens[paren] += size;
13461             } else {
13462                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13463             }
13464         }
13465     }
13466
13467     while (src > opnd) {
13468         StructCopy(--src, --dst, regnode);
13469 #ifdef RE_TRACK_PATTERN_OFFSETS
13470         if (RExC_offsets) {     /* MJD 20010112 */
13471             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13472                   "reg_insert",
13473                   __LINE__,
13474                   PL_reg_name[op],
13475                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13476                     ? "Overwriting end of array!\n" : "OK",
13477                   (UV)(src - RExC_emit_start),
13478                   (UV)(dst - RExC_emit_start),
13479                   (UV)RExC_offsets[0]));
13480             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13481             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13482         }
13483 #endif
13484     }
13485
13486
13487     place = opnd;               /* Op node, where operand used to be. */
13488 #ifdef RE_TRACK_PATTERN_OFFSETS
13489     if (RExC_offsets) {         /* MJD */
13490         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13491               "reginsert",
13492               __LINE__,
13493               PL_reg_name[op],
13494               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13495               ? "Overwriting end of array!\n" : "OK",
13496               (UV)(place - RExC_emit_start),
13497               (UV)(RExC_parse - RExC_start),
13498               (UV)RExC_offsets[0]));
13499         Set_Node_Offset(place, RExC_parse);
13500         Set_Node_Length(place, 1);
13501     }
13502 #endif
13503     src = NEXTOPER(place);
13504     FILL_ADVANCE_NODE(place, op);
13505     Zero(src, offset, regnode);
13506 }
13507
13508 /*
13509 - regtail - set the next-pointer at the end of a node chain of p to val.
13510 - SEE ALSO: regtail_study
13511 */
13512 /* TODO: All three parms should be const */
13513 STATIC void
13514 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13515 {
13516     dVAR;
13517     regnode *scan;
13518     GET_RE_DEBUG_FLAGS_DECL;
13519
13520     PERL_ARGS_ASSERT_REGTAIL;
13521 #ifndef DEBUGGING
13522     PERL_UNUSED_ARG(depth);
13523 #endif
13524
13525     if (SIZE_ONLY)
13526         return;
13527
13528     /* Find last node. */
13529     scan = p;
13530     for (;;) {
13531         regnode * const temp = regnext(scan);
13532         DEBUG_PARSE_r({
13533             SV * const mysv=sv_newmortal();
13534             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13535             regprop(RExC_rx, mysv, scan);
13536             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13537                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13538                     (temp == NULL ? "->" : ""),
13539                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13540             );
13541         });
13542         if (temp == NULL)
13543             break;
13544         scan = temp;
13545     }
13546
13547     if (reg_off_by_arg[OP(scan)]) {
13548         ARG_SET(scan, val - scan);
13549     }
13550     else {
13551         NEXT_OFF(scan) = val - scan;
13552     }
13553 }
13554
13555 #ifdef DEBUGGING
13556 /*
13557 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13558 - Look for optimizable sequences at the same time.
13559 - currently only looks for EXACT chains.
13560
13561 This is experimental code. The idea is to use this routine to perform
13562 in place optimizations on branches and groups as they are constructed,
13563 with the long term intention of removing optimization from study_chunk so
13564 that it is purely analytical.
13565
13566 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13567 to control which is which.
13568
13569 */
13570 /* TODO: All four parms should be const */
13571
13572 STATIC U8
13573 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13574 {
13575     dVAR;
13576     regnode *scan;
13577     U8 exact = PSEUDO;
13578 #ifdef EXPERIMENTAL_INPLACESCAN
13579     I32 min = 0;
13580 #endif
13581     GET_RE_DEBUG_FLAGS_DECL;
13582
13583     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13584
13585
13586     if (SIZE_ONLY)
13587         return exact;
13588
13589     /* Find last node. */
13590
13591     scan = p;
13592     for (;;) {
13593         regnode * const temp = regnext(scan);
13594 #ifdef EXPERIMENTAL_INPLACESCAN
13595         if (PL_regkind[OP(scan)] == EXACT) {
13596             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13597             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13598                 return EXACT;
13599         }
13600 #endif
13601         if ( exact ) {
13602             switch (OP(scan)) {
13603                 case EXACT:
13604                 case EXACTF:
13605                 case EXACTFA:
13606                 case EXACTFU:
13607                 case EXACTFU_SS:
13608                 case EXACTFU_TRICKYFOLD:
13609                 case EXACTFL:
13610                         if( exact == PSEUDO )
13611                             exact= OP(scan);
13612                         else if ( exact != OP(scan) )
13613                             exact= 0;
13614                 case NOTHING:
13615                     break;
13616                 default:
13617                     exact= 0;
13618             }
13619         }
13620         DEBUG_PARSE_r({
13621             SV * const mysv=sv_newmortal();
13622             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13623             regprop(RExC_rx, mysv, scan);
13624             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13625                 SvPV_nolen_const(mysv),
13626                 REG_NODE_NUM(scan),
13627                 PL_reg_name[exact]);
13628         });
13629         if (temp == NULL)
13630             break;
13631         scan = temp;
13632     }
13633     DEBUG_PARSE_r({
13634         SV * const mysv_val=sv_newmortal();
13635         DEBUG_PARSE_MSG("");
13636         regprop(RExC_rx, mysv_val, val);
13637         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13638                       SvPV_nolen_const(mysv_val),
13639                       (IV)REG_NODE_NUM(val),
13640                       (IV)(val - scan)
13641         );
13642     });
13643     if (reg_off_by_arg[OP(scan)]) {
13644         ARG_SET(scan, val - scan);
13645     }
13646     else {
13647         NEXT_OFF(scan) = val - scan;
13648     }
13649
13650     return exact;
13651 }
13652 #endif
13653
13654 /*
13655  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13656  */
13657 #ifdef DEBUGGING
13658 static void
13659 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13660 {
13661     int bit;
13662     int set=0;
13663     regex_charset cs;
13664
13665     for (bit=0; bit<32; bit++) {
13666         if (flags & (1<<bit)) {
13667             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13668                 continue;
13669             }
13670             if (!set++ && lead)
13671                 PerlIO_printf(Perl_debug_log, "%s",lead);
13672             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13673         }
13674     }
13675     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13676             if (!set++ && lead) {
13677                 PerlIO_printf(Perl_debug_log, "%s",lead);
13678             }
13679             switch (cs) {
13680                 case REGEX_UNICODE_CHARSET:
13681                     PerlIO_printf(Perl_debug_log, "UNICODE");
13682                     break;
13683                 case REGEX_LOCALE_CHARSET:
13684                     PerlIO_printf(Perl_debug_log, "LOCALE");
13685                     break;
13686                 case REGEX_ASCII_RESTRICTED_CHARSET:
13687                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13688                     break;
13689                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13690                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13691                     break;
13692                 default:
13693                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13694                     break;
13695             }
13696     }
13697     if (lead)  {
13698         if (set)
13699             PerlIO_printf(Perl_debug_log, "\n");
13700         else
13701             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13702     }
13703 }
13704 #endif
13705
13706 void
13707 Perl_regdump(pTHX_ const regexp *r)
13708 {
13709 #ifdef DEBUGGING
13710     dVAR;
13711     SV * const sv = sv_newmortal();
13712     SV *dsv= sv_newmortal();
13713     RXi_GET_DECL(r,ri);
13714     GET_RE_DEBUG_FLAGS_DECL;
13715
13716     PERL_ARGS_ASSERT_REGDUMP;
13717
13718     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13719
13720     /* Header fields of interest. */
13721     if (r->anchored_substr) {
13722         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13723             RE_SV_DUMPLEN(r->anchored_substr), 30);
13724         PerlIO_printf(Perl_debug_log,
13725                       "anchored %s%s at %"IVdf" ",
13726                       s, RE_SV_TAIL(r->anchored_substr),
13727                       (IV)r->anchored_offset);
13728     } else if (r->anchored_utf8) {
13729         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13730             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13731         PerlIO_printf(Perl_debug_log,
13732                       "anchored utf8 %s%s at %"IVdf" ",
13733                       s, RE_SV_TAIL(r->anchored_utf8),
13734                       (IV)r->anchored_offset);
13735     }
13736     if (r->float_substr) {
13737         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13738             RE_SV_DUMPLEN(r->float_substr), 30);
13739         PerlIO_printf(Perl_debug_log,
13740                       "floating %s%s at %"IVdf"..%"UVuf" ",
13741                       s, RE_SV_TAIL(r->float_substr),
13742                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13743     } else if (r->float_utf8) {
13744         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13745             RE_SV_DUMPLEN(r->float_utf8), 30);
13746         PerlIO_printf(Perl_debug_log,
13747                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13748                       s, RE_SV_TAIL(r->float_utf8),
13749                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13750     }
13751     if (r->check_substr || r->check_utf8)
13752         PerlIO_printf(Perl_debug_log,
13753                       (const char *)
13754                       (r->check_substr == r->float_substr
13755                        && r->check_utf8 == r->float_utf8
13756                        ? "(checking floating" : "(checking anchored"));
13757     if (r->extflags & RXf_NOSCAN)
13758         PerlIO_printf(Perl_debug_log, " noscan");
13759     if (r->extflags & RXf_CHECK_ALL)
13760         PerlIO_printf(Perl_debug_log, " isall");
13761     if (r->check_substr || r->check_utf8)
13762         PerlIO_printf(Perl_debug_log, ") ");
13763
13764     if (ri->regstclass) {
13765         regprop(r, sv, ri->regstclass);
13766         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13767     }
13768     if (r->extflags & RXf_ANCH) {
13769         PerlIO_printf(Perl_debug_log, "anchored");
13770         if (r->extflags & RXf_ANCH_BOL)
13771             PerlIO_printf(Perl_debug_log, "(BOL)");
13772         if (r->extflags & RXf_ANCH_MBOL)
13773             PerlIO_printf(Perl_debug_log, "(MBOL)");
13774         if (r->extflags & RXf_ANCH_SBOL)
13775             PerlIO_printf(Perl_debug_log, "(SBOL)");
13776         if (r->extflags & RXf_ANCH_GPOS)
13777             PerlIO_printf(Perl_debug_log, "(GPOS)");
13778         PerlIO_putc(Perl_debug_log, ' ');
13779     }
13780     if (r->extflags & RXf_GPOS_SEEN)
13781         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13782     if (r->intflags & PREGf_SKIP)
13783         PerlIO_printf(Perl_debug_log, "plus ");
13784     if (r->intflags & PREGf_IMPLICIT)
13785         PerlIO_printf(Perl_debug_log, "implicit ");
13786     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13787     if (r->extflags & RXf_EVAL_SEEN)
13788         PerlIO_printf(Perl_debug_log, "with eval ");
13789     PerlIO_printf(Perl_debug_log, "\n");
13790     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13791 #else
13792     PERL_ARGS_ASSERT_REGDUMP;
13793     PERL_UNUSED_CONTEXT;
13794     PERL_UNUSED_ARG(r);
13795 #endif  /* DEBUGGING */
13796 }
13797
13798 /*
13799 - regprop - printable representation of opcode
13800 */
13801 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13802 STMT_START { \
13803         if (do_sep) {                           \
13804             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13805             if (flags & ANYOF_INVERT)           \
13806                 /*make sure the invert info is in each */ \
13807                 sv_catpvs(sv, "^");             \
13808             do_sep = 0;                         \
13809         }                                       \
13810 } STMT_END
13811
13812 void
13813 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13814 {
13815 #ifdef DEBUGGING
13816     dVAR;
13817     int k;
13818
13819     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13820     static const char * const anyofs[] = {
13821 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
13822     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 || _CC_ALNUMC != 7 \
13823     || _CC_GRAPH != 8 || _CC_SPACE != 9 || _CC_BLANK != 10 \
13824     || _CC_XDIGIT != 11 || _CC_PSXSPC != 12 || _CC_CNTRL != 13 \
13825     || _CC_ASCII != 14 || _CC_VERTSPACE != 15
13826   #error Need to adjust order of anyofs[]
13827 #endif
13828         "[\\w]",
13829         "[\\W]",
13830         "[\\d]",
13831         "[\\D]",
13832         "[:alpha:]",
13833         "[:^alpha:]",
13834         "[:lower:]",
13835         "[:^lower:]",
13836         "[:upper:]",
13837         "[:^upper:]",
13838         "[:punct:]",
13839         "[:^punct:]",
13840         "[:print:]",
13841         "[:^print:]",
13842         "[:alnum:]",
13843         "[:^alnum:]",
13844         "[:graph:]",
13845         "[:^graph:]",
13846         "[\\s]",
13847         "[\\S]",
13848         "[:blank:]",
13849         "[:^blank:]",
13850         "[:xdigit:]",
13851         "[:^xdigit:]",
13852         "[:space:]",
13853         "[:^space:]",
13854         "[:cntrl:]",
13855         "[:^cntrl:]",
13856         "[:ascii:]",
13857         "[:^ascii:]",
13858         "[\\v]",
13859         "[\\V]"
13860     };
13861     RXi_GET_DECL(prog,progi);
13862     GET_RE_DEBUG_FLAGS_DECL;
13863
13864     PERL_ARGS_ASSERT_REGPROP;
13865
13866     sv_setpvs(sv, "");
13867
13868     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13869         /* It would be nice to FAIL() here, but this may be called from
13870            regexec.c, and it would be hard to supply pRExC_state. */
13871         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13872     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13873
13874     k = PL_regkind[OP(o)];
13875
13876     if (k == EXACT) {
13877         sv_catpvs(sv, " ");
13878         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13879          * is a crude hack but it may be the best for now since
13880          * we have no flag "this EXACTish node was UTF-8"
13881          * --jhi */
13882         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13883                   PERL_PV_ESCAPE_UNI_DETECT |
13884                   PERL_PV_ESCAPE_NONASCII   |
13885                   PERL_PV_PRETTY_ELLIPSES   |
13886                   PERL_PV_PRETTY_LTGT       |
13887                   PERL_PV_PRETTY_NOCLEAR
13888                   );
13889     } else if (k == TRIE) {
13890         /* print the details of the trie in dumpuntil instead, as
13891          * progi->data isn't available here */
13892         const char op = OP(o);
13893         const U32 n = ARG(o);
13894         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13895                (reg_ac_data *)progi->data->data[n] :
13896                NULL;
13897         const reg_trie_data * const trie
13898             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13899
13900         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13901         DEBUG_TRIE_COMPILE_r(
13902             Perl_sv_catpvf(aTHX_ sv,
13903                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13904                 (UV)trie->startstate,
13905                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13906                 (UV)trie->wordcount,
13907                 (UV)trie->minlen,
13908                 (UV)trie->maxlen,
13909                 (UV)TRIE_CHARCOUNT(trie),
13910                 (UV)trie->uniquecharcount
13911             )
13912         );
13913         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13914             int i;
13915             int rangestart = -1;
13916             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13917             sv_catpvs(sv, "[");
13918             for (i = 0; i <= 256; i++) {
13919                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13920                     if (rangestart == -1)
13921                         rangestart = i;
13922                 } else if (rangestart != -1) {
13923                     if (i <= rangestart + 3)
13924                         for (; rangestart < i; rangestart++)
13925                             put_byte(sv, rangestart);
13926                     else {
13927                         put_byte(sv, rangestart);
13928                         sv_catpvs(sv, "-");
13929                         put_byte(sv, i - 1);
13930                     }
13931                     rangestart = -1;
13932                 }
13933             }
13934             sv_catpvs(sv, "]");
13935         }
13936
13937     } else if (k == CURLY) {
13938         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13939             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13940         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13941     }
13942     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13943         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13944     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13945         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13946         if ( RXp_PAREN_NAMES(prog) ) {
13947             if ( k != REF || (OP(o) < NREF)) {
13948                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13949                 SV **name= av_fetch(list, ARG(o), 0 );
13950                 if (name)
13951                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13952             }
13953             else {
13954                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13955                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13956                 I32 *nums=(I32*)SvPVX(sv_dat);
13957                 SV **name= av_fetch(list, nums[0], 0 );
13958                 I32 n;
13959                 if (name) {
13960                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13961                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13962                                     (n ? "," : ""), (IV)nums[n]);
13963                     }
13964                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13965                 }
13966             }
13967         }
13968     } else if (k == GOSUB)
13969         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13970     else if (k == VERB) {
13971         if (!o->flags)
13972             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13973                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13974     } else if (k == LOGICAL)
13975         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13976     else if (k == ANYOF) {
13977         int i, rangestart = -1;
13978         const U8 flags = ANYOF_FLAGS(o);
13979         int do_sep = 0;
13980
13981
13982         if (flags & ANYOF_LOCALE)
13983             sv_catpvs(sv, "{loc}");
13984         if (flags & ANYOF_LOC_FOLD)
13985             sv_catpvs(sv, "{i}");
13986         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13987         if (flags & ANYOF_INVERT)
13988             sv_catpvs(sv, "^");
13989
13990         /* output what the standard cp 0-255 bitmap matches */
13991         for (i = 0; i <= 256; i++) {
13992             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13993                 if (rangestart == -1)
13994                     rangestart = i;
13995             } else if (rangestart != -1) {
13996                 if (i <= rangestart + 3)
13997                     for (; rangestart < i; rangestart++)
13998                         put_byte(sv, rangestart);
13999                 else {
14000                     put_byte(sv, rangestart);
14001                     sv_catpvs(sv, "-");
14002                     put_byte(sv, i - 1);
14003                 }
14004                 do_sep = 1;
14005                 rangestart = -1;
14006             }
14007         }
14008
14009         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14010         /* output any special charclass tests (used entirely under use locale) */
14011         if (ANYOF_CLASS_TEST_ANY_SET(o))
14012             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14013                 if (ANYOF_CLASS_TEST(o,i)) {
14014                     sv_catpv(sv, anyofs[i]);
14015                     do_sep = 1;
14016                 }
14017
14018         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14019
14020         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14021             sv_catpvs(sv, "{non-utf8-latin1-all}");
14022         }
14023
14024         /* output information about the unicode matching */
14025         if (flags & ANYOF_UNICODE_ALL)
14026             sv_catpvs(sv, "{unicode_all}");
14027         else if (ANYOF_NONBITMAP(o))
14028             sv_catpvs(sv, "{unicode}");
14029         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14030             sv_catpvs(sv, "{outside bitmap}");
14031
14032         if (ANYOF_NONBITMAP(o)) {
14033             SV *lv; /* Set if there is something outside the bit map */
14034             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14035             bool byte_output = FALSE;   /* If something in the bitmap has been
14036                                            output */
14037
14038             if (lv && lv != &PL_sv_undef) {
14039                 if (sw) {
14040                     U8 s[UTF8_MAXBYTES_CASE+1];
14041
14042                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14043                         uvchr_to_utf8(s, i);
14044
14045                         if (i < 256
14046                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14047                                                                things already
14048                                                                output as part
14049                                                                of the bitmap */
14050                             && swash_fetch(sw, s, TRUE))
14051                         {
14052                             if (rangestart == -1)
14053                                 rangestart = i;
14054                         } else if (rangestart != -1) {
14055                             byte_output = TRUE;
14056                             if (i <= rangestart + 3)
14057                                 for (; rangestart < i; rangestart++) {
14058                                     put_byte(sv, rangestart);
14059                                 }
14060                             else {
14061                                 put_byte(sv, rangestart);
14062                                 sv_catpvs(sv, "-");
14063                                 put_byte(sv, i-1);
14064                             }
14065                             rangestart = -1;
14066                         }
14067                     }
14068                 }
14069
14070                 {
14071                     char *s = savesvpv(lv);
14072                     char * const origs = s;
14073
14074                     while (*s && *s != '\n')
14075                         s++;
14076
14077                     if (*s == '\n') {
14078                         const char * const t = ++s;
14079
14080                         if (byte_output) {
14081                             sv_catpvs(sv, " ");
14082                         }
14083
14084                         while (*s) {
14085                             if (*s == '\n') {
14086
14087                                 /* Truncate very long output */
14088                                 if (s - origs > 256) {
14089                                     Perl_sv_catpvf(aTHX_ sv,
14090                                                    "%.*s...",
14091                                                    (int) (s - origs - 1),
14092                                                    t);
14093                                     goto out_dump;
14094                                 }
14095                                 *s = ' ';
14096                             }
14097                             else if (*s == '\t') {
14098                                 *s = '-';
14099                             }
14100                             s++;
14101                         }
14102                         if (s[-1] == ' ')
14103                             s[-1] = 0;
14104
14105                         sv_catpv(sv, t);
14106                     }
14107
14108                 out_dump:
14109
14110                     Safefree(origs);
14111                 }
14112                 SvREFCNT_dec(lv);
14113             }
14114         }
14115
14116         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14117     }
14118     else if (k == POSIXD || k == NPOSIXD) {
14119         U8 index = FLAGS(o) * 2;
14120         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14121             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14122         }
14123         else {
14124             sv_catpv(sv, anyofs[index]);
14125         }
14126     }
14127     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14128         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14129 #else
14130     PERL_UNUSED_CONTEXT;
14131     PERL_UNUSED_ARG(sv);
14132     PERL_UNUSED_ARG(o);
14133     PERL_UNUSED_ARG(prog);
14134 #endif  /* DEBUGGING */
14135 }
14136
14137 SV *
14138 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14139 {                               /* Assume that RE_INTUIT is set */
14140     dVAR;
14141     struct regexp *const prog = ReANY(r);
14142     GET_RE_DEBUG_FLAGS_DECL;
14143
14144     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14145     PERL_UNUSED_CONTEXT;
14146
14147     DEBUG_COMPILE_r(
14148         {
14149             const char * const s = SvPV_nolen_const(prog->check_substr
14150                       ? prog->check_substr : prog->check_utf8);
14151
14152             if (!PL_colorset) reginitcolors();
14153             PerlIO_printf(Perl_debug_log,
14154                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14155                       PL_colors[4],
14156                       prog->check_substr ? "" : "utf8 ",
14157                       PL_colors[5],PL_colors[0],
14158                       s,
14159                       PL_colors[1],
14160                       (strlen(s) > 60 ? "..." : ""));
14161         } );
14162
14163     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14164 }
14165
14166 /*
14167    pregfree()
14168
14169    handles refcounting and freeing the perl core regexp structure. When
14170    it is necessary to actually free the structure the first thing it
14171    does is call the 'free' method of the regexp_engine associated to
14172    the regexp, allowing the handling of the void *pprivate; member
14173    first. (This routine is not overridable by extensions, which is why
14174    the extensions free is called first.)
14175
14176    See regdupe and regdupe_internal if you change anything here.
14177 */
14178 #ifndef PERL_IN_XSUB_RE
14179 void
14180 Perl_pregfree(pTHX_ REGEXP *r)
14181 {
14182     SvREFCNT_dec(r);
14183 }
14184
14185 void
14186 Perl_pregfree2(pTHX_ REGEXP *rx)
14187 {
14188     dVAR;
14189     struct regexp *const r = ReANY(rx);
14190     GET_RE_DEBUG_FLAGS_DECL;
14191
14192     PERL_ARGS_ASSERT_PREGFREE2;
14193
14194     if (r->mother_re) {
14195         ReREFCNT_dec(r->mother_re);
14196     } else {
14197         CALLREGFREE_PVT(rx); /* free the private data */
14198         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14199         Safefree(r->xpv_len_u.xpvlenu_pv);
14200     }
14201     if (r->substrs) {
14202         SvREFCNT_dec(r->anchored_substr);
14203         SvREFCNT_dec(r->anchored_utf8);
14204         SvREFCNT_dec(r->float_substr);
14205         SvREFCNT_dec(r->float_utf8);
14206         Safefree(r->substrs);
14207     }
14208     RX_MATCH_COPY_FREE(rx);
14209 #ifdef PERL_ANY_COW
14210     SvREFCNT_dec(r->saved_copy);
14211 #endif
14212     Safefree(r->offs);
14213     SvREFCNT_dec(r->qr_anoncv);
14214     rx->sv_u.svu_rx = 0;
14215 }
14216
14217 /*  reg_temp_copy()
14218
14219     This is a hacky workaround to the structural issue of match results
14220     being stored in the regexp structure which is in turn stored in
14221     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14222     could be PL_curpm in multiple contexts, and could require multiple
14223     result sets being associated with the pattern simultaneously, such
14224     as when doing a recursive match with (??{$qr})
14225
14226     The solution is to make a lightweight copy of the regexp structure
14227     when a qr// is returned from the code executed by (??{$qr}) this
14228     lightweight copy doesn't actually own any of its data except for
14229     the starp/end and the actual regexp structure itself.
14230
14231 */
14232
14233
14234 REGEXP *
14235 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14236 {
14237     struct regexp *ret;
14238     struct regexp *const r = ReANY(rx);
14239     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14240
14241     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14242
14243     if (!ret_x)
14244         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14245     else {
14246         SvOK_off((SV *)ret_x);
14247         if (islv) {
14248             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14249                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14250                made both spots point to the same regexp body.) */
14251             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14252             assert(!SvPVX(ret_x));
14253             ret_x->sv_u.svu_rx = temp->sv_any;
14254             temp->sv_any = NULL;
14255             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14256             SvREFCNT_dec(temp);
14257             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14258                ing below will not set it. */
14259             SvCUR_set(ret_x, SvCUR(rx));
14260         }
14261     }
14262     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14263        sv_force_normal(sv) is called.  */
14264     SvFAKE_on(ret_x);
14265     ret = ReANY(ret_x);
14266
14267     SvFLAGS(ret_x) |= SvUTF8(rx);
14268     /* We share the same string buffer as the original regexp, on which we
14269        hold a reference count, incremented when mother_re is set below.
14270        The string pointer is copied here, being part of the regexp struct.
14271      */
14272     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14273            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14274     if (r->offs) {
14275         const I32 npar = r->nparens+1;
14276         Newx(ret->offs, npar, regexp_paren_pair);
14277         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14278     }
14279     if (r->substrs) {
14280         Newx(ret->substrs, 1, struct reg_substr_data);
14281         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14282
14283         SvREFCNT_inc_void(ret->anchored_substr);
14284         SvREFCNT_inc_void(ret->anchored_utf8);
14285         SvREFCNT_inc_void(ret->float_substr);
14286         SvREFCNT_inc_void(ret->float_utf8);
14287
14288         /* check_substr and check_utf8, if non-NULL, point to either their
14289            anchored or float namesakes, and don't hold a second reference.  */
14290     }
14291     RX_MATCH_COPIED_off(ret_x);
14292 #ifdef PERL_ANY_COW
14293     ret->saved_copy = NULL;
14294 #endif
14295     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14296     SvREFCNT_inc_void(ret->qr_anoncv);
14297
14298     return ret_x;
14299 }
14300 #endif
14301
14302 /* regfree_internal()
14303
14304    Free the private data in a regexp. This is overloadable by
14305    extensions. Perl takes care of the regexp structure in pregfree(),
14306    this covers the *pprivate pointer which technically perl doesn't
14307    know about, however of course we have to handle the
14308    regexp_internal structure when no extension is in use.
14309
14310    Note this is called before freeing anything in the regexp
14311    structure.
14312  */
14313
14314 void
14315 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14316 {
14317     dVAR;
14318     struct regexp *const r = ReANY(rx);
14319     RXi_GET_DECL(r,ri);
14320     GET_RE_DEBUG_FLAGS_DECL;
14321
14322     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14323
14324     DEBUG_COMPILE_r({
14325         if (!PL_colorset)
14326             reginitcolors();
14327         {
14328             SV *dsv= sv_newmortal();
14329             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14330                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14331             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14332                 PL_colors[4],PL_colors[5],s);
14333         }
14334     });
14335 #ifdef RE_TRACK_PATTERN_OFFSETS
14336     if (ri->u.offsets)
14337         Safefree(ri->u.offsets);             /* 20010421 MJD */
14338 #endif
14339     if (ri->code_blocks) {
14340         int n;
14341         for (n = 0; n < ri->num_code_blocks; n++)
14342             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14343         Safefree(ri->code_blocks);
14344     }
14345
14346     if (ri->data) {
14347         int n = ri->data->count;
14348
14349         while (--n >= 0) {
14350           /* If you add a ->what type here, update the comment in regcomp.h */
14351             switch (ri->data->what[n]) {
14352             case 'a':
14353             case 'r':
14354             case 's':
14355             case 'S':
14356             case 'u':
14357                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14358                 break;
14359             case 'f':
14360                 Safefree(ri->data->data[n]);
14361                 break;
14362             case 'l':
14363             case 'L':
14364                 break;
14365             case 'T':
14366                 { /* Aho Corasick add-on structure for a trie node.
14367                      Used in stclass optimization only */
14368                     U32 refcount;
14369                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14370                     OP_REFCNT_LOCK;
14371                     refcount = --aho->refcount;
14372                     OP_REFCNT_UNLOCK;
14373                     if ( !refcount ) {
14374                         PerlMemShared_free(aho->states);
14375                         PerlMemShared_free(aho->fail);
14376                          /* do this last!!!! */
14377                         PerlMemShared_free(ri->data->data[n]);
14378                         PerlMemShared_free(ri->regstclass);
14379                     }
14380                 }
14381                 break;
14382             case 't':
14383                 {
14384                     /* trie structure. */
14385                     U32 refcount;
14386                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14387                     OP_REFCNT_LOCK;
14388                     refcount = --trie->refcount;
14389                     OP_REFCNT_UNLOCK;
14390                     if ( !refcount ) {
14391                         PerlMemShared_free(trie->charmap);
14392                         PerlMemShared_free(trie->states);
14393                         PerlMemShared_free(trie->trans);
14394                         if (trie->bitmap)
14395                             PerlMemShared_free(trie->bitmap);
14396                         if (trie->jump)
14397                             PerlMemShared_free(trie->jump);
14398                         PerlMemShared_free(trie->wordinfo);
14399                         /* do this last!!!! */
14400                         PerlMemShared_free(ri->data->data[n]);
14401                     }
14402                 }
14403                 break;
14404             default:
14405                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14406             }
14407         }
14408         Safefree(ri->data->what);
14409         Safefree(ri->data);
14410     }
14411
14412     Safefree(ri);
14413 }
14414
14415 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14416 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14417 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14418
14419 /*
14420    re_dup - duplicate a regexp.
14421
14422    This routine is expected to clone a given regexp structure. It is only
14423    compiled under USE_ITHREADS.
14424
14425    After all of the core data stored in struct regexp is duplicated
14426    the regexp_engine.dupe method is used to copy any private data
14427    stored in the *pprivate pointer. This allows extensions to handle
14428    any duplication it needs to do.
14429
14430    See pregfree() and regfree_internal() if you change anything here.
14431 */
14432 #if defined(USE_ITHREADS)
14433 #ifndef PERL_IN_XSUB_RE
14434 void
14435 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14436 {
14437     dVAR;
14438     I32 npar;
14439     const struct regexp *r = ReANY(sstr);
14440     struct regexp *ret = ReANY(dstr);
14441
14442     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14443
14444     npar = r->nparens+1;
14445     Newx(ret->offs, npar, regexp_paren_pair);
14446     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14447     if(ret->swap) {
14448         /* no need to copy these */
14449         Newx(ret->swap, npar, regexp_paren_pair);
14450     }
14451
14452     if (ret->substrs) {
14453         /* Do it this way to avoid reading from *r after the StructCopy().
14454            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14455            cache, it doesn't matter.  */
14456         const bool anchored = r->check_substr
14457             ? r->check_substr == r->anchored_substr
14458             : r->check_utf8 == r->anchored_utf8;
14459         Newx(ret->substrs, 1, struct reg_substr_data);
14460         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14461
14462         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14463         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14464         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14465         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14466
14467         /* check_substr and check_utf8, if non-NULL, point to either their
14468            anchored or float namesakes, and don't hold a second reference.  */
14469
14470         if (ret->check_substr) {
14471             if (anchored) {
14472                 assert(r->check_utf8 == r->anchored_utf8);
14473                 ret->check_substr = ret->anchored_substr;
14474                 ret->check_utf8 = ret->anchored_utf8;
14475             } else {
14476                 assert(r->check_substr == r->float_substr);
14477                 assert(r->check_utf8 == r->float_utf8);
14478                 ret->check_substr = ret->float_substr;
14479                 ret->check_utf8 = ret->float_utf8;
14480             }
14481         } else if (ret->check_utf8) {
14482             if (anchored) {
14483                 ret->check_utf8 = ret->anchored_utf8;
14484             } else {
14485                 ret->check_utf8 = ret->float_utf8;
14486             }
14487         }
14488     }
14489
14490     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14491     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14492
14493     if (ret->pprivate)
14494         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14495
14496     if (RX_MATCH_COPIED(dstr))
14497         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14498     else
14499         ret->subbeg = NULL;
14500 #ifdef PERL_ANY_COW
14501     ret->saved_copy = NULL;
14502 #endif
14503
14504     /* Whether mother_re be set or no, we need to copy the string.  We
14505        cannot refrain from copying it when the storage points directly to
14506        our mother regexp, because that's
14507                1: a buffer in a different thread
14508                2: something we no longer hold a reference on
14509                so we need to copy it locally.  */
14510     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14511     ret->mother_re   = NULL;
14512     ret->gofs = 0;
14513 }
14514 #endif /* PERL_IN_XSUB_RE */
14515
14516 /*
14517    regdupe_internal()
14518
14519    This is the internal complement to regdupe() which is used to copy
14520    the structure pointed to by the *pprivate pointer in the regexp.
14521    This is the core version of the extension overridable cloning hook.
14522    The regexp structure being duplicated will be copied by perl prior
14523    to this and will be provided as the regexp *r argument, however
14524    with the /old/ structures pprivate pointer value. Thus this routine
14525    may override any copying normally done by perl.
14526
14527    It returns a pointer to the new regexp_internal structure.
14528 */
14529
14530 void *
14531 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14532 {
14533     dVAR;
14534     struct regexp *const r = ReANY(rx);
14535     regexp_internal *reti;
14536     int len;
14537     RXi_GET_DECL(r,ri);
14538
14539     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14540
14541     len = ProgLen(ri);
14542
14543     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14544     Copy(ri->program, reti->program, len+1, regnode);
14545
14546     reti->num_code_blocks = ri->num_code_blocks;
14547     if (ri->code_blocks) {
14548         int n;
14549         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14550                 struct reg_code_block);
14551         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14552                 struct reg_code_block);
14553         for (n = 0; n < ri->num_code_blocks; n++)
14554              reti->code_blocks[n].src_regex = (REGEXP*)
14555                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14556     }
14557     else
14558         reti->code_blocks = NULL;
14559
14560     reti->regstclass = NULL;
14561
14562     if (ri->data) {
14563         struct reg_data *d;
14564         const int count = ri->data->count;
14565         int i;
14566
14567         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14568                 char, struct reg_data);
14569         Newx(d->what, count, U8);
14570
14571         d->count = count;
14572         for (i = 0; i < count; i++) {
14573             d->what[i] = ri->data->what[i];
14574             switch (d->what[i]) {
14575                 /* see also regcomp.h and regfree_internal() */
14576             case 'a': /* actually an AV, but the dup function is identical.  */
14577             case 'r':
14578             case 's':
14579             case 'S':
14580             case 'u': /* actually an HV, but the dup function is identical.  */
14581                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14582                 break;
14583             case 'f':
14584                 /* This is cheating. */
14585                 Newx(d->data[i], 1, struct regnode_charclass_class);
14586                 StructCopy(ri->data->data[i], d->data[i],
14587                             struct regnode_charclass_class);
14588                 reti->regstclass = (regnode*)d->data[i];
14589                 break;
14590             case 'T':
14591                 /* Trie stclasses are readonly and can thus be shared
14592                  * without duplication. We free the stclass in pregfree
14593                  * when the corresponding reg_ac_data struct is freed.
14594                  */
14595                 reti->regstclass= ri->regstclass;
14596                 /* Fall through */
14597             case 't':
14598                 OP_REFCNT_LOCK;
14599                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14600                 OP_REFCNT_UNLOCK;
14601                 /* Fall through */
14602             case 'l':
14603             case 'L':
14604                 d->data[i] = ri->data->data[i];
14605                 break;
14606             default:
14607                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14608             }
14609         }
14610
14611         reti->data = d;
14612     }
14613     else
14614         reti->data = NULL;
14615
14616     reti->name_list_idx = ri->name_list_idx;
14617
14618 #ifdef RE_TRACK_PATTERN_OFFSETS
14619     if (ri->u.offsets) {
14620         Newx(reti->u.offsets, 2*len+1, U32);
14621         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14622     }
14623 #else
14624     SetProgLen(reti,len);
14625 #endif
14626
14627     return (void*)reti;
14628 }
14629
14630 #endif    /* USE_ITHREADS */
14631
14632 #ifndef PERL_IN_XSUB_RE
14633
14634 /*
14635  - regnext - dig the "next" pointer out of a node
14636  */
14637 regnode *
14638 Perl_regnext(pTHX_ regnode *p)
14639 {
14640     dVAR;
14641     I32 offset;
14642
14643     if (!p)
14644         return(NULL);
14645
14646     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14647         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14648     }
14649
14650     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14651     if (offset == 0)
14652         return(NULL);
14653
14654     return(p+offset);
14655 }
14656 #endif
14657
14658 STATIC void
14659 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14660 {
14661     va_list args;
14662     STRLEN l1 = strlen(pat1);
14663     STRLEN l2 = strlen(pat2);
14664     char buf[512];
14665     SV *msv;
14666     const char *message;
14667
14668     PERL_ARGS_ASSERT_RE_CROAK2;
14669
14670     if (l1 > 510)
14671         l1 = 510;
14672     if (l1 + l2 > 510)
14673         l2 = 510 - l1;
14674     Copy(pat1, buf, l1 , char);
14675     Copy(pat2, buf + l1, l2 , char);
14676     buf[l1 + l2] = '\n';
14677     buf[l1 + l2 + 1] = '\0';
14678 #ifdef I_STDARG
14679     /* ANSI variant takes additional second argument */
14680     va_start(args, pat2);
14681 #else
14682     va_start(args);
14683 #endif
14684     msv = vmess(buf, &args);
14685     va_end(args);
14686     message = SvPV_const(msv,l1);
14687     if (l1 > 512)
14688         l1 = 512;
14689     Copy(message, buf, l1 , char);
14690     buf[l1-1] = '\0';                   /* Overwrite \n */
14691     Perl_croak(aTHX_ "%s", buf);
14692 }
14693
14694 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14695
14696 #ifndef PERL_IN_XSUB_RE
14697 void
14698 Perl_save_re_context(pTHX)
14699 {
14700     dVAR;
14701
14702     struct re_save_state *state;
14703
14704     SAVEVPTR(PL_curcop);
14705     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14706
14707     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14708     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14709     SSPUSHUV(SAVEt_RE_STATE);
14710
14711     Copy(&PL_reg_state, state, 1, struct re_save_state);
14712
14713     PL_reg_oldsaved = NULL;
14714     PL_reg_oldsavedlen = 0;
14715     PL_reg_oldsavedoffset = 0;
14716     PL_reg_oldsavedcoffset = 0;
14717     PL_reg_maxiter = 0;
14718     PL_reg_leftiter = 0;
14719     PL_reg_poscache = NULL;
14720     PL_reg_poscache_size = 0;
14721 #ifdef PERL_ANY_COW
14722     PL_nrs = NULL;
14723 #endif
14724
14725     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14726     if (PL_curpm) {
14727         const REGEXP * const rx = PM_GETRE(PL_curpm);
14728         if (rx) {
14729             U32 i;
14730             for (i = 1; i <= RX_NPARENS(rx); i++) {
14731                 char digits[TYPE_CHARS(long)];
14732                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14733                 GV *const *const gvp
14734                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14735
14736                 if (gvp) {
14737                     GV * const gv = *gvp;
14738                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14739                         save_scalar(gv);
14740                 }
14741             }
14742         }
14743     }
14744 }
14745 #endif
14746
14747 #ifdef DEBUGGING
14748
14749 STATIC void
14750 S_put_byte(pTHX_ SV *sv, int c)
14751 {
14752     PERL_ARGS_ASSERT_PUT_BYTE;
14753
14754     /* Our definition of isPRINT() ignores locales, so only bytes that are
14755        not part of UTF-8 are considered printable. I assume that the same
14756        holds for UTF-EBCDIC.
14757        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14758        which Wikipedia says:
14759
14760        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14761        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14762        identical, to the ASCII delete (DEL) or rubout control character.
14763        ) So the old condition can be simplified to !isPRINT(c)  */
14764     if (!isPRINT(c)) {
14765         if (c < 256) {
14766             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14767         }
14768         else {
14769             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14770         }
14771     }
14772     else {
14773         const char string = c;
14774         if (c == '-' || c == ']' || c == '\\' || c == '^')
14775             sv_catpvs(sv, "\\");
14776         sv_catpvn(sv, &string, 1);
14777     }
14778 }
14779
14780
14781 #define CLEAR_OPTSTART \
14782     if (optstart) STMT_START { \
14783             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14784             optstart=NULL; \
14785     } STMT_END
14786
14787 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14788
14789 STATIC const regnode *
14790 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14791             const regnode *last, const regnode *plast,
14792             SV* sv, I32 indent, U32 depth)
14793 {
14794     dVAR;
14795     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14796     const regnode *next;
14797     const regnode *optstart= NULL;
14798
14799     RXi_GET_DECL(r,ri);
14800     GET_RE_DEBUG_FLAGS_DECL;
14801
14802     PERL_ARGS_ASSERT_DUMPUNTIL;
14803
14804 #ifdef DEBUG_DUMPUNTIL
14805     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14806         last ? last-start : 0,plast ? plast-start : 0);
14807 #endif
14808
14809     if (plast && plast < last)
14810         last= plast;
14811
14812     while (PL_regkind[op] != END && (!last || node < last)) {
14813         /* While that wasn't END last time... */
14814         NODE_ALIGN(node);
14815         op = OP(node);
14816         if (op == CLOSE || op == WHILEM)
14817             indent--;
14818         next = regnext((regnode *)node);
14819
14820         /* Where, what. */
14821         if (OP(node) == OPTIMIZED) {
14822             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14823                 optstart = node;
14824             else
14825                 goto after_print;
14826         } else
14827             CLEAR_OPTSTART;
14828
14829         regprop(r, sv, node);
14830         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14831                       (int)(2*indent + 1), "", SvPVX_const(sv));
14832
14833         if (OP(node) != OPTIMIZED) {
14834             if (next == NULL)           /* Next ptr. */
14835                 PerlIO_printf(Perl_debug_log, " (0)");
14836             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14837                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14838             else
14839                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14840             (void)PerlIO_putc(Perl_debug_log, '\n');
14841         }
14842
14843       after_print:
14844         if (PL_regkind[(U8)op] == BRANCHJ) {
14845             assert(next);
14846             {
14847                 const regnode *nnode = (OP(next) == LONGJMP
14848                                        ? regnext((regnode *)next)
14849                                        : next);
14850                 if (last && nnode > last)
14851                     nnode = last;
14852                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14853             }
14854         }
14855         else if (PL_regkind[(U8)op] == BRANCH) {
14856             assert(next);
14857             DUMPUNTIL(NEXTOPER(node), next);
14858         }
14859         else if ( PL_regkind[(U8)op]  == TRIE ) {
14860             const regnode *this_trie = node;
14861             const char op = OP(node);
14862             const U32 n = ARG(node);
14863             const reg_ac_data * const ac = op>=AHOCORASICK ?
14864                (reg_ac_data *)ri->data->data[n] :
14865                NULL;
14866             const reg_trie_data * const trie =
14867                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14868 #ifdef DEBUGGING
14869             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14870 #endif
14871             const regnode *nextbranch= NULL;
14872             I32 word_idx;
14873             sv_setpvs(sv, "");
14874             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14875                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14876
14877                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14878                    (int)(2*(indent+3)), "",
14879                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14880                             PL_colors[0], PL_colors[1],
14881                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14882                             PERL_PV_PRETTY_ELLIPSES    |
14883                             PERL_PV_PRETTY_LTGT
14884                             )
14885                             : "???"
14886                 );
14887                 if (trie->jump) {
14888                     U16 dist= trie->jump[word_idx+1];
14889                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14890                                   (UV)((dist ? this_trie + dist : next) - start));
14891                     if (dist) {
14892                         if (!nextbranch)
14893                             nextbranch= this_trie + trie->jump[0];
14894                         DUMPUNTIL(this_trie + dist, nextbranch);
14895                     }
14896                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14897                         nextbranch= regnext((regnode *)nextbranch);
14898                 } else {
14899                     PerlIO_printf(Perl_debug_log, "\n");
14900                 }
14901             }
14902             if (last && next > last)
14903                 node= last;
14904             else
14905                 node= next;
14906         }
14907         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14908             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14909                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14910         }
14911         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14912             assert(next);
14913             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14914         }
14915         else if ( op == PLUS || op == STAR) {
14916             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14917         }
14918         else if (PL_regkind[(U8)op] == ANYOF) {
14919             /* arglen 1 + class block */
14920             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14921                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14922             node = NEXTOPER(node);
14923         }
14924         else if (PL_regkind[(U8)op] == EXACT) {
14925             /* Literal string, where present. */
14926             node += NODE_SZ_STR(node) - 1;
14927             node = NEXTOPER(node);
14928         }
14929         else {
14930             node = NEXTOPER(node);
14931             node += regarglen[(U8)op];
14932         }
14933         if (op == CURLYX || op == OPEN)
14934             indent++;
14935     }
14936     CLEAR_OPTSTART;
14937 #ifdef DEBUG_DUMPUNTIL
14938     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14939 #endif
14940     return node;
14941 }
14942
14943 #endif  /* DEBUGGING */
14944
14945 /*
14946  * Local variables:
14947  * c-indentation-style: bsd
14948  * c-basic-offset: 4
14949  * indent-tabs-mode: nil
14950  * End:
14951  *
14952  * ex: set ts=8 sts=4 sw=4 et:
14953  */