regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #ifdef HAS_ISBLANK
  95 #   define hasISBLANK 1
  96 #else
  97 #   define hasISBLANK 0
  98 #endif
  99
 100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 103
 104 #ifdef op
 105 #undef op
 106 #endif /* op */
 107
 108 #ifdef MSDOS
 109 #  if defined(BUGGY_MSC6)
 110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 111 #    pragma optimize("a",off)
 112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 113 #    pragma optimize("w",on )
 114 #  endif /* BUGGY_MSC6 */
 115 #endif /* MSDOS */
 116
 117 #ifndef STATIC
 118 #define STATIC  static
 119 #endif
 120
 121
 122 typedef struct RExC_state_t {
 123     U32         flags;                  /* RXf_* are we folding, multilining? */
 124     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 125     char        *precomp;               /* uncompiled string. */
 126     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 127     regexp      *rx;                    /* perl core regexp structure */
 128     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     I32         whilem_seen;            /* number of WHILEM in this expr */
 133     regnode     *emit_start;            /* Start of emitted-code area */
 134     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 135     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 136     I32         naughty;                /* How bad is this pattern? */
 137     I32         sawback;                /* Did we see \1, ...? */
 138     U32         seen;
 139     I32         size;                   /* Code size. */
 140     I32         npar;                   /* Capture buffer count, (OPEN). */
 141     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 142     I32         nestroot;               /* root parens we are in - used by accept */
 143     I32         extralen;
 144     I32         seen_zerolen;
 145     regnode     **open_parens;          /* pointers to open parens */
 146     regnode     **close_parens;         /* pointers to close parens */
 147     regnode     *opend;                 /* END node in program */
 148     I32         utf8;           /* whether the pattern is utf8 or not */
 149     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 150                                 /* XXX use this for future optimisation of case
 151                                  * where pattern must be upgraded to utf8. */
 152     I32         uni_semantics;  /* If a d charset modifier should use unicode
 153                                    rules, even if the pattern is not in
 154                                    utf8 */
 155     HV          *paren_names;           /* Paren names */
 156
 157     regnode     **recurse;              /* Recurse regops */
 158     I32         recurse_count;          /* Number of recurse regops */
 159     I32         in_lookbehind;
 160     I32         contains_locale;
 161     I32         override_recoding;
 162     I32         in_multi_char_class;
 163     struct reg_code_block *code_blocks; /* positions of literal (?{})
 164                                             within pattern */
 165     int         num_code_blocks;        /* size of code_blocks[] */
 166     int         code_index;             /* next code_blocks[] slot */
 167 #if ADD_TO_REGEXEC
 168     char        *starttry;              /* -Dr: where regtry was called. */
 169 #define RExC_starttry   (pRExC_state->starttry)
 170 #endif
 171     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 172 #ifdef DEBUGGING
 173     const char  *lastparse;
 174     I32         lastnum;
 175     AV          *paren_name_list;       /* idx -> name */
 176 #define RExC_lastparse  (pRExC_state->lastparse)
 177 #define RExC_lastnum    (pRExC_state->lastnum)
 178 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 179 #endif
 180 } RExC_state_t;
 181
 182 #define RExC_flags      (pRExC_state->flags)
 183 #define RExC_pm_flags   (pRExC_state->pm_flags)
 184 #define RExC_precomp    (pRExC_state->precomp)
 185 #define RExC_rx_sv      (pRExC_state->rx_sv)
 186 #define RExC_rx         (pRExC_state->rx)
 187 #define RExC_rxi        (pRExC_state->rxi)
 188 #define RExC_start      (pRExC_state->start)
 189 #define RExC_end        (pRExC_state->end)
 190 #define RExC_parse      (pRExC_state->parse)
 191 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 192 #ifdef RE_TRACK_PATTERN_OFFSETS
 193 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 194 #endif
 195 #define RExC_emit       (pRExC_state->emit)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_npar       (pRExC_state->npar)
 203 #define RExC_nestroot   (pRExC_state->nestroot)
 204 #define RExC_extralen   (pRExC_state->extralen)
 205 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 206 #define RExC_utf8       (pRExC_state->utf8)
 207 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 208 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 209 #define RExC_open_parens        (pRExC_state->open_parens)
 210 #define RExC_close_parens       (pRExC_state->close_parens)
 211 #define RExC_opend      (pRExC_state->opend)
 212 #define RExC_paren_names        (pRExC_state->paren_names)
 213 #define RExC_recurse    (pRExC_state->recurse)
 214 #define RExC_recurse_count      (pRExC_state->recurse_count)
 215 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 216 #define RExC_contains_locale    (pRExC_state->contains_locale)
 217 #define RExC_override_recoding (pRExC_state->override_recoding)
 218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 219
 220
 221 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 222 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 223         ((*s) == '{' && regcurly(s)))
 224
 225 #ifdef SPSTART
 226 #undef SPSTART          /* dratted cpp namespace... */
 227 #endif
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 241 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 242
 243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 244
 245 /* whether trie related optimizations are enabled */
 246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 247 #define TRIE_STUDY_OPT
 248 #define FULL_TRIE_STUDY
 249 #define TRIE_STCLASS
 250 #endif
 251
 252
 253
 254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 255 #define PBITVAL(paren) (1 << ((paren) & 7))
 256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 259
 260 /* If not already in utf8, do a longjmp back to the beginning */
 261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 262 #define REQUIRE_UTF8    STMT_START {                                       \
 263                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 264                         } STMT_END
 265
 266 /* About scan_data_t.
 267
 268   During optimisation we recurse through the regexp program performing
 269   various inplace (keyhole style) optimisations. In addition study_chunk
 270   and scan_commit populate this data structure with information about
 271   what strings MUST appear in the pattern. We look for the longest
 272   string that must appear at a fixed location, and we look for the
 273   longest string that may appear at a floating location. So for instance
 274   in the pattern:
 275
 276     /FOO[xX]A.*B[xX]BAR/
 277
 278   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 279   strings (because they follow a .* construct). study_chunk will identify
 280   both FOO and BAR as being the longest fixed and floating strings respectively.
 281
 282   The strings can be composites, for instance
 283
 284      /(f)(o)(o)/
 285
 286   will result in a composite fixed substring 'foo'.
 287
 288   For each string some basic information is maintained:
 289
 290   - offset or min_offset
 291     This is the position the string must appear at, or not before.
 292     It also implicitly (when combined with minlenp) tells us how many
 293     characters must match before the string we are searching for.
 294     Likewise when combined with minlenp and the length of the string it
 295     tells us how many characters must appear after the string we have
 296     found.
 297
 298   - max_offset
 299     Only used for floating strings. This is the rightmost point that
 300     the string can appear at. If set to I32 max it indicates that the
 301     string can occur infinitely far to the right.
 302
 303   - minlenp
 304     A pointer to the minimum number of characters of the pattern that the
 305     string was found inside. This is important as in the case of positive
 306     lookahead or positive lookbehind we can have multiple patterns
 307     involved. Consider
 308
 309     /(?=FOO).*F/
 310
 311     The minimum length of the pattern overall is 3, the minimum length
 312     of the lookahead part is 3, but the minimum length of the part that
 313     will actually match is 1. So 'FOO's minimum length is 3, but the
 314     minimum length for the F is 1. This is important as the minimum length
 315     is used to determine offsets in front of and behind the string being
 316     looked for.  Since strings can be composites this is the length of the
 317     pattern at the time it was committed with a scan_commit. Note that
 318     the length is calculated by study_chunk, so that the minimum lengths
 319     are not known until the full pattern has been compiled, thus the
 320     pointer to the value.
 321
 322   - lookbehind
 323
 324     In the case of lookbehind the string being searched for can be
 325     offset past the start point of the final matching string.
 326     If this value was just blithely removed from the min_offset it would
 327     invalidate some of the calculations for how many chars must match
 328     before or after (as they are derived from min_offset and minlen and
 329     the length of the string being searched for).
 330     When the final pattern is compiled and the data is moved from the
 331     scan_data_t structure into the regexp structure the information
 332     about lookbehind is factored in, with the information that would
 333     have been lost precalculated in the end_shift field for the
 334     associated string.
 335
 336   The fields pos_min and pos_delta are used to store the minimum offset
 337   and the delta to the maximum offset at the current point in the pattern.
 338
 339 */
 340
 341 typedef struct scan_data_t {
 342     /*I32 len_min;      unused */
 343     /*I32 len_delta;    unused */
 344     I32 pos_min;
 345     I32 pos_delta;
 346     SV *last_found;
 347     I32 last_end;           /* min value, <0 unless valid. */
 348     I32 last_start_min;
 349     I32 last_start_max;
 350     SV **longest;           /* Either &l_fixed, or &l_float. */
 351     SV *longest_fixed;      /* longest fixed string found in pattern */
 352     I32 offset_fixed;       /* offset where it starts */
 353     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 354     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 355     SV *longest_float;      /* longest floating string found in pattern */
 356     I32 offset_float_min;   /* earliest point in string it can appear */
 357     I32 offset_float_max;   /* latest point in string it can appear */
 358     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 359     I32 lookbehind_float;   /* is the position of the string modified by LB */
 360     I32 flags;
 361     I32 whilem_c;
 362     I32 *last_closep;
 363     struct regnode_charclass_class *start_class;
 364 } scan_data_t;
 365
 366 /*
 367  * Forward declarations for pregcomp()'s friends.
 368  */
 369
 370 static const scan_data_t zero_scan_data =
 371   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 372
 373 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 374 #define SF_BEFORE_SEOL          0x0001
 375 #define SF_BEFORE_MEOL          0x0002
 376 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 377 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 378
 379 #ifdef NO_UNARY_PLUS
 380 #  define SF_FIX_SHIFT_EOL      (0+2)
 381 #  define SF_FL_SHIFT_EOL               (0+4)
 382 #else
 383 #  define SF_FIX_SHIFT_EOL      (+2)
 384 #  define SF_FL_SHIFT_EOL               (+4)
 385 #endif
 386
 387 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 388 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 389
 390 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 391 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 392 #define SF_IS_INF               0x0040
 393 #define SF_HAS_PAR              0x0080
 394 #define SF_IN_PAR               0x0100
 395 #define SF_HAS_EVAL             0x0200
 396 #define SCF_DO_SUBSTR           0x0400
 397 #define SCF_DO_STCLASS_AND      0x0800
 398 #define SCF_DO_STCLASS_OR       0x1000
 399 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 400 #define SCF_WHILEM_VISITED_POS  0x2000
 401
 402 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 403 #define SCF_SEEN_ACCEPT         0x8000
 404
 405 #define UTF cBOOL(RExC_utf8)
 406
 407 /* The enums for all these are ordered so things work out correctly */
 408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 415
 416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 417
 418 #define OOB_NAMEDCLASS          -1
 419
 420 /* There is no code point that is out-of-bounds, so this is problematic.  But
 421  * its only current use is to initialize a variable that is always set before
 422  * looked at. */
 423 #define OOB_UNICODE             0xDEADBEEF
 424
 425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 427
 428
 429 /* length of regex to show in messages that don't mark a position within */
 430 #define RegexLengthToShowInErrorMessages 127
 431
 432 /*
 433  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 434  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 435  * op/pragma/warn/regcomp.
 436  */
 437 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 438 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 439
 440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 441
 442 /*
 443  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 444  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 445  * "...".
 446  */
 447 #define _FAIL(code) STMT_START {                                        \
 448     const char *ellipses = "";                                          \
 449     IV len = RExC_end - RExC_precomp;                                   \
 450                                                                         \
 451     if (!SIZE_ONLY)                                                     \
 452         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 453     if (len > RegexLengthToShowInErrorMessages) {                       \
 454         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 455         len = RegexLengthToShowInErrorMessages - 10;                    \
 456         ellipses = "...";                                               \
 457     }                                                                   \
 458     code;                                                               \
 459 } STMT_END
 460
 461 #define FAIL(msg) _FAIL(                            \
 462     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 463             msg, (int)len, RExC_precomp, ellipses))
 464
 465 #define FAIL2(msg,arg) _FAIL(                       \
 466     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 467             arg, (int)len, RExC_precomp, ellipses))
 468
 469 /*
 470  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 471  */
 472 #define Simple_vFAIL(m) STMT_START {                                    \
 473     const IV offset = RExC_parse - RExC_precomp;                        \
 474     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 475             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 476 } STMT_END
 477
 478 /*
 479  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 480  */
 481 #define vFAIL(m) STMT_START {                           \
 482     if (!SIZE_ONLY)                                     \
 483         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 484     Simple_vFAIL(m);                                    \
 485 } STMT_END
 486
 487 /*
 488  * Like Simple_vFAIL(), but accepts two arguments.
 489  */
 490 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 491     const IV offset = RExC_parse - RExC_precomp;                        \
 492     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 493             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 494 } STMT_END
 495
 496 /*
 497  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 498  */
 499 #define vFAIL2(m,a1) STMT_START {                       \
 500     if (!SIZE_ONLY)                                     \
 501         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 502     Simple_vFAIL2(m, a1);                               \
 503 } STMT_END
 504
 505
 506 /*
 507  * Like Simple_vFAIL(), but accepts three arguments.
 508  */
 509 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 510     const IV offset = RExC_parse - RExC_precomp;                \
 511     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 512             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 513 } STMT_END
 514
 515 /*
 516  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 517  */
 518 #define vFAIL3(m,a1,a2) STMT_START {                    \
 519     if (!SIZE_ONLY)                                     \
 520         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 521     Simple_vFAIL3(m, a1, a2);                           \
 522 } STMT_END
 523
 524 /*
 525  * Like Simple_vFAIL(), but accepts four arguments.
 526  */
 527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 528     const IV offset = RExC_parse - RExC_precomp;                \
 529     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 530             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 531 } STMT_END
 532
 533 #define ckWARNreg(loc,m) STMT_START {                                   \
 534     const IV offset = loc - RExC_precomp;                               \
 535     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 536             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 537 } STMT_END
 538
 539 #define ckWARNregdep(loc,m) STMT_START {                                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 542             m REPORT_LOCATION,                                          \
 543             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 544 } STMT_END
 545
 546 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 547     const IV offset = loc - RExC_precomp;                               \
 548     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 549             m REPORT_LOCATION,                                          \
 550             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 551 } STMT_END
 552
 553 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 556             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 557 } STMT_END
 558
 559 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 560     const IV offset = loc - RExC_precomp;                               \
 561     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 562             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 563 } STMT_END
 564
 565 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 566     const IV offset = loc - RExC_precomp;                               \
 567     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 568             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 569 } STMT_END
 570
 571 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 574             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 575 } STMT_END
 576
 577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 578     const IV offset = loc - RExC_precomp;                               \
 579     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 580             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 581 } STMT_END
 582
 583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 584     const IV offset = loc - RExC_precomp;                               \
 585     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 586             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 587 } STMT_END
 588
 589
 590 /* Allow for side effects in s */
 591 #define REGC(c,s) STMT_START {                  \
 592     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 593 } STMT_END
 594
 595 /* Macros for recording node offsets.   20001227 mjd@plover.com
 596  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 597  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 598  * Element 0 holds the number n.
 599  * Position is 1 indexed.
 600  */
 601 #ifndef RE_TRACK_PATTERN_OFFSETS
 602 #define Set_Node_Offset_To_R(node,byte)
 603 #define Set_Node_Offset(node,byte)
 604 #define Set_Cur_Node_Offset
 605 #define Set_Node_Length_To_R(node,len)
 606 #define Set_Node_Length(node,len)
 607 #define Set_Node_Cur_Length(node)
 608 #define Node_Offset(n)
 609 #define Node_Length(n)
 610 #define Set_Node_Offset_Length(node,offset,len)
 611 #define ProgLen(ri) ri->u.proglen
 612 #define SetProgLen(ri,x) ri->u.proglen = x
 613 #else
 614 #define ProgLen(ri) ri->u.offsets[0]
 615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 616 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 617     if (! SIZE_ONLY) {                                                  \
 618         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 619                     __LINE__, (int)(node), (int)(byte)));               \
 620         if((node) < 0) {                                                \
 621             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 622         } else {                                                        \
 623             RExC_offsets[2*(node)-1] = (byte);                          \
 624         }                                                               \
 625     }                                                                   \
 626 } STMT_END
 627
 628 #define Set_Node_Offset(node,byte) \
 629     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 631
 632 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 633     if (! SIZE_ONLY) {                                                  \
 634         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 635                 __LINE__, (int)(node), (int)(len)));                    \
 636         if((node) < 0) {                                                \
 637             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 638         } else {                                                        \
 639             RExC_offsets[2*(node)] = (len);                             \
 640         }                                                               \
 641     }                                                                   \
 642 } STMT_END
 643
 644 #define Set_Node_Length(node,len) \
 645     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 647 #define Set_Node_Cur_Length(node) \
 648     Set_Node_Length(node, RExC_parse - parse_start)
 649
 650 /* Get offsets and lengths */
 651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 653
 654 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 655     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 656     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 657 } STMT_END
 658 #endif
 659
 660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 661 #define EXPERIMENTAL_INPLACESCAN
 662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 663
 664 #define DEBUG_STUDYDATA(str,data,depth)                              \
 665 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 666     PerlIO_printf(Perl_debug_log,                                    \
 667         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 668         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 669         (int)(depth)*2, "",                                          \
 670         (IV)((data)->pos_min),                                       \
 671         (IV)((data)->pos_delta),                                     \
 672         (UV)((data)->flags),                                         \
 673         (IV)((data)->whilem_c),                                      \
 674         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 675         is_inf ? "INF " : ""                                         \
 676     );                                                               \
 677     if ((data)->last_found)                                          \
 678         PerlIO_printf(Perl_debug_log,                                \
 679             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 680             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 681             SvPVX_const((data)->last_found),                         \
 682             (IV)((data)->last_end),                                  \
 683             (IV)((data)->last_start_min),                            \
 684             (IV)((data)->last_start_max),                            \
 685             ((data)->longest &&                                      \
 686              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 687             SvPVX_const((data)->longest_fixed),                      \
 688             (IV)((data)->offset_fixed),                              \
 689             ((data)->longest &&                                      \
 690              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 691             SvPVX_const((data)->longest_float),                      \
 692             (IV)((data)->offset_float_min),                          \
 693             (IV)((data)->offset_float_max)                           \
 694         );                                                           \
 695     PerlIO_printf(Perl_debug_log,"\n");                              \
 696 });
 697
 698 static void clear_re(pTHX_ void *r);
 699
 700 /* Mark that we cannot extend a found fixed substring at this point.
 701    Update the longest found anchored substring and the longest found
 702    floating substrings if needed. */
 703
 704 STATIC void
 705 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 706 {
 707     const STRLEN l = CHR_SVLEN(data->last_found);
 708     const STRLEN old_l = CHR_SVLEN(*data->longest);
 709     GET_RE_DEBUG_FLAGS_DECL;
 710
 711     PERL_ARGS_ASSERT_SCAN_COMMIT;
 712
 713     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 714         SvSetMagicSV(*data->longest, data->last_found);
 715         if (*data->longest == data->longest_fixed) {
 716             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 717             if (data->flags & SF_BEFORE_EOL)
 718                 data->flags
 719                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 720             else
 721                 data->flags &= ~SF_FIX_BEFORE_EOL;
 722             data->minlen_fixed=minlenp;
 723             data->lookbehind_fixed=0;
 724         }
 725         else { /* *data->longest == data->longest_float */
 726             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 727             data->offset_float_max = (l
 728                                       ? data->last_start_max
 729                                       : data->pos_min + data->pos_delta);
 730             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 731                 data->offset_float_max = I32_MAX;
 732             if (data->flags & SF_BEFORE_EOL)
 733                 data->flags
 734                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 735             else
 736                 data->flags &= ~SF_FL_BEFORE_EOL;
 737             data->minlen_float=minlenp;
 738             data->lookbehind_float=0;
 739         }
 740     }
 741     SvCUR_set(data->last_found, 0);
 742     {
 743         SV * const sv = data->last_found;
 744         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 745             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 746             if (mg)
 747                 mg->mg_len = 0;
 748         }
 749     }
 750     data->last_end = -1;
 751     data->flags &= ~SF_BEFORE_EOL;
 752     DEBUG_STUDYDATA("commit: ",data,0);
 753 }
 754
 755 /* Can match anything (initialization) */
 756 STATIC void
 757 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 758 {
 759     PERL_ARGS_ASSERT_CL_ANYTHING;
 760
 761     ANYOF_BITMAP_SETALL(cl);
 762     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 763                 |ANYOF_NON_UTF8_LATIN1_ALL;
 764
 765     /* If any portion of the regex is to operate under locale rules,
 766      * initialization includes it.  The reason this isn't done for all regexes
 767      * is that the optimizer was written under the assumption that locale was
 768      * all-or-nothing.  Given the complexity and lack of documentation in the
 769      * optimizer, and that there are inadequate test cases for locale, so many
 770      * parts of it may not work properly, it is safest to avoid locale unless
 771      * necessary. */
 772     if (RExC_contains_locale) {
 773         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 774         cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
 775     }
 776     else {
 777         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 778     }
 779 }
 780
 781 /* Can match anything (initialization) */
 782 STATIC int
 783 S_cl_is_anything(const struct regnode_charclass_class *cl)
 784 {
 785     int value;
 786
 787     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 788
 789     for (value = 0; value <= ANYOF_MAX; value += 2)
 790         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 791             return 1;
 792     if (!(cl->flags & ANYOF_UNICODE_ALL))
 793         return 0;
 794     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 795         return 0;
 796     return 1;
 797 }
 798
 799 /* Can match anything (initialization) */
 800 STATIC void
 801 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 802 {
 803     PERL_ARGS_ASSERT_CL_INIT;
 804
 805     Zero(cl, 1, struct regnode_charclass_class);
 806     cl->type = ANYOF;
 807     cl_anything(pRExC_state, cl);
 808     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 809 }
 810
 811 /* These two functions currently do the exact same thing */
 812 #define cl_init_zero            S_cl_init
 813
 814 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 815  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 816  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 817 STATIC void
 818 S_cl_and(struct regnode_charclass_class *cl,
 819         const struct regnode_charclass_class *and_with)
 820 {
 821     PERL_ARGS_ASSERT_CL_AND;
 822
 823     assert(and_with->type == ANYOF);
 824
 825     /* I (khw) am not sure all these restrictions are necessary XXX */
 826     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 827         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 828         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 829         && !(and_with->flags & ANYOF_LOC_FOLD)
 830         && !(cl->flags & ANYOF_LOC_FOLD)) {
 831         int i;
 832
 833         if (and_with->flags & ANYOF_INVERT)
 834             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 835                 cl->bitmap[i] &= ~and_with->bitmap[i];
 836         else
 837             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 838                 cl->bitmap[i] &= and_with->bitmap[i];
 839     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 840
 841     if (and_with->flags & ANYOF_INVERT) {
 842
 843         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 844          * aren't affected by the inversion.  Those that are affected are
 845          * handled individually below */
 846         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 847         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 848         cl->flags |= affected_flags;
 849
 850         /* We currently don't know how to deal with things that aren't in the
 851          * bitmap, but we know that the intersection is no greater than what
 852          * is already in cl, so let there be false positives that get sorted
 853          * out after the synthetic start class succeeds, and the node is
 854          * matched for real. */
 855
 856         /* The inversion of these two flags indicate that the resulting
 857          * intersection doesn't have them */
 858         if (and_with->flags & ANYOF_UNICODE_ALL) {
 859             cl->flags &= ~ANYOF_UNICODE_ALL;
 860         }
 861         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 862             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 863         }
 864     }
 865     else {   /* and'd node is not inverted */
 866         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 867
 868         if (! ANYOF_NONBITMAP(and_with)) {
 869
 870             /* Here 'and_with' doesn't match anything outside the bitmap
 871              * (except possibly ANYOF_UNICODE_ALL), which means the
 872              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 873              * which case we don't know what the intersection is, but it's no
 874              * greater than what cl already has, so can just leave it alone,
 875              * with possible false positives */
 876             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 877                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 878                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 879             }
 880         }
 881         else if (! ANYOF_NONBITMAP(cl)) {
 882
 883             /* Here, 'and_with' does match something outside the bitmap, and cl
 884              * doesn't have a list of things to match outside the bitmap.  If
 885              * cl can match all code points above 255, the intersection will
 886              * be those above-255 code points that 'and_with' matches.  If cl
 887              * can't match all Unicode code points, it means that it can't
 888              * match anything outside the bitmap (since the 'if' that got us
 889              * into this block tested for that), so we leave the bitmap empty.
 890              */
 891             if (cl->flags & ANYOF_UNICODE_ALL) {
 892                 ARG_SET(cl, ARG(and_with));
 893
 894                 /* and_with's ARG may match things that don't require UTF8.
 895                  * And now cl's will too, in spite of this being an 'and'.  See
 896                  * the comments below about the kludge */
 897                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 898             }
 899         }
 900         else {
 901             /* Here, both 'and_with' and cl match something outside the
 902              * bitmap.  Currently we do not do the intersection, so just match
 903              * whatever cl had at the beginning.  */
 904         }
 905
 906
 907         /* Take the intersection of the two sets of flags.  However, the
 908          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 909          * kludge around the fact that this flag is not treated like the others
 910          * which are initialized in cl_anything().  The way the optimizer works
 911          * is that the synthetic start class (SSC) is initialized to match
 912          * anything, and then the first time a real node is encountered, its
 913          * values are AND'd with the SSC's with the result being the values of
 914          * the real node.  However, there are paths through the optimizer where
 915          * the AND never gets called, so those initialized bits are set
 916          * inappropriately, which is not usually a big deal, as they just cause
 917          * false positives in the SSC, which will just mean a probably
 918          * imperceptible slow down in execution.  However this bit has a
 919          * higher false positive consequence in that it can cause utf8.pm,
 920          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 921          * bigger slowdown and also causes significant extra memory to be used.
 922          * In order to prevent this, the code now takes a different tack.  The
 923          * bit isn't set unless some part of the regular expression needs it,
 924          * but once set it won't get cleared.  This means that these extra
 925          * modules won't get loaded unless there was some path through the
 926          * pattern that would have required them anyway, and  so any false
 927          * positives that occur by not ANDing them out when they could be
 928          * aren't as severe as they would be if we treated this bit like all
 929          * the others */
 930         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 931                                       & ANYOF_NONBITMAP_NON_UTF8;
 932         cl->flags &= and_with->flags;
 933         cl->flags |= outside_bitmap_but_not_utf8;
 934     }
 935 }
 936
 937 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 938  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 939  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 940 STATIC void
 941 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 942 {
 943     PERL_ARGS_ASSERT_CL_OR;
 944
 945     if (or_with->flags & ANYOF_INVERT) {
 946
 947         /* Here, the or'd node is to be inverted.  This means we take the
 948          * complement of everything not in the bitmap, but currently we don't
 949          * know what that is, so give up and match anything */
 950         if (ANYOF_NONBITMAP(or_with)) {
 951             cl_anything(pRExC_state, cl);
 952         }
 953         /* We do not use
 954          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 955          *   <= (B1 | !B2) | (CL1 | !CL2)
 956          * which is wasteful if CL2 is small, but we ignore CL2:
 957          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 958          * XXXX Can we handle case-fold?  Unclear:
 959          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 960          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 961          */
 962         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 963              && !(or_with->flags & ANYOF_LOC_FOLD)
 964              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 965             int i;
 966
 967             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 968                 cl->bitmap[i] |= ~or_with->bitmap[i];
 969         } /* XXXX: logic is complicated otherwise */
 970         else {
 971             cl_anything(pRExC_state, cl);
 972         }
 973
 974         /* And, we can just take the union of the flags that aren't affected
 975          * by the inversion */
 976         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 977
 978         /* For the remaining flags:
 979             ANYOF_UNICODE_ALL and inverted means to not match anything above
 980                     255, which means that the union with cl should just be
 981                     what cl has in it, so can ignore this flag
 982             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 983                     is 127-255 to match them, but then invert that, so the
 984                     union with cl should just be what cl has in it, so can
 985                     ignore this flag
 986          */
 987     } else {    /* 'or_with' is not inverted */
 988         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 989         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 990              && (!(or_with->flags & ANYOF_LOC_FOLD)
 991                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
 992             int i;
 993
 994             /* OR char bitmap and class bitmap separately */
 995             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 996                 cl->bitmap[i] |= or_with->bitmap[i];
 997             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 998                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 999                     cl->classflags[i] |= or_with->classflags[i];
1000                 cl->flags |= ANYOF_CLASS;
1001             }
1002         }
1003         else { /* XXXX: logic is complicated, leave it along for a moment. */
1004             cl_anything(pRExC_state, cl);
1005         }
1006
1007         if (ANYOF_NONBITMAP(or_with)) {
1008
1009             /* Use the added node's outside-the-bit-map match if there isn't a
1010              * conflict.  If there is a conflict (both nodes match something
1011              * outside the bitmap, but what they match outside is not the same
1012              * pointer, and hence not easily compared until XXX we extend
1013              * inversion lists this far), give up and allow the start class to
1014              * match everything outside the bitmap.  If that stuff is all above
1015              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1016             if (! ANYOF_NONBITMAP(cl)) {
1017                 ARG_SET(cl, ARG(or_with));
1018             }
1019             else if (ARG(cl) != ARG(or_with)) {
1020
1021                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1022                     cl_anything(pRExC_state, cl);
1023                 }
1024                 else {
1025                     cl->flags |= ANYOF_UNICODE_ALL;
1026                 }
1027             }
1028         }
1029
1030         /* Take the union */
1031         cl->flags |= or_with->flags;
1032     }
1033 }
1034
1035 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1036 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1037 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1038 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1039
1040
1041 #ifdef DEBUGGING
1042 /*
1043    dump_trie(trie,widecharmap,revcharmap)
1044    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1045    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1046
1047    These routines dump out a trie in a somewhat readable format.
1048    The _interim_ variants are used for debugging the interim
1049    tables that are used to generate the final compressed
1050    representation which is what dump_trie expects.
1051
1052    Part of the reason for their existence is to provide a form
1053    of documentation as to how the different representations function.
1054
1055 */
1056
1057 /*
1058   Dumps the final compressed table form of the trie to Perl_debug_log.
1059   Used for debugging make_trie().
1060 */
1061
1062 STATIC void
1063 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1064             AV *revcharmap, U32 depth)
1065 {
1066     U32 state;
1067     SV *sv=sv_newmortal();
1068     int colwidth= widecharmap ? 6 : 4;
1069     U16 word;
1070     GET_RE_DEBUG_FLAGS_DECL;
1071
1072     PERL_ARGS_ASSERT_DUMP_TRIE;
1073
1074     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1075         (int)depth * 2 + 2,"",
1076         "Match","Base","Ofs" );
1077
1078     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1079         SV ** const tmp = av_fetch( revcharmap, state, 0);
1080         if ( tmp ) {
1081             PerlIO_printf( Perl_debug_log, "%*s",
1082                 colwidth,
1083                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1084                             PL_colors[0], PL_colors[1],
1085                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1086                             PERL_PV_ESCAPE_FIRSTCHAR
1087                 )
1088             );
1089         }
1090     }
1091     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1092         (int)depth * 2 + 2,"");
1093
1094     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1095         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1096     PerlIO_printf( Perl_debug_log, "\n");
1097
1098     for( state = 1 ; state < trie->statecount ; state++ ) {
1099         const U32 base = trie->states[ state ].trans.base;
1100
1101         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1102
1103         if ( trie->states[ state ].wordnum ) {
1104             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1105         } else {
1106             PerlIO_printf( Perl_debug_log, "%6s", "" );
1107         }
1108
1109         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1110
1111         if ( base ) {
1112             U32 ofs = 0;
1113
1114             while( ( base + ofs  < trie->uniquecharcount ) ||
1115                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1116                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1117                     ofs++;
1118
1119             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1120
1121             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1122                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1123                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1124                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1125                 {
1126                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1127                     colwidth,
1128                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1129                 } else {
1130                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1131                 }
1132             }
1133
1134             PerlIO_printf( Perl_debug_log, "]");
1135
1136         }
1137         PerlIO_printf( Perl_debug_log, "\n" );
1138     }
1139     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1140     for (word=1; word <= trie->wordcount; word++) {
1141         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1142             (int)word, (int)(trie->wordinfo[word].prev),
1143             (int)(trie->wordinfo[word].len));
1144     }
1145     PerlIO_printf(Perl_debug_log, "\n" );
1146 }
1147 /*
1148   Dumps a fully constructed but uncompressed trie in list form.
1149   List tries normally only are used for construction when the number of
1150   possible chars (trie->uniquecharcount) is very high.
1151   Used for debugging make_trie().
1152 */
1153 STATIC void
1154 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1155                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1156                          U32 depth)
1157 {
1158     U32 state;
1159     SV *sv=sv_newmortal();
1160     int colwidth= widecharmap ? 6 : 4;
1161     GET_RE_DEBUG_FLAGS_DECL;
1162
1163     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1164
1165     /* print out the table precompression.  */
1166     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1167         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1168         "------:-----+-----------------\n" );
1169
1170     for( state=1 ; state < next_alloc ; state ++ ) {
1171         U16 charid;
1172
1173         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1174             (int)depth * 2 + 2,"", (UV)state  );
1175         if ( ! trie->states[ state ].wordnum ) {
1176             PerlIO_printf( Perl_debug_log, "%5s| ","");
1177         } else {
1178             PerlIO_printf( Perl_debug_log, "W%4x| ",
1179                 trie->states[ state ].wordnum
1180             );
1181         }
1182         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1183             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1184             if ( tmp ) {
1185                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1186                     colwidth,
1187                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1188                             PL_colors[0], PL_colors[1],
1189                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1190                             PERL_PV_ESCAPE_FIRSTCHAR
1191                     ) ,
1192                     TRIE_LIST_ITEM(state,charid).forid,
1193                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1194                 );
1195                 if (!(charid % 10))
1196                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1197                         (int)((depth * 2) + 14), "");
1198             }
1199         }
1200         PerlIO_printf( Perl_debug_log, "\n");
1201     }
1202 }
1203
1204 /*
1205   Dumps a fully constructed but uncompressed trie in table form.
1206   This is the normal DFA style state transition table, with a few
1207   twists to facilitate compression later.
1208   Used for debugging make_trie().
1209 */
1210 STATIC void
1211 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1212                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1213                           U32 depth)
1214 {
1215     U32 state;
1216     U16 charid;
1217     SV *sv=sv_newmortal();
1218     int colwidth= widecharmap ? 6 : 4;
1219     GET_RE_DEBUG_FLAGS_DECL;
1220
1221     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1222
1223     /*
1224        print out the table precompression so that we can do a visual check
1225        that they are identical.
1226      */
1227
1228     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1229
1230     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1231         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1232         if ( tmp ) {
1233             PerlIO_printf( Perl_debug_log, "%*s",
1234                 colwidth,
1235                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1236                             PL_colors[0], PL_colors[1],
1237                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1238                             PERL_PV_ESCAPE_FIRSTCHAR
1239                 )
1240             );
1241         }
1242     }
1243
1244     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1245
1246     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1247         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1248     }
1249
1250     PerlIO_printf( Perl_debug_log, "\n" );
1251
1252     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1253
1254         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1255             (int)depth * 2 + 2,"",
1256             (UV)TRIE_NODENUM( state ) );
1257
1258         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1260             if (v)
1261                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1262             else
1263                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1264         }
1265         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1266             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1267         } else {
1268             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1269             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1270         }
1271     }
1272 }
1273
1274 #endif
1275
1276
1277 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1278   startbranch: the first branch in the whole branch sequence
1279   first      : start branch of sequence of branch-exact nodes.
1280                May be the same as startbranch
1281   last       : Thing following the last branch.
1282                May be the same as tail.
1283   tail       : item following the branch sequence
1284   count      : words in the sequence
1285   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1286   depth      : indent depth
1287
1288 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1289
1290 A trie is an N'ary tree where the branches are determined by digital
1291 decomposition of the key. IE, at the root node you look up the 1st character and
1292 follow that branch repeat until you find the end of the branches. Nodes can be
1293 marked as "accepting" meaning they represent a complete word. Eg:
1294
1295   /he|she|his|hers/
1296
1297 would convert into the following structure. Numbers represent states, letters
1298 following numbers represent valid transitions on the letter from that state, if
1299 the number is in square brackets it represents an accepting state, otherwise it
1300 will be in parenthesis.
1301
1302       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1303       |    |
1304       |   (2)
1305       |    |
1306      (1)   +-i->(6)-+-s->[7]
1307       |
1308       +-s->(3)-+-h->(4)-+-e->[5]
1309
1310       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1311
1312 This shows that when matching against the string 'hers' we will begin at state 1
1313 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1314 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1315 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1316 single traverse. We store a mapping from accepting to state to which word was
1317 matched, and then when we have multiple possibilities we try to complete the
1318 rest of the regex in the order in which they occured in the alternation.
1319
1320 The only prior NFA like behaviour that would be changed by the TRIE support is
1321 the silent ignoring of duplicate alternations which are of the form:
1322
1323  / (DUPE|DUPE) X? (?{ ... }) Y /x
1324
1325 Thus EVAL blocks following a trie may be called a different number of times with
1326 and without the optimisation. With the optimisations dupes will be silently
1327 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1328 the following demonstrates:
1329
1330  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1331
1332 which prints out 'word' three times, but
1333
1334  'words'=~/(word|word|word)(?{ print $1 })S/
1335
1336 which doesnt print it out at all. This is due to other optimisations kicking in.
1337
1338 Example of what happens on a structural level:
1339
1340 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1341
1342    1: CURLYM[1] {1,32767}(18)
1343    5:   BRANCH(8)
1344    6:     EXACT <ac>(16)
1345    8:   BRANCH(11)
1346    9:     EXACT <ad>(16)
1347   11:   BRANCH(14)
1348   12:     EXACT <ab>(16)
1349   16:   SUCCEED(0)
1350   17:   NOTHING(18)
1351   18: END(0)
1352
1353 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1354 and should turn into:
1355
1356    1: CURLYM[1] {1,32767}(18)
1357    5:   TRIE(16)
1358         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1359           <ac>
1360           <ad>
1361           <ab>
1362   16:   SUCCEED(0)
1363   17:   NOTHING(18)
1364   18: END(0)
1365
1366 Cases where tail != last would be like /(?foo|bar)baz/:
1367
1368    1: BRANCH(4)
1369    2:   EXACT <foo>(8)
1370    4: BRANCH(7)
1371    5:   EXACT <bar>(8)
1372    7: TAIL(8)
1373    8: EXACT <baz>(10)
1374   10: END(0)
1375
1376 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1377 and would end up looking like:
1378
1379     1: TRIE(8)
1380       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1381         <foo>
1382         <bar>
1383    7: TAIL(8)
1384    8: EXACT <baz>(10)
1385   10: END(0)
1386
1387     d = uvuni_to_utf8_flags(d, uv, 0);
1388
1389 is the recommended Unicode-aware way of saying
1390
1391     *(d++) = uv;
1392 */
1393
1394 #define TRIE_STORE_REVCHAR(val)                                            \
1395     STMT_START {                                                           \
1396         if (UTF) {                                                         \
1397             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1398             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1399             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1400             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1401             SvPOK_on(zlopp);                                               \
1402             SvUTF8_on(zlopp);                                              \
1403             av_push(revcharmap, zlopp);                                    \
1404         } else {                                                           \
1405             char ooooff = (char)val;                                           \
1406             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1407         }                                                                  \
1408         } STMT_END
1409
1410 #define TRIE_READ_CHAR STMT_START {                                                     \
1411     wordlen++;                                                                          \
1412     if ( UTF ) {                                                                        \
1413         /* if it is UTF then it is either already folded, or does not need folding */   \
1414         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1415     }                                                                                   \
1416     else if (folder == PL_fold_latin1) {                                                \
1417         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1418         if ( foldlen > 0 ) {                                                            \
1419            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1420            foldlen -= len;                                                              \
1421            scan += len;                                                                 \
1422            len = 0;                                                                     \
1423         } else {                                                                        \
1424             len = 1;                                                                    \
1425             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1426             skiplen = UNISKIP(uvc);                                                     \
1427             foldlen -= skiplen;                                                         \
1428             scan = foldbuf + skiplen;                                                   \
1429         }                                                                               \
1430     } else {                                                                            \
1431         /* raw data, will be folded later if needed */                                  \
1432         uvc = (U32)*uc;                                                                 \
1433         len = 1;                                                                        \
1434     }                                                                                   \
1435 } STMT_END
1436
1437
1438
1439 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1440     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1441         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1442         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1443     }                                                           \
1444     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1445     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1446     TRIE_LIST_CUR( state )++;                                   \
1447 } STMT_END
1448
1449 #define TRIE_LIST_NEW(state) STMT_START {                       \
1450     Newxz( trie->states[ state ].trans.list,               \
1451         4, reg_trie_trans_le );                                 \
1452      TRIE_LIST_CUR( state ) = 1;                                \
1453      TRIE_LIST_LEN( state ) = 4;                                \
1454 } STMT_END
1455
1456 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1457     U16 dupe= trie->states[ state ].wordnum;                    \
1458     regnode * const noper_next = regnext( noper );              \
1459                                                                 \
1460     DEBUG_r({                                                   \
1461         /* store the word for dumping */                        \
1462         SV* tmp;                                                \
1463         if (OP(noper) != NOTHING)                               \
1464             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1465         else                                                    \
1466             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1467         av_push( trie_words, tmp );                             \
1468     });                                                         \
1469                                                                 \
1470     curword++;                                                  \
1471     trie->wordinfo[curword].prev   = 0;                         \
1472     trie->wordinfo[curword].len    = wordlen;                   \
1473     trie->wordinfo[curword].accept = state;                     \
1474                                                                 \
1475     if ( noper_next < tail ) {                                  \
1476         if (!trie->jump)                                        \
1477             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1478         trie->jump[curword] = (U16)(noper_next - convert);      \
1479         if (!jumper)                                            \
1480             jumper = noper_next;                                \
1481         if (!nextbranch)                                        \
1482             nextbranch= regnext(cur);                           \
1483     }                                                           \
1484                                                                 \
1485     if ( dupe ) {                                               \
1486         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1487         /* chain, so that when the bits of chain are later    */\
1488         /* linked together, the dups appear in the chain      */\
1489         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1490         trie->wordinfo[dupe].prev = curword;                    \
1491     } else {                                                    \
1492         /* we haven't inserted this word yet.                */ \
1493         trie->states[ state ].wordnum = curword;                \
1494     }                                                           \
1495 } STMT_END
1496
1497
1498 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1499      ( ( base + charid >=  ucharcount                                   \
1500          && base + charid < ubound                                      \
1501          && state == trie->trans[ base - ucharcount + charid ].check    \
1502          && trie->trans[ base - ucharcount + charid ].next )            \
1503            ? trie->trans[ base - ucharcount + charid ].next             \
1504            : ( state==1 ? special : 0 )                                 \
1505       )
1506
1507 #define MADE_TRIE       1
1508 #define MADE_JUMP_TRIE  2
1509 #define MADE_EXACT_TRIE 4
1510
1511 STATIC I32
1512 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1513 {
1514     dVAR;
1515     /* first pass, loop through and scan words */
1516     reg_trie_data *trie;
1517     HV *widecharmap = NULL;
1518     AV *revcharmap = newAV();
1519     regnode *cur;
1520     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1521     STRLEN len = 0;
1522     UV uvc = 0;
1523     U16 curword = 0;
1524     U32 next_alloc = 0;
1525     regnode *jumper = NULL;
1526     regnode *nextbranch = NULL;
1527     regnode *convert = NULL;
1528     U32 *prev_states; /* temp array mapping each state to previous one */
1529     /* we just use folder as a flag in utf8 */
1530     const U8 * folder = NULL;
1531
1532 #ifdef DEBUGGING
1533     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1534     AV *trie_words = NULL;
1535     /* along with revcharmap, this only used during construction but both are
1536      * useful during debugging so we store them in the struct when debugging.
1537      */
1538 #else
1539     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1540     STRLEN trie_charcount=0;
1541 #endif
1542     SV *re_trie_maxbuff;
1543     GET_RE_DEBUG_FLAGS_DECL;
1544
1545     PERL_ARGS_ASSERT_MAKE_TRIE;
1546 #ifndef DEBUGGING
1547     PERL_UNUSED_ARG(depth);
1548 #endif
1549
1550     switch (flags) {
1551         case EXACT: break;
1552         case EXACTFA:
1553         case EXACTFU_SS:
1554         case EXACTFU_TRICKYFOLD:
1555         case EXACTFU: folder = PL_fold_latin1; break;
1556         case EXACTF:  folder = PL_fold; break;
1557         case EXACTFL: folder = PL_fold_locale; break;
1558         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1559     }
1560
1561     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1562     trie->refcount = 1;
1563     trie->startstate = 1;
1564     trie->wordcount = word_count;
1565     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1566     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1567     if (flags == EXACT)
1568         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1569     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1570                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1571
1572     DEBUG_r({
1573         trie_words = newAV();
1574     });
1575
1576     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1577     if (!SvIOK(re_trie_maxbuff)) {
1578         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1579     }
1580     DEBUG_TRIE_COMPILE_r({
1581                 PerlIO_printf( Perl_debug_log,
1582                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1583                   (int)depth * 2 + 2, "",
1584                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1585                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1586                   (int)depth);
1587     });
1588
1589    /* Find the node we are going to overwrite */
1590     if ( first == startbranch && OP( last ) != BRANCH ) {
1591         /* whole branch chain */
1592         convert = first;
1593     } else {
1594         /* branch sub-chain */
1595         convert = NEXTOPER( first );
1596     }
1597
1598     /*  -- First loop and Setup --
1599
1600        We first traverse the branches and scan each word to determine if it
1601        contains widechars, and how many unique chars there are, this is
1602        important as we have to build a table with at least as many columns as we
1603        have unique chars.
1604
1605        We use an array of integers to represent the character codes 0..255
1606        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1607        native representation of the character value as the key and IV's for the
1608        coded index.
1609
1610        *TODO* If we keep track of how many times each character is used we can
1611        remap the columns so that the table compression later on is more
1612        efficient in terms of memory by ensuring the most common value is in the
1613        middle and the least common are on the outside.  IMO this would be better
1614        than a most to least common mapping as theres a decent chance the most
1615        common letter will share a node with the least common, meaning the node
1616        will not be compressible. With a middle is most common approach the worst
1617        case is when we have the least common nodes twice.
1618
1619      */
1620
1621     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622         regnode *noper = NEXTOPER( cur );
1623         const U8 *uc = (U8*)STRING( noper );
1624         const U8 *e  = uc + STR_LEN( noper );
1625         STRLEN foldlen = 0;
1626         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1627         STRLEN skiplen = 0;
1628         const U8 *scan = (U8*)NULL;
1629         U32 wordlen      = 0;         /* required init */
1630         STRLEN chars = 0;
1631         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1632
1633         if (OP(noper) == NOTHING) {
1634             regnode *noper_next= regnext(noper);
1635             if (noper_next != tail && OP(noper_next) == flags) {
1636                 noper = noper_next;
1637                 uc= (U8*)STRING(noper);
1638                 e= uc + STR_LEN(noper);
1639                 trie->minlen= STR_LEN(noper);
1640             } else {
1641                 trie->minlen= 0;
1642                 continue;
1643             }
1644         }
1645
1646         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1647             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1648                                           regardless of encoding */
1649             if (OP( noper ) == EXACTFU_SS) {
1650                 /* false positives are ok, so just set this */
1651                 TRIE_BITMAP_SET(trie,0xDF);
1652             }
1653         }
1654         for ( ; uc < e ; uc += len ) {
1655             TRIE_CHARCOUNT(trie)++;
1656             TRIE_READ_CHAR;
1657             chars++;
1658             if ( uvc < 256 ) {
1659                 if ( folder ) {
1660                     U8 folded= folder[ (U8) uvc ];
1661                     if ( !trie->charmap[ folded ] ) {
1662                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1663                         TRIE_STORE_REVCHAR( folded );
1664                     }
1665                 }
1666                 if ( !trie->charmap[ uvc ] ) {
1667                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1668                     TRIE_STORE_REVCHAR( uvc );
1669                 }
1670                 if ( set_bit ) {
1671                     /* store the codepoint in the bitmap, and its folded
1672                      * equivalent. */
1673                     TRIE_BITMAP_SET(trie, uvc);
1674
1675                     /* store the folded codepoint */
1676                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1677
1678                     if ( !UTF ) {
1679                         /* store first byte of utf8 representation of
1680                            variant codepoints */
1681                         if (! UNI_IS_INVARIANT(uvc)) {
1682                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1683                         }
1684                     }
1685                     set_bit = 0; /* We've done our bit :-) */
1686                 }
1687             } else {
1688                 SV** svpp;
1689                 if ( !widecharmap )
1690                     widecharmap = newHV();
1691
1692                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1693
1694                 if ( !svpp )
1695                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1696
1697                 if ( !SvTRUE( *svpp ) ) {
1698                     sv_setiv( *svpp, ++trie->uniquecharcount );
1699                     TRIE_STORE_REVCHAR(uvc);
1700                 }
1701             }
1702         }
1703         if( cur == first ) {
1704             trie->minlen = chars;
1705             trie->maxlen = chars;
1706         } else if (chars < trie->minlen) {
1707             trie->minlen = chars;
1708         } else if (chars > trie->maxlen) {
1709             trie->maxlen = chars;
1710         }
1711         if (OP( noper ) == EXACTFU_SS) {
1712             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1713             if (trie->minlen > 1)
1714                 trie->minlen= 1;
1715         }
1716         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1717             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1718              *                - We assume that any such sequence might match a 2 byte string */
1719             if (trie->minlen > 2 )
1720                 trie->minlen= 2;
1721         }
1722
1723     } /* end first pass */
1724     DEBUG_TRIE_COMPILE_r(
1725         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1726                 (int)depth * 2 + 2,"",
1727                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1728                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1729                 (int)trie->minlen, (int)trie->maxlen )
1730     );
1731
1732     /*
1733         We now know what we are dealing with in terms of unique chars and
1734         string sizes so we can calculate how much memory a naive
1735         representation using a flat table  will take. If it's over a reasonable
1736         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1737         conservative but potentially much slower representation using an array
1738         of lists.
1739
1740         At the end we convert both representations into the same compressed
1741         form that will be used in regexec.c for matching with. The latter
1742         is a form that cannot be used to construct with but has memory
1743         properties similar to the list form and access properties similar
1744         to the table form making it both suitable for fast searches and
1745         small enough that its feasable to store for the duration of a program.
1746
1747         See the comment in the code where the compressed table is produced
1748         inplace from the flat tabe representation for an explanation of how
1749         the compression works.
1750
1751     */
1752
1753
1754     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1755     prev_states[1] = 0;
1756
1757     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1758         /*
1759             Second Pass -- Array Of Lists Representation
1760
1761             Each state will be represented by a list of charid:state records
1762             (reg_trie_trans_le) the first such element holds the CUR and LEN
1763             points of the allocated array. (See defines above).
1764
1765             We build the initial structure using the lists, and then convert
1766             it into the compressed table form which allows faster lookups
1767             (but cant be modified once converted).
1768         */
1769
1770         STRLEN transcount = 1;
1771
1772         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1773             "%*sCompiling trie using list compiler\n",
1774             (int)depth * 2 + 2, ""));
1775
1776         trie->states = (reg_trie_state *)
1777             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1778                                   sizeof(reg_trie_state) );
1779         TRIE_LIST_NEW(1);
1780         next_alloc = 2;
1781
1782         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1783
1784             regnode *noper   = NEXTOPER( cur );
1785             U8 *uc           = (U8*)STRING( noper );
1786             const U8 *e      = uc + STR_LEN( noper );
1787             U32 state        = 1;         /* required init */
1788             U16 charid       = 0;         /* sanity init */
1789             U8 *scan         = (U8*)NULL; /* sanity init */
1790             STRLEN foldlen   = 0;         /* required init */
1791             U32 wordlen      = 0;         /* required init */
1792             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1793             STRLEN skiplen   = 0;
1794
1795             if (OP(noper) == NOTHING) {
1796                 regnode *noper_next= regnext(noper);
1797                 if (noper_next != tail && OP(noper_next) == flags) {
1798                     noper = noper_next;
1799                     uc= (U8*)STRING(noper);
1800                     e= uc + STR_LEN(noper);
1801                 }
1802             }
1803
1804             if (OP(noper) != NOTHING) {
1805                 for ( ; uc < e ; uc += len ) {
1806
1807                     TRIE_READ_CHAR;
1808
1809                     if ( uvc < 256 ) {
1810                         charid = trie->charmap[ uvc ];
1811                     } else {
1812                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1813                         if ( !svpp ) {
1814                             charid = 0;
1815                         } else {
1816                             charid=(U16)SvIV( *svpp );
1817                         }
1818                     }
1819                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1820                     if ( charid ) {
1821
1822                         U16 check;
1823                         U32 newstate = 0;
1824
1825                         charid--;
1826                         if ( !trie->states[ state ].trans.list ) {
1827                             TRIE_LIST_NEW( state );
1828                         }
1829                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1830                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1831                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1832                                 break;
1833                             }
1834                         }
1835                         if ( ! newstate ) {
1836                             newstate = next_alloc++;
1837                             prev_states[newstate] = state;
1838                             TRIE_LIST_PUSH( state, charid, newstate );
1839                             transcount++;
1840                         }
1841                         state = newstate;
1842                     } else {
1843                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1844                     }
1845                 }
1846             }
1847             TRIE_HANDLE_WORD(state);
1848
1849         } /* end second pass */
1850
1851         /* next alloc is the NEXT state to be allocated */
1852         trie->statecount = next_alloc;
1853         trie->states = (reg_trie_state *)
1854             PerlMemShared_realloc( trie->states,
1855                                    next_alloc
1856                                    * sizeof(reg_trie_state) );
1857
1858         /* and now dump it out before we compress it */
1859         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1860                                                          revcharmap, next_alloc,
1861                                                          depth+1)
1862         );
1863
1864         trie->trans = (reg_trie_trans *)
1865             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1866         {
1867             U32 state;
1868             U32 tp = 0;
1869             U32 zp = 0;
1870
1871
1872             for( state=1 ; state < next_alloc ; state ++ ) {
1873                 U32 base=0;
1874
1875                 /*
1876                 DEBUG_TRIE_COMPILE_MORE_r(
1877                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1878                 );
1879                 */
1880
1881                 if (trie->states[state].trans.list) {
1882                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1883                     U16 maxid=minid;
1884                     U16 idx;
1885
1886                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1887                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1888                         if ( forid < minid ) {
1889                             minid=forid;
1890                         } else if ( forid > maxid ) {
1891                             maxid=forid;
1892                         }
1893                     }
1894                     if ( transcount < tp + maxid - minid + 1) {
1895                         transcount *= 2;
1896                         trie->trans = (reg_trie_trans *)
1897                             PerlMemShared_realloc( trie->trans,
1898                                                      transcount
1899                                                      * sizeof(reg_trie_trans) );
1900                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1901                     }
1902                     base = trie->uniquecharcount + tp - minid;
1903                     if ( maxid == minid ) {
1904                         U32 set = 0;
1905                         for ( ; zp < tp ; zp++ ) {
1906                             if ( ! trie->trans[ zp ].next ) {
1907                                 base = trie->uniquecharcount + zp - minid;
1908                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1909                                 trie->trans[ zp ].check = state;
1910                                 set = 1;
1911                                 break;
1912                             }
1913                         }
1914                         if ( !set ) {
1915                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1916                             trie->trans[ tp ].check = state;
1917                             tp++;
1918                             zp = tp;
1919                         }
1920                     } else {
1921                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1922                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1923                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1924                             trie->trans[ tid ].check = state;
1925                         }
1926                         tp += ( maxid - minid + 1 );
1927                     }
1928                     Safefree(trie->states[ state ].trans.list);
1929                 }
1930                 /*
1931                 DEBUG_TRIE_COMPILE_MORE_r(
1932                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1933                 );
1934                 */
1935                 trie->states[ state ].trans.base=base;
1936             }
1937             trie->lasttrans = tp + 1;
1938         }
1939     } else {
1940         /*
1941            Second Pass -- Flat Table Representation.
1942
1943            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1944            We know that we will need Charcount+1 trans at most to store the data
1945            (one row per char at worst case) So we preallocate both structures
1946            assuming worst case.
1947
1948            We then construct the trie using only the .next slots of the entry
1949            structs.
1950
1951            We use the .check field of the first entry of the node temporarily to
1952            make compression both faster and easier by keeping track of how many non
1953            zero fields are in the node.
1954
1955            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1956            transition.
1957
1958            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1959            number representing the first entry of the node, and state as a
1960            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1961            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1962            are 2 entrys per node. eg:
1963
1964              A B       A B
1965           1. 2 4    1. 3 7
1966           2. 0 3    3. 0 5
1967           3. 0 0    5. 0 0
1968           4. 0 0    7. 0 0
1969
1970            The table is internally in the right hand, idx form. However as we also
1971            have to deal with the states array which is indexed by nodenum we have to
1972            use TRIE_NODENUM() to convert.
1973
1974         */
1975         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1976             "%*sCompiling trie using table compiler\n",
1977             (int)depth * 2 + 2, ""));
1978
1979         trie->trans = (reg_trie_trans *)
1980             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1981                                   * trie->uniquecharcount + 1,
1982                                   sizeof(reg_trie_trans) );
1983         trie->states = (reg_trie_state *)
1984             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1985                                   sizeof(reg_trie_state) );
1986         next_alloc = trie->uniquecharcount + 1;
1987
1988
1989         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1990
1991             regnode *noper   = NEXTOPER( cur );
1992             const U8 *uc     = (U8*)STRING( noper );
1993             const U8 *e      = uc + STR_LEN( noper );
1994
1995             U32 state        = 1;         /* required init */
1996
1997             U16 charid       = 0;         /* sanity init */
1998             U32 accept_state = 0;         /* sanity init */
1999             U8 *scan         = (U8*)NULL; /* sanity init */
2000
2001             STRLEN foldlen   = 0;         /* required init */
2002             U32 wordlen      = 0;         /* required init */
2003             STRLEN skiplen   = 0;
2004             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2005
2006             if (OP(noper) == NOTHING) {
2007                 regnode *noper_next= regnext(noper);
2008                 if (noper_next != tail && OP(noper_next) == flags) {
2009                     noper = noper_next;
2010                     uc= (U8*)STRING(noper);
2011                     e= uc + STR_LEN(noper);
2012                 }
2013             }
2014
2015             if ( OP(noper) != NOTHING ) {
2016                 for ( ; uc < e ; uc += len ) {
2017
2018                     TRIE_READ_CHAR;
2019
2020                     if ( uvc < 256 ) {
2021                         charid = trie->charmap[ uvc ];
2022                     } else {
2023                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2024                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2025                     }
2026                     if ( charid ) {
2027                         charid--;
2028                         if ( !trie->trans[ state + charid ].next ) {
2029                             trie->trans[ state + charid ].next = next_alloc;
2030                             trie->trans[ state ].check++;
2031                             prev_states[TRIE_NODENUM(next_alloc)]
2032                                     = TRIE_NODENUM(state);
2033                             next_alloc += trie->uniquecharcount;
2034                         }
2035                         state = trie->trans[ state + charid ].next;
2036                     } else {
2037                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2038                     }
2039                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2040                 }
2041             }
2042             accept_state = TRIE_NODENUM( state );
2043             TRIE_HANDLE_WORD(accept_state);
2044
2045         } /* end second pass */
2046
2047         /* and now dump it out before we compress it */
2048         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2049                                                           revcharmap,
2050                                                           next_alloc, depth+1));
2051
2052         {
2053         /*
2054            * Inplace compress the table.*
2055
2056            For sparse data sets the table constructed by the trie algorithm will
2057            be mostly 0/FAIL transitions or to put it another way mostly empty.
2058            (Note that leaf nodes will not contain any transitions.)
2059
2060            This algorithm compresses the tables by eliminating most such
2061            transitions, at the cost of a modest bit of extra work during lookup:
2062
2063            - Each states[] entry contains a .base field which indicates the
2064            index in the state[] array wheres its transition data is stored.
2065
2066            - If .base is 0 there are no valid transitions from that node.
2067
2068            - If .base is nonzero then charid is added to it to find an entry in
2069            the trans array.
2070
2071            -If trans[states[state].base+charid].check!=state then the
2072            transition is taken to be a 0/Fail transition. Thus if there are fail
2073            transitions at the front of the node then the .base offset will point
2074            somewhere inside the previous nodes data (or maybe even into a node
2075            even earlier), but the .check field determines if the transition is
2076            valid.
2077
2078            XXX - wrong maybe?
2079            The following process inplace converts the table to the compressed
2080            table: We first do not compress the root node 1,and mark all its
2081            .check pointers as 1 and set its .base pointer as 1 as well. This
2082            allows us to do a DFA construction from the compressed table later,
2083            and ensures that any .base pointers we calculate later are greater
2084            than 0.
2085
2086            - We set 'pos' to indicate the first entry of the second node.
2087
2088            - We then iterate over the columns of the node, finding the first and
2089            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2090            and set the .check pointers accordingly, and advance pos
2091            appropriately and repreat for the next node. Note that when we copy
2092            the next pointers we have to convert them from the original
2093            NODEIDX form to NODENUM form as the former is not valid post
2094            compression.
2095
2096            - If a node has no transitions used we mark its base as 0 and do not
2097            advance the pos pointer.
2098
2099            - If a node only has one transition we use a second pointer into the
2100            structure to fill in allocated fail transitions from other states.
2101            This pointer is independent of the main pointer and scans forward
2102            looking for null transitions that are allocated to a state. When it
2103            finds one it writes the single transition into the "hole".  If the
2104            pointer doesnt find one the single transition is appended as normal.
2105
2106            - Once compressed we can Renew/realloc the structures to release the
2107            excess space.
2108
2109            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2110            specifically Fig 3.47 and the associated pseudocode.
2111
2112            demq
2113         */
2114         const U32 laststate = TRIE_NODENUM( next_alloc );
2115         U32 state, charid;
2116         U32 pos = 0, zp=0;
2117         trie->statecount = laststate;
2118
2119         for ( state = 1 ; state < laststate ; state++ ) {
2120             U8 flag = 0;
2121             const U32 stateidx = TRIE_NODEIDX( state );
2122             const U32 o_used = trie->trans[ stateidx ].check;
2123             U32 used = trie->trans[ stateidx ].check;
2124             trie->trans[ stateidx ].check = 0;
2125
2126             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2127                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2128                     if ( trie->trans[ stateidx + charid ].next ) {
2129                         if (o_used == 1) {
2130                             for ( ; zp < pos ; zp++ ) {
2131                                 if ( ! trie->trans[ zp ].next ) {
2132                                     break;
2133                                 }
2134                             }
2135                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2136                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2137                             trie->trans[ zp ].check = state;
2138                             if ( ++zp > pos ) pos = zp;
2139                             break;
2140                         }
2141                         used--;
2142                     }
2143                     if ( !flag ) {
2144                         flag = 1;
2145                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2146                     }
2147                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2148                     trie->trans[ pos ].check = state;
2149                     pos++;
2150                 }
2151             }
2152         }
2153         trie->lasttrans = pos + 1;
2154         trie->states = (reg_trie_state *)
2155             PerlMemShared_realloc( trie->states, laststate
2156                                    * sizeof(reg_trie_state) );
2157         DEBUG_TRIE_COMPILE_MORE_r(
2158                 PerlIO_printf( Perl_debug_log,
2159                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2160                     (int)depth * 2 + 2,"",
2161                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2162                     (IV)next_alloc,
2163                     (IV)pos,
2164                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2165             );
2166
2167         } /* end table compress */
2168     }
2169     DEBUG_TRIE_COMPILE_MORE_r(
2170             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2171                 (int)depth * 2 + 2, "",
2172                 (UV)trie->statecount,
2173                 (UV)trie->lasttrans)
2174     );
2175     /* resize the trans array to remove unused space */
2176     trie->trans = (reg_trie_trans *)
2177         PerlMemShared_realloc( trie->trans, trie->lasttrans
2178                                * sizeof(reg_trie_trans) );
2179
2180     {   /* Modify the program and insert the new TRIE node */
2181         U8 nodetype =(U8)(flags & 0xFF);
2182         char *str=NULL;
2183
2184 #ifdef DEBUGGING
2185         regnode *optimize = NULL;
2186 #ifdef RE_TRACK_PATTERN_OFFSETS
2187
2188         U32 mjd_offset = 0;
2189         U32 mjd_nodelen = 0;
2190 #endif /* RE_TRACK_PATTERN_OFFSETS */
2191 #endif /* DEBUGGING */
2192         /*
2193            This means we convert either the first branch or the first Exact,
2194            depending on whether the thing following (in 'last') is a branch
2195            or not and whther first is the startbranch (ie is it a sub part of
2196            the alternation or is it the whole thing.)
2197            Assuming its a sub part we convert the EXACT otherwise we convert
2198            the whole branch sequence, including the first.
2199          */
2200         /* Find the node we are going to overwrite */
2201         if ( first != startbranch || OP( last ) == BRANCH ) {
2202             /* branch sub-chain */
2203             NEXT_OFF( first ) = (U16)(last - first);
2204 #ifdef RE_TRACK_PATTERN_OFFSETS
2205             DEBUG_r({
2206                 mjd_offset= Node_Offset((convert));
2207                 mjd_nodelen= Node_Length((convert));
2208             });
2209 #endif
2210             /* whole branch chain */
2211         }
2212 #ifdef RE_TRACK_PATTERN_OFFSETS
2213         else {
2214             DEBUG_r({
2215                 const  regnode *nop = NEXTOPER( convert );
2216                 mjd_offset= Node_Offset((nop));
2217                 mjd_nodelen= Node_Length((nop));
2218             });
2219         }
2220         DEBUG_OPTIMISE_r(
2221             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2222                 (int)depth * 2 + 2, "",
2223                 (UV)mjd_offset, (UV)mjd_nodelen)
2224         );
2225 #endif
2226         /* But first we check to see if there is a common prefix we can
2227            split out as an EXACT and put in front of the TRIE node.  */
2228         trie->startstate= 1;
2229         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2230             U32 state;
2231             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2232                 U32 ofs = 0;
2233                 I32 idx = -1;
2234                 U32 count = 0;
2235                 const U32 base = trie->states[ state ].trans.base;
2236
2237                 if ( trie->states[state].wordnum )
2238                         count = 1;
2239
2240                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2241                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2242                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2243                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2244                     {
2245                         if ( ++count > 1 ) {
2246                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2247                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2248                             if ( state == 1 ) break;
2249                             if ( count == 2 ) {
2250                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2251                                 DEBUG_OPTIMISE_r(
2252                                     PerlIO_printf(Perl_debug_log,
2253                                         "%*sNew Start State=%"UVuf" Class: [",
2254                                         (int)depth * 2 + 2, "",
2255                                         (UV)state));
2256                                 if (idx >= 0) {
2257                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2258                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2259
2260                                     TRIE_BITMAP_SET(trie,*ch);
2261                                     if ( folder )
2262                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2263                                     DEBUG_OPTIMISE_r(
2264                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2265                                     );
2266                                 }
2267                             }
2268                             TRIE_BITMAP_SET(trie,*ch);
2269                             if ( folder )
2270                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2271                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2272                         }
2273                         idx = ofs;
2274                     }
2275                 }
2276                 if ( count == 1 ) {
2277                     SV **tmp = av_fetch( revcharmap, idx, 0);
2278                     STRLEN len;
2279                     char *ch = SvPV( *tmp, len );
2280                     DEBUG_OPTIMISE_r({
2281                         SV *sv=sv_newmortal();
2282                         PerlIO_printf( Perl_debug_log,
2283                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2284                             (int)depth * 2 + 2, "",
2285                             (UV)state, (UV)idx,
2286                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2287                                 PL_colors[0], PL_colors[1],
2288                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2289                                 PERL_PV_ESCAPE_FIRSTCHAR
2290                             )
2291                         );
2292                     });
2293                     if ( state==1 ) {
2294                         OP( convert ) = nodetype;
2295                         str=STRING(convert);
2296                         STR_LEN(convert)=0;
2297                     }
2298                     STR_LEN(convert) += len;
2299                     while (len--)
2300                         *str++ = *ch++;
2301                 } else {
2302 #ifdef DEBUGGING
2303                     if (state>1)
2304                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2305 #endif
2306                     break;
2307                 }
2308             }
2309             trie->prefixlen = (state-1);
2310             if (str) {
2311                 regnode *n = convert+NODE_SZ_STR(convert);
2312                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2313                 trie->startstate = state;
2314                 trie->minlen -= (state - 1);
2315                 trie->maxlen -= (state - 1);
2316 #ifdef DEBUGGING
2317                /* At least the UNICOS C compiler choked on this
2318                 * being argument to DEBUG_r(), so let's just have
2319                 * it right here. */
2320                if (
2321 #ifdef PERL_EXT_RE_BUILD
2322                    1
2323 #else
2324                    DEBUG_r_TEST
2325 #endif
2326                    ) {
2327                    regnode *fix = convert;
2328                    U32 word = trie->wordcount;
2329                    mjd_nodelen++;
2330                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2331                    while( ++fix < n ) {
2332                        Set_Node_Offset_Length(fix, 0, 0);
2333                    }
2334                    while (word--) {
2335                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2336                        if (tmp) {
2337                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2338                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2339                            else
2340                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2341                        }
2342                    }
2343                }
2344 #endif
2345                 if (trie->maxlen) {
2346                     convert = n;
2347                 } else {
2348                     NEXT_OFF(convert) = (U16)(tail - convert);
2349                     DEBUG_r(optimize= n);
2350                 }
2351             }
2352         }
2353         if (!jumper)
2354             jumper = last;
2355         if ( trie->maxlen ) {
2356             NEXT_OFF( convert ) = (U16)(tail - convert);
2357             ARG_SET( convert, data_slot );
2358             /* Store the offset to the first unabsorbed branch in
2359                jump[0], which is otherwise unused by the jump logic.
2360                We use this when dumping a trie and during optimisation. */
2361             if (trie->jump)
2362                 trie->jump[0] = (U16)(nextbranch - convert);
2363
2364             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2365              *   and there is a bitmap
2366              *   and the first "jump target" node we found leaves enough room
2367              * then convert the TRIE node into a TRIEC node, with the bitmap
2368              * embedded inline in the opcode - this is hypothetically faster.
2369              */
2370             if ( !trie->states[trie->startstate].wordnum
2371                  && trie->bitmap
2372                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2373             {
2374                 OP( convert ) = TRIEC;
2375                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2376                 PerlMemShared_free(trie->bitmap);
2377                 trie->bitmap= NULL;
2378             } else
2379                 OP( convert ) = TRIE;
2380
2381             /* store the type in the flags */
2382             convert->flags = nodetype;
2383             DEBUG_r({
2384             optimize = convert
2385                       + NODE_STEP_REGNODE
2386                       + regarglen[ OP( convert ) ];
2387             });
2388             /* XXX We really should free up the resource in trie now,
2389                    as we won't use them - (which resources?) dmq */
2390         }
2391         /* needed for dumping*/
2392         DEBUG_r(if (optimize) {
2393             regnode *opt = convert;
2394
2395             while ( ++opt < optimize) {
2396                 Set_Node_Offset_Length(opt,0,0);
2397             }
2398             /*
2399                 Try to clean up some of the debris left after the
2400                 optimisation.
2401              */
2402             while( optimize < jumper ) {
2403                 mjd_nodelen += Node_Length((optimize));
2404                 OP( optimize ) = OPTIMIZED;
2405                 Set_Node_Offset_Length(optimize,0,0);
2406                 optimize++;
2407             }
2408             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2409         });
2410     } /* end node insert */
2411
2412     /*  Finish populating the prev field of the wordinfo array.  Walk back
2413      *  from each accept state until we find another accept state, and if
2414      *  so, point the first word's .prev field at the second word. If the
2415      *  second already has a .prev field set, stop now. This will be the
2416      *  case either if we've already processed that word's accept state,
2417      *  or that state had multiple words, and the overspill words were
2418      *  already linked up earlier.
2419      */
2420     {
2421         U16 word;
2422         U32 state;
2423         U16 prev;
2424
2425         for (word=1; word <= trie->wordcount; word++) {
2426             prev = 0;
2427             if (trie->wordinfo[word].prev)
2428                 continue;
2429             state = trie->wordinfo[word].accept;
2430             while (state) {
2431                 state = prev_states[state];
2432                 if (!state)
2433                     break;
2434                 prev = trie->states[state].wordnum;
2435                 if (prev)
2436                     break;
2437             }
2438             trie->wordinfo[word].prev = prev;
2439         }
2440         Safefree(prev_states);
2441     }
2442
2443
2444     /* and now dump out the compressed format */
2445     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2446
2447     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2448 #ifdef DEBUGGING
2449     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2450     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2451 #else
2452     SvREFCNT_dec(revcharmap);
2453 #endif
2454     return trie->jump
2455            ? MADE_JUMP_TRIE
2456            : trie->startstate>1
2457              ? MADE_EXACT_TRIE
2458              : MADE_TRIE;
2459 }
2460
2461 STATIC void
2462 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2463 {
2464 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2465
2466    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2467    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2468    ISBN 0-201-10088-6
2469
2470    We find the fail state for each state in the trie, this state is the longest proper
2471    suffix of the current state's 'word' that is also a proper prefix of another word in our
2472    trie. State 1 represents the word '' and is thus the default fail state. This allows
2473    the DFA not to have to restart after its tried and failed a word at a given point, it
2474    simply continues as though it had been matching the other word in the first place.
2475    Consider
2476       'abcdgu'=~/abcdefg|cdgu/
2477    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2478    fail, which would bring us to the state representing 'd' in the second word where we would
2479    try 'g' and succeed, proceeding to match 'cdgu'.
2480  */
2481  /* add a fail transition */
2482     const U32 trie_offset = ARG(source);
2483     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2484     U32 *q;
2485     const U32 ucharcount = trie->uniquecharcount;
2486     const U32 numstates = trie->statecount;
2487     const U32 ubound = trie->lasttrans + ucharcount;
2488     U32 q_read = 0;
2489     U32 q_write = 0;
2490     U32 charid;
2491     U32 base = trie->states[ 1 ].trans.base;
2492     U32 *fail;
2493     reg_ac_data *aho;
2494     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2495     GET_RE_DEBUG_FLAGS_DECL;
2496
2497     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2498 #ifndef DEBUGGING
2499     PERL_UNUSED_ARG(depth);
2500 #endif
2501
2502
2503     ARG_SET( stclass, data_slot );
2504     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2505     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2506     aho->trie=trie_offset;
2507     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2508     Copy( trie->states, aho->states, numstates, reg_trie_state );
2509     Newxz( q, numstates, U32);
2510     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2511     aho->refcount = 1;
2512     fail = aho->fail;
2513     /* initialize fail[0..1] to be 1 so that we always have
2514        a valid final fail state */
2515     fail[ 0 ] = fail[ 1 ] = 1;
2516
2517     for ( charid = 0; charid < ucharcount ; charid++ ) {
2518         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2519         if ( newstate ) {
2520             q[ q_write ] = newstate;
2521             /* set to point at the root */
2522             fail[ q[ q_write++ ] ]=1;
2523         }
2524     }
2525     while ( q_read < q_write) {
2526         const U32 cur = q[ q_read++ % numstates ];
2527         base = trie->states[ cur ].trans.base;
2528
2529         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2530             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2531             if (ch_state) {
2532                 U32 fail_state = cur;
2533                 U32 fail_base;
2534                 do {
2535                     fail_state = fail[ fail_state ];
2536                     fail_base = aho->states[ fail_state ].trans.base;
2537                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2538
2539                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2540                 fail[ ch_state ] = fail_state;
2541                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2542                 {
2543                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2544                 }
2545                 q[ q_write++ % numstates] = ch_state;
2546             }
2547         }
2548     }
2549     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2550        when we fail in state 1, this allows us to use the
2551        charclass scan to find a valid start char. This is based on the principle
2552        that theres a good chance the string being searched contains lots of stuff
2553        that cant be a start char.
2554      */
2555     fail[ 0 ] = fail[ 1 ] = 0;
2556     DEBUG_TRIE_COMPILE_r({
2557         PerlIO_printf(Perl_debug_log,
2558                       "%*sStclass Failtable (%"UVuf" states): 0",
2559                       (int)(depth * 2), "", (UV)numstates
2560         );
2561         for( q_read=1; q_read<numstates; q_read++ ) {
2562             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2563         }
2564         PerlIO_printf(Perl_debug_log, "\n");
2565     });
2566     Safefree(q);
2567     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2568 }
2569
2570
2571 /*
2572  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2573  * These need to be revisited when a newer toolchain becomes available.
2574  */
2575 #if defined(__sparc64__) && defined(__GNUC__)
2576 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2577 #       undef  SPARC64_GCC_WORKAROUND
2578 #       define SPARC64_GCC_WORKAROUND 1
2579 #   endif
2580 #endif
2581
2582 #define DEBUG_PEEP(str,scan,depth) \
2583     DEBUG_OPTIMISE_r({if (scan){ \
2584        SV * const mysv=sv_newmortal(); \
2585        regnode *Next = regnext(scan); \
2586        regprop(RExC_rx, mysv, scan); \
2587        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2588        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2589        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2590    }});
2591
2592
2593 /* The below joins as many adjacent EXACTish nodes as possible into a single
2594  * one.  The regop may be changed if the node(s) contain certain sequences that
2595  * require special handling.  The joining is only done if:
2596  * 1) there is room in the current conglomerated node to entirely contain the
2597  *    next one.
2598  * 2) they are the exact same node type
2599  *
2600  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2601  * these get optimized out
2602  *
2603  * If a node is to match under /i (folded), the number of characters it matches
2604  * can be different than its character length if it contains a multi-character
2605  * fold.  *min_subtract is set to the total delta of the input nodes.
2606  *
2607  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2608  * and contains LATIN SMALL LETTER SHARP S
2609  *
2610  * This is as good a place as any to discuss the design of handling these
2611  * multi-character fold sequences.  It's been wrong in Perl for a very long
2612  * time.  There are three code points in Unicode whose multi-character folds
2613  * were long ago discovered to mess things up.  The previous designs for
2614  * dealing with these involved assigning a special node for them.  This
2615  * approach doesn't work, as evidenced by this example:
2616  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2617  * Both these fold to "sss", but if the pattern is parsed to create a node that
2618  * would match just the \xDF, it won't be able to handle the case where a
2619  * successful match would have to cross the node's boundary.  The new approach
2620  * that hopefully generally solves the problem generates an EXACTFU_SS node
2621  * that is "sss".
2622  *
2623  * It turns out that there are problems with all multi-character folds, and not
2624  * just these three.  Now the code is general, for all such cases, but the
2625  * three still have some special handling.  The approach taken is:
2626  * 1)   This routine examines each EXACTFish node that could contain multi-
2627  *      character fold sequences.  It returns in *min_subtract how much to
2628  *      subtract from the the actual length of the string to get a real minimum
2629  *      match length; it is 0 if there are no multi-char folds.  This delta is
2630  *      used by the caller to adjust the min length of the match, and the delta
2631  *      between min and max, so that the optimizer doesn't reject these
2632  *      possibilities based on size constraints.
2633  * 2)   Certain of these sequences require special handling by the trie code,
2634  *      so, if found, this code changes the joined node type to special ops:
2635  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2636  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2637  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2638  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2639  *      there is a possible fold length change.  That means that a regular
2640  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2641  *      with length changes, and so can be processed faster.  regexec.c takes
2642  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2643  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2644  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2645  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2646  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2647  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2648  *      possibilities for the non-UTF8 patterns are quite simple, except for
2649  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2650  *      members of a fold-pair, and arrays are set up for all of them so that
2651  *      the other member of the pair can be found quickly.  Code elsewhere in
2652  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2653  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2654  *      described in the next item.
2655  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2656  *      'ss' or not is not knowable at compile time.  It will match iff the
2657  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2658  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2659  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2660  *      described in item 3).  An assumption that the optimizer part of
2661  *      regexec.c (probably unwittingly) makes is that a character in the
2662  *      pattern corresponds to at most a single character in the target string.
2663  *      (And I do mean character, and not byte here, unlike other parts of the
2664  *      documentation that have never been updated to account for multibyte
2665  *      Unicode.)  This assumption is wrong only in this case, as all other
2666  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2667  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2668  *      reluctant to try to change this assumption, so instead the code punts.
2669  *      This routine examines EXACTF nodes for the sharp s, and returns a
2670  *      boolean indicating whether or not the node is an EXACTF node that
2671  *      contains a sharp s.  When it is true, the caller sets a flag that later
2672  *      causes the optimizer in this file to not set values for the floating
2673  *      and fixed string lengths, and thus avoids the optimizer code in
2674  *      regexec.c that makes the invalid assumption.  Thus, there is no
2675  *      optimization based on string lengths for EXACTF nodes that contain the
2676  *      sharp s.  This only happens for /id rules (which means the pattern
2677  *      isn't in UTF-8).
2678  */
2679
2680 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2681     if (PL_regkind[OP(scan)] == EXACT) \
2682         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2683
2684 STATIC U32
2685 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2686     /* Merge several consecutive EXACTish nodes into one. */
2687     regnode *n = regnext(scan);
2688     U32 stringok = 1;
2689     regnode *next = scan + NODE_SZ_STR(scan);
2690     U32 merged = 0;
2691     U32 stopnow = 0;
2692 #ifdef DEBUGGING
2693     regnode *stop = scan;
2694     GET_RE_DEBUG_FLAGS_DECL;
2695 #else
2696     PERL_UNUSED_ARG(depth);
2697 #endif
2698
2699     PERL_ARGS_ASSERT_JOIN_EXACT;
2700 #ifndef EXPERIMENTAL_INPLACESCAN
2701     PERL_UNUSED_ARG(flags);
2702     PERL_UNUSED_ARG(val);
2703 #endif
2704     DEBUG_PEEP("join",scan,depth);
2705
2706     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2707      * EXACT ones that are mergeable to the current one. */
2708     while (n
2709            && (PL_regkind[OP(n)] == NOTHING
2710                || (stringok && OP(n) == OP(scan)))
2711            && NEXT_OFF(n)
2712            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2713     {
2714
2715         if (OP(n) == TAIL || n > next)
2716             stringok = 0;
2717         if (PL_regkind[OP(n)] == NOTHING) {
2718             DEBUG_PEEP("skip:",n,depth);
2719             NEXT_OFF(scan) += NEXT_OFF(n);
2720             next = n + NODE_STEP_REGNODE;
2721 #ifdef DEBUGGING
2722             if (stringok)
2723                 stop = n;
2724 #endif
2725             n = regnext(n);
2726         }
2727         else if (stringok) {
2728             const unsigned int oldl = STR_LEN(scan);
2729             regnode * const nnext = regnext(n);
2730
2731             /* XXX I (khw) kind of doubt that this works on platforms where
2732              * U8_MAX is above 255 because of lots of other assumptions */
2733             if (oldl + STR_LEN(n) > U8_MAX)
2734                 break;
2735
2736             DEBUG_PEEP("merg",n,depth);
2737             merged++;
2738
2739             NEXT_OFF(scan) += NEXT_OFF(n);
2740             STR_LEN(scan) += STR_LEN(n);
2741             next = n + NODE_SZ_STR(n);
2742             /* Now we can overwrite *n : */
2743             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2744 #ifdef DEBUGGING
2745             stop = next - 1;
2746 #endif
2747             n = nnext;
2748             if (stopnow) break;
2749         }
2750
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752         if (flags && !NEXT_OFF(n)) {
2753             DEBUG_PEEP("atch", val, depth);
2754             if (reg_off_by_arg[OP(n)]) {
2755                 ARG_SET(n, val - n);
2756             }
2757             else {
2758                 NEXT_OFF(n) = val - n;
2759             }
2760             stopnow = 1;
2761         }
2762 #endif
2763     }
2764
2765     *min_subtract = 0;
2766     *has_exactf_sharp_s = FALSE;
2767
2768     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2769      * can now analyze for sequences of problematic code points.  (Prior to
2770      * this final joining, sequences could have been split over boundaries, and
2771      * hence missed).  The sequences only happen in folding, hence for any
2772      * non-EXACT EXACTish node */
2773     if (OP(scan) != EXACT) {
2774         const U8 * const s0 = (U8*) STRING(scan);
2775         const U8 * s = s0;
2776         const U8 * const s_end = s0 + STR_LEN(scan);
2777
2778         /* One pass is made over the node's string looking for all the
2779          * possibilities.  to avoid some tests in the loop, there are two main
2780          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2781          * non-UTF-8 */
2782         if (UTF) {
2783
2784             /* Examine the string for a multi-character fold sequence.  UTF-8
2785              * patterns have all characters pre-folded by the time this code is
2786              * executed */
2787             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2788                                      length sequence we are looking for is 2 */
2789             {
2790                 int count = 0;
2791                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2792                 if (! len) {    /* Not a multi-char fold: get next char */
2793                     s += UTF8SKIP(s);
2794                     continue;
2795                 }
2796
2797                 /* Nodes with 'ss' require special handling, except for EXACTFL
2798                  * and EXACTFA for which there is no multi-char fold to this */
2799                 if (len == 2 && *s == 's' && *(s+1) == 's'
2800                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2801                 {
2802                     count = 2;
2803                     OP(scan) = EXACTFU_SS;
2804                     s += 2;
2805                 }
2806                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2807                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2808                                       COMBINING_DIAERESIS_UTF8
2809                                       COMBINING_ACUTE_ACCENT_UTF8,
2810                                    6)
2811                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2812                                          COMBINING_DIAERESIS_UTF8
2813                                          COMBINING_ACUTE_ACCENT_UTF8,
2814                                      6)))
2815                 {
2816                     count = 3;
2817
2818                     /* These two folds require special handling by trie's, so
2819                      * change the node type to indicate this.  If EXACTFA and
2820                      * EXACTFL were ever to be handled by trie's, this would
2821                      * have to be changed.  If this node has already been
2822                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2823                      * (khw) think it doesn't matter in regexec.c for UTF
2824                      * patterns, but no need to change it */
2825                     if (OP(scan) == EXACTFU) {
2826                         OP(scan) = EXACTFU_TRICKYFOLD;
2827                     }
2828                     s += 6;
2829                 }
2830                 else { /* Here is a generic multi-char fold. */
2831                     const U8* multi_end  = s + len;
2832
2833                     /* Count how many characters in it.  In the case of /l and
2834                      * /aa, no folds which contain ASCII code points are
2835                      * allowed, so check for those, and skip if found.  (In
2836                      * EXACTFL, no folds are allowed to any Latin1 code point,
2837                      * not just ASCII.  But there aren't any of these
2838                      * currently, nor ever likely, so don't take the time to
2839                      * test for them.  The code that generates the
2840                      * is_MULTI_foo() macros croaks should one actually get put
2841                      * into Unicode .) */
2842                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2843                         count = utf8_length(s, multi_end);
2844                         s = multi_end;
2845                     }
2846                     else {
2847                         while (s < multi_end) {
2848                             if (isASCII(*s)) {
2849                                 s++;
2850                                 goto next_iteration;
2851                             }
2852                             else {
2853                                 s += UTF8SKIP(s);
2854                             }
2855                             count++;
2856                         }
2857                     }
2858                 }
2859
2860                 /* The delta is how long the sequence is minus 1 (1 is how long
2861                  * the character that folds to the sequence is) */
2862                 *min_subtract += count - 1;
2863             next_iteration: ;
2864             }
2865         }
2866         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2867
2868             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2869              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2870              * nodes can't have multi-char folds to this range (and there are
2871              * no existing ones in the upper latin1 range).  In the EXACTF
2872              * case we look also for the sharp s, which can be in the final
2873              * position.  Otherwise we can stop looking 1 byte earlier because
2874              * have to find at least two characters for a multi-fold */
2875             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2876
2877             /* The below is perhaps overboard, but this allows us to save a
2878              * test each time through the loop at the expense of a mask.  This
2879              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2880              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2881              * are 64.  This uses an exclusive 'or' to find that bit and then
2882              * inverts it to form a mask, with just a single 0, in the bit
2883              * position where 'S' and 's' differ. */
2884             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2885             const U8 s_masked = 's' & S_or_s_mask;
2886
2887             while (s < upper) {
2888                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2889                 if (! len) {    /* Not a multi-char fold. */
2890                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2891                     {
2892                         *has_exactf_sharp_s = TRUE;
2893                     }
2894                     s++;
2895                     continue;
2896                 }
2897
2898                 if (len == 2
2899                     && ((*s & S_or_s_mask) == s_masked)
2900                     && ((*(s+1) & S_or_s_mask) == s_masked))
2901                 {
2902
2903                     /* EXACTF nodes need to know that the minimum length
2904                      * changed so that a sharp s in the string can match this
2905                      * ss in the pattern, but they remain EXACTF nodes, as they
2906                      * won't match this unless the target string is is UTF-8,
2907                      * which we don't know until runtime */
2908                     if (OP(scan) != EXACTF) {
2909                         OP(scan) = EXACTFU_SS;
2910                     }
2911                 }
2912
2913                 *min_subtract += len - 1;
2914                 s += len;
2915             }
2916         }
2917     }
2918
2919 #ifdef DEBUGGING
2920     /* Allow dumping but overwriting the collection of skipped
2921      * ops and/or strings with fake optimized ops */
2922     n = scan + NODE_SZ_STR(scan);
2923     while (n <= stop) {
2924         OP(n) = OPTIMIZED;
2925         FLAGS(n) = 0;
2926         NEXT_OFF(n) = 0;
2927         n++;
2928     }
2929 #endif
2930     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2931     return stopnow;
2932 }
2933
2934 /* REx optimizer.  Converts nodes into quicker variants "in place".
2935    Finds fixed substrings.  */
2936
2937 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2938    to the position after last scanned or to NULL. */
2939
2940 #define INIT_AND_WITHP \
2941     assert(!and_withp); \
2942     Newx(and_withp,1,struct regnode_charclass_class); \
2943     SAVEFREEPV(and_withp)
2944
2945 /* this is a chain of data about sub patterns we are processing that
2946    need to be handled separately/specially in study_chunk. Its so
2947    we can simulate recursion without losing state.  */
2948 struct scan_frame;
2949 typedef struct scan_frame {
2950     regnode *last;  /* last node to process in this frame */
2951     regnode *next;  /* next node to process when last is reached */
2952     struct scan_frame *prev; /*previous frame*/
2953     I32 stop; /* what stopparen do we use */
2954 } scan_frame;
2955
2956
2957 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2958
2959 #define CASE_SYNST_FNC(nAmE)                                       \
2960 case nAmE:                                                         \
2961     if (flags & SCF_DO_STCLASS_AND) {                              \
2962             for (value = 0; value < 256; value++)                  \
2963                 if (!is_ ## nAmE ## _cp(value))                       \
2964                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2965     }                                                              \
2966     else {                                                         \
2967             for (value = 0; value < 256; value++)                  \
2968                 if (is_ ## nAmE ## _cp(value))                        \
2969                     ANYOF_BITMAP_SET(data->start_class, value);    \
2970     }                                                              \
2971     break;                                                         \
2972 case N ## nAmE:                                                    \
2973     if (flags & SCF_DO_STCLASS_AND) {                              \
2974             for (value = 0; value < 256; value++)                   \
2975                 if (is_ ## nAmE ## _cp(value))                         \
2976                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2977     }                                                               \
2978     else {                                                          \
2979             for (value = 0; value < 256; value++)                   \
2980                 if (!is_ ## nAmE ## _cp(value))                        \
2981                     ANYOF_BITMAP_SET(data->start_class, value);     \
2982     }                                                               \
2983     break
2984
2985
2986
2987 STATIC I32
2988 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2989                         I32 *minlenp, I32 *deltap,
2990                         regnode *last,
2991                         scan_data_t *data,
2992                         I32 stopparen,
2993                         U8* recursed,
2994                         struct regnode_charclass_class *and_withp,
2995                         U32 flags, U32 depth)
2996                         /* scanp: Start here (read-write). */
2997                         /* deltap: Write maxlen-minlen here. */
2998                         /* last: Stop before this one. */
2999                         /* data: string data about the pattern */
3000                         /* stopparen: treat close N as END */
3001                         /* recursed: which subroutines have we recursed into */
3002                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3003 {
3004     dVAR;
3005     I32 min = 0;    /* There must be at least this number of characters to match */
3006     I32 pars = 0, code;
3007     regnode *scan = *scanp, *next;
3008     I32 delta = 0;
3009     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3010     int is_inf_internal = 0;            /* The studied chunk is infinite */
3011     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3012     scan_data_t data_fake;
3013     SV *re_trie_maxbuff = NULL;
3014     regnode *first_non_open = scan;
3015     I32 stopmin = I32_MAX;
3016     scan_frame *frame = NULL;
3017     GET_RE_DEBUG_FLAGS_DECL;
3018
3019     PERL_ARGS_ASSERT_STUDY_CHUNK;
3020
3021 #ifdef DEBUGGING
3022     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3023 #endif
3024
3025     if ( depth == 0 ) {
3026         while (first_non_open && OP(first_non_open) == OPEN)
3027             first_non_open=regnext(first_non_open);
3028     }
3029
3030
3031   fake_study_recurse:
3032     while ( scan && OP(scan) != END && scan < last ){
3033         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3034                                    node length to get a real minimum (because
3035                                    the folded version may be shorter) */
3036         bool has_exactf_sharp_s = FALSE;
3037         /* Peephole optimizer: */
3038         DEBUG_STUDYDATA("Peep:", data,depth);
3039         DEBUG_PEEP("Peep",scan,depth);
3040
3041         /* Its not clear to khw or hv why this is done here, and not in the
3042          * clauses that deal with EXACT nodes.  khw's guess is that it's
3043          * because of a previous design */
3044         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3045
3046         /* Follow the next-chain of the current node and optimize
3047            away all the NOTHINGs from it.  */
3048         if (OP(scan) != CURLYX) {
3049             const int max = (reg_off_by_arg[OP(scan)]
3050                        ? I32_MAX
3051                        /* I32 may be smaller than U16 on CRAYs! */
3052                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3053             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3054             int noff;
3055             regnode *n = scan;
3056
3057             /* Skip NOTHING and LONGJMP. */
3058             while ((n = regnext(n))
3059                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3060                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3061                    && off + noff < max)
3062                 off += noff;
3063             if (reg_off_by_arg[OP(scan)])
3064                 ARG(scan) = off;
3065             else
3066                 NEXT_OFF(scan) = off;
3067         }
3068
3069
3070
3071         /* The principal pseudo-switch.  Cannot be a switch, since we
3072            look into several different things.  */
3073         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3074                    || OP(scan) == IFTHEN) {
3075             next = regnext(scan);
3076             code = OP(scan);
3077             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3078
3079             if (OP(next) == code || code == IFTHEN) {
3080                 /* NOTE - There is similar code to this block below for handling
3081                    TRIE nodes on a re-study.  If you change stuff here check there
3082                    too. */
3083                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3084                 struct regnode_charclass_class accum;
3085                 regnode * const startbranch=scan;
3086
3087                 if (flags & SCF_DO_SUBSTR)
3088                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3089                 if (flags & SCF_DO_STCLASS)
3090                     cl_init_zero(pRExC_state, &accum);
3091
3092                 while (OP(scan) == code) {
3093                     I32 deltanext, minnext, f = 0, fake;
3094                     struct regnode_charclass_class this_class;
3095
3096                     num++;
3097                     data_fake.flags = 0;
3098                     if (data) {
3099                         data_fake.whilem_c = data->whilem_c;
3100                         data_fake.last_closep = data->last_closep;
3101                     }
3102                     else
3103                         data_fake.last_closep = &fake;
3104
3105                     data_fake.pos_delta = delta;
3106                     next = regnext(scan);
3107                     scan = NEXTOPER(scan);
3108                     if (code != BRANCH)
3109                         scan = NEXTOPER(scan);
3110                     if (flags & SCF_DO_STCLASS) {
3111                         cl_init(pRExC_state, &this_class);
3112                         data_fake.start_class = &this_class;
3113                         f = SCF_DO_STCLASS_AND;
3114                     }
3115                     if (flags & SCF_WHILEM_VISITED_POS)
3116                         f |= SCF_WHILEM_VISITED_POS;
3117
3118                     /* we suppose the run is continuous, last=next...*/
3119                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3120                                           next, &data_fake,
3121                                           stopparen, recursed, NULL, f,depth+1);
3122                     if (min1 > minnext)
3123                         min1 = minnext;
3124                     if (max1 < minnext + deltanext)
3125                         max1 = minnext + deltanext;
3126                     if (deltanext == I32_MAX)
3127                         is_inf = is_inf_internal = 1;
3128                     scan = next;
3129                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3130                         pars++;
3131                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3132                         if ( stopmin > minnext)
3133                             stopmin = min + min1;
3134                         flags &= ~SCF_DO_SUBSTR;
3135                         if (data)
3136                             data->flags |= SCF_SEEN_ACCEPT;
3137                     }
3138                     if (data) {
3139                         if (data_fake.flags & SF_HAS_EVAL)
3140                             data->flags |= SF_HAS_EVAL;
3141                         data->whilem_c = data_fake.whilem_c;
3142                     }
3143                     if (flags & SCF_DO_STCLASS)
3144                         cl_or(pRExC_state, &accum, &this_class);
3145                 }
3146                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3147                     min1 = 0;
3148                 if (flags & SCF_DO_SUBSTR) {
3149                     data->pos_min += min1;
3150                     data->pos_delta += max1 - min1;
3151                     if (max1 != min1 || is_inf)
3152                         data->longest = &(data->longest_float);
3153                 }
3154                 min += min1;
3155                 delta += max1 - min1;
3156                 if (flags & SCF_DO_STCLASS_OR) {
3157                     cl_or(pRExC_state, data->start_class, &accum);
3158                     if (min1) {
3159                         cl_and(data->start_class, and_withp);
3160                         flags &= ~SCF_DO_STCLASS;
3161                     }
3162                 }
3163                 else if (flags & SCF_DO_STCLASS_AND) {
3164                     if (min1) {
3165                         cl_and(data->start_class, &accum);
3166                         flags &= ~SCF_DO_STCLASS;
3167                     }
3168                     else {
3169                         /* Switch to OR mode: cache the old value of
3170                          * data->start_class */
3171                         INIT_AND_WITHP;
3172                         StructCopy(data->start_class, and_withp,
3173                                    struct regnode_charclass_class);
3174                         flags &= ~SCF_DO_STCLASS_AND;
3175                         StructCopy(&accum, data->start_class,
3176                                    struct regnode_charclass_class);
3177                         flags |= SCF_DO_STCLASS_OR;
3178                         data->start_class->flags |= ANYOF_EOS;
3179                     }
3180                 }
3181
3182                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3183                 /* demq.
3184
3185                    Assuming this was/is a branch we are dealing with: 'scan' now
3186                    points at the item that follows the branch sequence, whatever
3187                    it is. We now start at the beginning of the sequence and look
3188                    for subsequences of
3189
3190                    BRANCH->EXACT=>x1
3191                    BRANCH->EXACT=>x2
3192                    tail
3193
3194                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3195
3196                    If we can find such a subsequence we need to turn the first
3197                    element into a trie and then add the subsequent branch exact
3198                    strings to the trie.
3199
3200                    We have two cases
3201
3202                      1. patterns where the whole set of branches can be converted.
3203
3204                      2. patterns where only a subset can be converted.
3205
3206                    In case 1 we can replace the whole set with a single regop
3207                    for the trie. In case 2 we need to keep the start and end
3208                    branches so
3209
3210                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3211                      becomes BRANCH TRIE; BRANCH X;
3212
3213                   There is an additional case, that being where there is a
3214                   common prefix, which gets split out into an EXACT like node
3215                   preceding the TRIE node.
3216
3217                   If x(1..n)==tail then we can do a simple trie, if not we make
3218                   a "jump" trie, such that when we match the appropriate word
3219                   we "jump" to the appropriate tail node. Essentially we turn
3220                   a nested if into a case structure of sorts.
3221
3222                 */
3223
3224                     int made=0;
3225                     if (!re_trie_maxbuff) {
3226                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3227                         if (!SvIOK(re_trie_maxbuff))
3228                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3229                     }
3230                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3231                         regnode *cur;
3232                         regnode *first = (regnode *)NULL;
3233                         regnode *last = (regnode *)NULL;
3234                         regnode *tail = scan;
3235                         U8 trietype = 0;
3236                         U32 count=0;
3237
3238 #ifdef DEBUGGING
3239                         SV * const mysv = sv_newmortal();       /* for dumping */
3240 #endif
3241                         /* var tail is used because there may be a TAIL
3242                            regop in the way. Ie, the exacts will point to the
3243                            thing following the TAIL, but the last branch will
3244                            point at the TAIL. So we advance tail. If we
3245                            have nested (?:) we may have to move through several
3246                            tails.
3247                          */
3248
3249                         while ( OP( tail ) == TAIL ) {
3250                             /* this is the TAIL generated by (?:) */
3251                             tail = regnext( tail );
3252                         }
3253
3254
3255                         DEBUG_TRIE_COMPILE_r({
3256                             regprop(RExC_rx, mysv, tail );
3257                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3258                                 (int)depth * 2 + 2, "",
3259                                 "Looking for TRIE'able sequences. Tail node is: ",
3260                                 SvPV_nolen_const( mysv )
3261                             );
3262                         });
3263
3264                         /*
3265
3266                             Step through the branches
3267                                 cur represents each branch,
3268                                 noper is the first thing to be matched as part of that branch
3269                                 noper_next is the regnext() of that node.
3270
3271                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3272                             via a "jump trie" but we also support building with NOJUMPTRIE,
3273                             which restricts the trie logic to structures like /FOO|BAR/.
3274
3275                             If noper is a trieable nodetype then the branch is a possible optimization
3276                             target. If we are building under NOJUMPTRIE then we require that noper_next
3277                             is the same as scan (our current position in the regex program).
3278
3279                             Once we have two or more consecutive such branches we can create a
3280                             trie of the EXACT's contents and stitch it in place into the program.
3281
3282                             If the sequence represents all of the branches in the alternation we
3283                             replace the entire thing with a single TRIE node.
3284
3285                             Otherwise when it is a subsequence we need to stitch it in place and
3286                             replace only the relevant branches. This means the first branch has
3287                             to remain as it is used by the alternation logic, and its next pointer,
3288                             and needs to be repointed at the item on the branch chain following
3289                             the last branch we have optimized away.
3290
3291                             This could be either a BRANCH, in which case the subsequence is internal,
3292                             or it could be the item following the branch sequence in which case the
3293                             subsequence is at the end (which does not necessarily mean the first node
3294                             is the start of the alternation).
3295
3296                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3297
3298                                 optype          |  trietype
3299                                 ----------------+-----------
3300                                 NOTHING         | NOTHING
3301                                 EXACT           | EXACT
3302                                 EXACTFU         | EXACTFU
3303                                 EXACTFU_SS      | EXACTFU
3304                                 EXACTFU_TRICKYFOLD | EXACTFU
3305                                 EXACTFA         | 0
3306
3307
3308                         */
3309 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3310                        ( EXACT == (X) )   ? EXACT :        \
3311                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3312                        0 )
3313
3314                         /* dont use tail as the end marker for this traverse */
3315                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3316                             regnode * const noper = NEXTOPER( cur );
3317                             U8 noper_type = OP( noper );
3318                             U8 noper_trietype = TRIE_TYPE( noper_type );
3319 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3320                             regnode * const noper_next = regnext( noper );
3321                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3322                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3323 #endif
3324
3325                             DEBUG_TRIE_COMPILE_r({
3326                                 regprop(RExC_rx, mysv, cur);
3327                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3328                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3329
3330                                 regprop(RExC_rx, mysv, noper);
3331                                 PerlIO_printf( Perl_debug_log, " -> %s",
3332                                     SvPV_nolen_const(mysv));
3333
3334                                 if ( noper_next ) {
3335                                   regprop(RExC_rx, mysv, noper_next );
3336                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3337                                     SvPV_nolen_const(mysv));
3338                                 }
3339                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3340                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3341                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3342                                 );
3343                             });
3344
3345                             /* Is noper a trieable nodetype that can be merged with the
3346                              * current trie (if there is one)? */
3347                             if ( noper_trietype
3348                                   &&
3349                                   (
3350                                         ( noper_trietype == NOTHING)
3351                                         || ( trietype == NOTHING )
3352                                         || ( trietype == noper_trietype )
3353                                   )
3354 #ifdef NOJUMPTRIE
3355                                   && noper_next == tail
3356 #endif
3357                                   && count < U16_MAX)
3358                             {
3359                                 /* Handle mergable triable node
3360                                  * Either we are the first node in a new trieable sequence,
3361                                  * in which case we do some bookkeeping, otherwise we update
3362                                  * the end pointer. */
3363                                 if ( !first ) {
3364                                     first = cur;
3365                                     if ( noper_trietype == NOTHING ) {
3366 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3367                                         regnode * const noper_next = regnext( noper );
3368                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3369                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3370 #endif
3371
3372                                         if ( noper_next_trietype ) {
3373                                             trietype = noper_next_trietype;
3374                                         } else if (noper_next_type)  {
3375                                             /* a NOTHING regop is 1 regop wide. We need at least two
3376                                              * for a trie so we can't merge this in */
3377                                             first = NULL;
3378                                         }
3379                                     } else {
3380                                         trietype = noper_trietype;
3381                                     }
3382                                 } else {
3383                                     if ( trietype == NOTHING )
3384                                         trietype = noper_trietype;
3385                                     last = cur;
3386                                 }
3387                                 if (first)
3388                                     count++;
3389                             } /* end handle mergable triable node */
3390                             else {
3391                                 /* handle unmergable node -
3392                                  * noper may either be a triable node which can not be tried
3393                                  * together with the current trie, or a non triable node */
3394                                 if ( last ) {
3395                                     /* If last is set and trietype is not NOTHING then we have found
3396                                      * at least two triable branch sequences in a row of a similar
3397                                      * trietype so we can turn them into a trie. If/when we
3398                                      * allow NOTHING to start a trie sequence this condition will be
3399                                      * required, and it isn't expensive so we leave it in for now. */
3400                                     if ( trietype && trietype != NOTHING )
3401                                         make_trie( pRExC_state,
3402                                                 startbranch, first, cur, tail, count,
3403                                                 trietype, depth+1 );
3404                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3405                                 }
3406                                 if ( noper_trietype
3407 #ifdef NOJUMPTRIE
3408                                      && noper_next == tail
3409 #endif
3410                                 ){
3411                                     /* noper is triable, so we can start a new trie sequence */
3412                                     count = 1;
3413                                     first = cur;
3414                                     trietype = noper_trietype;
3415                                 } else if (first) {
3416                                     /* if we already saw a first but the current node is not triable then we have
3417                                      * to reset the first information. */
3418                                     count = 0;
3419                                     first = NULL;
3420                                     trietype = 0;
3421                                 }
3422                             } /* end handle unmergable node */
3423                         } /* loop over branches */
3424                         DEBUG_TRIE_COMPILE_r({
3425                             regprop(RExC_rx, mysv, cur);
3426                             PerlIO_printf( Perl_debug_log,
3427                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3428                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3429
3430                         });
3431                         if ( last && trietype ) {
3432                             if ( trietype != NOTHING ) {
3433                                 /* the last branch of the sequence was part of a trie,
3434                                  * so we have to construct it here outside of the loop
3435                                  */
3436                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3437 #ifdef TRIE_STUDY_OPT
3438                                 if ( ((made == MADE_EXACT_TRIE &&
3439                                      startbranch == first)
3440                                      || ( first_non_open == first )) &&
3441                                      depth==0 ) {
3442                                     flags |= SCF_TRIE_RESTUDY;
3443                                     if ( startbranch == first
3444                                          && scan == tail )
3445                                     {
3446                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3447                                     }
3448                                 }
3449 #endif
3450                             } else {
3451                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3452                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3453                                  */
3454                                 if ( startbranch == first ) {
3455                                     regnode *opt;
3456                                     /* the entire thing is a NOTHING sequence, something like this:
3457                                      * (?:|) So we can turn it into a plain NOTHING op. */
3458                                     DEBUG_TRIE_COMPILE_r({
3459                                         regprop(RExC_rx, mysv, cur);
3460                                         PerlIO_printf( Perl_debug_log,
3461                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3462                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3463
3464                                     });
3465                                     OP(startbranch)= NOTHING;
3466                                     NEXT_OFF(startbranch)= tail - startbranch;
3467                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3468                                         OP(opt)= OPTIMIZED;
3469                                 }
3470                             }
3471                         } /* end if ( last) */
3472                     } /* TRIE_MAXBUF is non zero */
3473
3474                 } /* do trie */
3475
3476             }
3477             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3478                 scan = NEXTOPER(NEXTOPER(scan));
3479             } else                      /* single branch is optimized. */
3480                 scan = NEXTOPER(scan);
3481             continue;
3482         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3483             scan_frame *newframe = NULL;
3484             I32 paren;
3485             regnode *start;
3486             regnode *end;
3487
3488             if (OP(scan) != SUSPEND) {
3489             /* set the pointer */
3490                 if (OP(scan) == GOSUB) {
3491                     paren = ARG(scan);
3492                     RExC_recurse[ARG2L(scan)] = scan;
3493                     start = RExC_open_parens[paren-1];
3494                     end   = RExC_close_parens[paren-1];
3495                 } else {
3496                     paren = 0;
3497                     start = RExC_rxi->program + 1;
3498                     end   = RExC_opend;
3499                 }
3500                 if (!recursed) {
3501                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3502                     SAVEFREEPV(recursed);
3503                 }
3504                 if (!PAREN_TEST(recursed,paren+1)) {
3505                     PAREN_SET(recursed,paren+1);
3506                     Newx(newframe,1,scan_frame);
3507                 } else {
3508                     if (flags & SCF_DO_SUBSTR) {
3509                         SCAN_COMMIT(pRExC_state,data,minlenp);
3510                         data->longest = &(data->longest_float);
3511                     }
3512                     is_inf = is_inf_internal = 1;
3513                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3514                         cl_anything(pRExC_state, data->start_class);
3515                     flags &= ~SCF_DO_STCLASS;
3516                 }
3517             } else {
3518                 Newx(newframe,1,scan_frame);
3519                 paren = stopparen;
3520                 start = scan+2;
3521                 end = regnext(scan);
3522             }
3523             if (newframe) {
3524                 assert(start);
3525                 assert(end);
3526                 SAVEFREEPV(newframe);
3527                 newframe->next = regnext(scan);
3528                 newframe->last = last;
3529                 newframe->stop = stopparen;
3530                 newframe->prev = frame;
3531
3532                 frame = newframe;
3533                 scan =  start;
3534                 stopparen = paren;
3535                 last = end;
3536
3537                 continue;
3538             }
3539         }
3540         else if (OP(scan) == EXACT) {
3541             I32 l = STR_LEN(scan);
3542             UV uc;
3543             if (UTF) {
3544                 const U8 * const s = (U8*)STRING(scan);
3545                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3546                 l = utf8_length(s, s + l);
3547             } else {
3548                 uc = *((U8*)STRING(scan));
3549             }
3550             min += l;
3551             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3552                 /* The code below prefers earlier match for fixed
3553                    offset, later match for variable offset.  */
3554                 if (data->last_end == -1) { /* Update the start info. */
3555                     data->last_start_min = data->pos_min;
3556                     data->last_start_max = is_inf
3557                         ? I32_MAX : data->pos_min + data->pos_delta;
3558                 }
3559                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3560                 if (UTF)
3561                     SvUTF8_on(data->last_found);
3562                 {
3563                     SV * const sv = data->last_found;
3564                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3565                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3566                     if (mg && mg->mg_len >= 0)
3567                         mg->mg_len += utf8_length((U8*)STRING(scan),
3568                                                   (U8*)STRING(scan)+STR_LEN(scan));
3569                 }
3570                 data->last_end = data->pos_min + l;
3571                 data->pos_min += l; /* As in the first entry. */
3572                 data->flags &= ~SF_BEFORE_EOL;
3573             }
3574             if (flags & SCF_DO_STCLASS_AND) {
3575                 /* Check whether it is compatible with what we know already! */
3576                 int compat = 1;
3577
3578
3579                 /* If compatible, we or it in below.  It is compatible if is
3580                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3581                  * it's for a locale.  Even if there isn't unicode semantics
3582                  * here, at runtime there may be because of matching against a
3583                  * utf8 string, so accept a possible false positive for
3584                  * latin1-range folds */
3585                 if (uc >= 0x100 ||
3586                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3587                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3588                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3589                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3590                     )
3591                 {
3592                     compat = 0;
3593                 }
3594                 ANYOF_CLASS_ZERO(data->start_class);
3595                 ANYOF_BITMAP_ZERO(data->start_class);
3596                 if (compat)
3597                     ANYOF_BITMAP_SET(data->start_class, uc);
3598                 else if (uc >= 0x100) {
3599                     int i;
3600
3601                     /* Some Unicode code points fold to the Latin1 range; as
3602                      * XXX temporary code, instead of figuring out if this is
3603                      * one, just assume it is and set all the start class bits
3604                      * that could be some such above 255 code point's fold
3605                      * which will generate fals positives.  As the code
3606                      * elsewhere that does compute the fold settles down, it
3607                      * can be extracted out and re-used here */
3608                     for (i = 0; i < 256; i++){
3609                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3610                             ANYOF_BITMAP_SET(data->start_class, i);
3611                         }
3612                     }
3613                 }
3614                 data->start_class->flags &= ~ANYOF_EOS;
3615                 if (uc < 0x100)
3616                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3617             }
3618             else if (flags & SCF_DO_STCLASS_OR) {
3619                 /* false positive possible if the class is case-folded */
3620                 if (uc < 0x100)
3621                     ANYOF_BITMAP_SET(data->start_class, uc);
3622                 else
3623                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3624                 data->start_class->flags &= ~ANYOF_EOS;
3625                 cl_and(data->start_class, and_withp);
3626             }
3627             flags &= ~SCF_DO_STCLASS;
3628         }
3629         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3630             I32 l = STR_LEN(scan);
3631             UV uc = *((U8*)STRING(scan));
3632
3633             /* Search for fixed substrings supports EXACT only. */
3634             if (flags & SCF_DO_SUBSTR) {
3635                 assert(data);
3636                 SCAN_COMMIT(pRExC_state, data, minlenp);
3637             }
3638             if (UTF) {
3639                 const U8 * const s = (U8 *)STRING(scan);
3640                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3641                 l = utf8_length(s, s + l);
3642             }
3643             if (has_exactf_sharp_s) {
3644                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3645             }
3646             min += l - min_subtract;
3647             assert (min >= 0);
3648             delta += min_subtract;
3649             if (flags & SCF_DO_SUBSTR) {
3650                 data->pos_min += l - min_subtract;
3651                 if (data->pos_min < 0) {
3652                     data->pos_min = 0;
3653                 }
3654                 data->pos_delta += min_subtract;
3655                 if (min_subtract) {
3656                     data->longest = &(data->longest_float);
3657                 }
3658             }
3659             if (flags & SCF_DO_STCLASS_AND) {
3660                 /* Check whether it is compatible with what we know already! */
3661                 int compat = 1;
3662                 if (uc >= 0x100 ||
3663                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3664                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3665                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3666                 {
3667                     compat = 0;
3668                 }
3669                 ANYOF_CLASS_ZERO(data->start_class);
3670                 ANYOF_BITMAP_ZERO(data->start_class);
3671                 if (compat) {
3672                     ANYOF_BITMAP_SET(data->start_class, uc);
3673                     data->start_class->flags &= ~ANYOF_EOS;
3674                     if (OP(scan) == EXACTFL) {
3675                         /* XXX This set is probably no longer necessary, and
3676                          * probably wrong as LOCALE now is on in the initial
3677                          * state */
3678                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3679                     }
3680                     else {
3681
3682                         /* Also set the other member of the fold pair.  In case
3683                          * that unicode semantics is called for at runtime, use
3684                          * the full latin1 fold.  (Can't do this for locale,
3685                          * because not known until runtime) */
3686                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3687
3688                         /* All other (EXACTFL handled above) folds except under
3689                          * /iaa that include s, S, and sharp_s also may include
3690                          * the others */
3691                         if (OP(scan) != EXACTFA) {
3692                             if (uc == 's' || uc == 'S') {
3693                                 ANYOF_BITMAP_SET(data->start_class,
3694                                                  LATIN_SMALL_LETTER_SHARP_S);
3695                             }
3696                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3697                                 ANYOF_BITMAP_SET(data->start_class, 's');
3698                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3699                             }
3700                         }
3701                     }
3702                 }
3703                 else if (uc >= 0x100) {
3704                     int i;
3705                     for (i = 0; i < 256; i++){
3706                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3707                             ANYOF_BITMAP_SET(data->start_class, i);
3708                         }
3709                     }
3710                 }
3711             }
3712             else if (flags & SCF_DO_STCLASS_OR) {
3713                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3714                     /* false positive possible if the class is case-folded.
3715                        Assume that the locale settings are the same... */
3716                     if (uc < 0x100) {
3717                         ANYOF_BITMAP_SET(data->start_class, uc);
3718                         if (OP(scan) != EXACTFL) {
3719
3720                             /* And set the other member of the fold pair, but
3721                              * can't do that in locale because not known until
3722                              * run-time */
3723                             ANYOF_BITMAP_SET(data->start_class,
3724                                              PL_fold_latin1[uc]);
3725
3726                             /* All folds except under /iaa that include s, S,
3727                              * and sharp_s also may include the others */
3728                             if (OP(scan) != EXACTFA) {
3729                                 if (uc == 's' || uc == 'S') {
3730                                     ANYOF_BITMAP_SET(data->start_class,
3731                                                    LATIN_SMALL_LETTER_SHARP_S);
3732                                 }
3733                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3734                                     ANYOF_BITMAP_SET(data->start_class, 's');
3735                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3736                                 }
3737                             }
3738                         }
3739                     }
3740                     data->start_class->flags &= ~ANYOF_EOS;
3741                 }
3742                 cl_and(data->start_class, and_withp);
3743             }
3744             flags &= ~SCF_DO_STCLASS;
3745         }
3746         else if (REGNODE_VARIES(OP(scan))) {
3747             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3748             I32 f = flags, pos_before = 0;
3749             regnode * const oscan = scan;
3750             struct regnode_charclass_class this_class;
3751             struct regnode_charclass_class *oclass = NULL;
3752             I32 next_is_eval = 0;
3753
3754             switch (PL_regkind[OP(scan)]) {
3755             case WHILEM:                /* End of (?:...)* . */
3756                 scan = NEXTOPER(scan);
3757                 goto finish;
3758             case PLUS:
3759                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3760                     next = NEXTOPER(scan);
3761                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3762                         mincount = 1;
3763                         maxcount = REG_INFTY;
3764                         next = regnext(scan);
3765                         scan = NEXTOPER(scan);
3766                         goto do_curly;
3767                     }
3768                 }
3769                 if (flags & SCF_DO_SUBSTR)
3770                     data->pos_min++;
3771                 min++;
3772                 /* Fall through. */
3773             case STAR:
3774                 if (flags & SCF_DO_STCLASS) {
3775                     mincount = 0;
3776                     maxcount = REG_INFTY;
3777                     next = regnext(scan);
3778                     scan = NEXTOPER(scan);
3779                     goto do_curly;
3780                 }
3781                 is_inf = is_inf_internal = 1;
3782                 scan = regnext(scan);
3783                 if (flags & SCF_DO_SUBSTR) {
3784                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3785                     data->longest = &(data->longest_float);
3786                 }
3787                 goto optimize_curly_tail;
3788             case CURLY:
3789                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3790                     && (scan->flags == stopparen))
3791                 {
3792                     mincount = 1;
3793                     maxcount = 1;
3794                 } else {
3795                     mincount = ARG1(scan);
3796                     maxcount = ARG2(scan);
3797                 }
3798                 next = regnext(scan);
3799                 if (OP(scan) == CURLYX) {
3800                     I32 lp = (data ? *(data->last_closep) : 0);
3801                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3802                 }
3803                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3804                 next_is_eval = (OP(scan) == EVAL);
3805               do_curly:
3806                 if (flags & SCF_DO_SUBSTR) {
3807                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3808                     pos_before = data->pos_min;
3809                 }
3810                 if (data) {
3811                     fl = data->flags;
3812                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3813                     if (is_inf)
3814                         data->flags |= SF_IS_INF;
3815                 }
3816                 if (flags & SCF_DO_STCLASS) {
3817                     cl_init(pRExC_state, &this_class);
3818                     oclass = data->start_class;
3819                     data->start_class = &this_class;
3820                     f |= SCF_DO_STCLASS_AND;
3821                     f &= ~SCF_DO_STCLASS_OR;
3822                 }
3823                 /* Exclude from super-linear cache processing any {n,m}
3824                    regops for which the combination of input pos and regex
3825                    pos is not enough information to determine if a match
3826                    will be possible.
3827
3828                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3829                    regex pos at the \s*, the prospects for a match depend not
3830                    only on the input position but also on how many (bar\s*)
3831                    repeats into the {4,8} we are. */
3832                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3833                     f &= ~SCF_WHILEM_VISITED_POS;
3834
3835                 /* This will finish on WHILEM, setting scan, or on NULL: */
3836                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3837                                       last, data, stopparen, recursed, NULL,
3838                                       (mincount == 0
3839                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3840
3841                 if (flags & SCF_DO_STCLASS)
3842                     data->start_class = oclass;
3843                 if (mincount == 0 || minnext == 0) {
3844                     if (flags & SCF_DO_STCLASS_OR) {
3845                         cl_or(pRExC_state, data->start_class, &this_class);
3846                     }
3847                     else if (flags & SCF_DO_STCLASS_AND) {
3848                         /* Switch to OR mode: cache the old value of
3849                          * data->start_class */
3850                         INIT_AND_WITHP;
3851                         StructCopy(data->start_class, and_withp,
3852                                    struct regnode_charclass_class);
3853                         flags &= ~SCF_DO_STCLASS_AND;
3854                         StructCopy(&this_class, data->start_class,
3855                                    struct regnode_charclass_class);
3856                         flags |= SCF_DO_STCLASS_OR;
3857                         data->start_class->flags |= ANYOF_EOS;
3858                     }
3859                 } else {                /* Non-zero len */
3860                     if (flags & SCF_DO_STCLASS_OR) {
3861                         cl_or(pRExC_state, data->start_class, &this_class);
3862                         cl_and(data->start_class, and_withp);
3863                     }
3864                     else if (flags & SCF_DO_STCLASS_AND)
3865                         cl_and(data->start_class, &this_class);
3866                     flags &= ~SCF_DO_STCLASS;
3867                 }
3868                 if (!scan)              /* It was not CURLYX, but CURLY. */
3869                     scan = next;
3870                 if ( /* ? quantifier ok, except for (?{ ... }) */
3871                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3872                     && (minnext == 0) && (deltanext == 0)
3873                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3874                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3875                 {
3876                     ckWARNreg(RExC_parse,
3877                               "Quantifier unexpected on zero-length expression");
3878                 }
3879
3880                 min += minnext * mincount;
3881                 is_inf_internal |= ((maxcount == REG_INFTY
3882                                      && (minnext + deltanext) > 0)
3883                                     || deltanext == I32_MAX);
3884                 is_inf |= is_inf_internal;
3885                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3886
3887                 /* Try powerful optimization CURLYX => CURLYN. */
3888                 if (  OP(oscan) == CURLYX && data
3889                       && data->flags & SF_IN_PAR
3890                       && !(data->flags & SF_HAS_EVAL)
3891                       && !deltanext && minnext == 1 ) {
3892                     /* Try to optimize to CURLYN.  */
3893                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3894                     regnode * const nxt1 = nxt;
3895 #ifdef DEBUGGING
3896                     regnode *nxt2;
3897 #endif
3898
3899                     /* Skip open. */
3900                     nxt = regnext(nxt);
3901                     if (!REGNODE_SIMPLE(OP(nxt))
3902                         && !(PL_regkind[OP(nxt)] == EXACT
3903                              && STR_LEN(nxt) == 1))
3904                         goto nogo;
3905 #ifdef DEBUGGING
3906                     nxt2 = nxt;
3907 #endif
3908                     nxt = regnext(nxt);
3909                     if (OP(nxt) != CLOSE)
3910                         goto nogo;
3911                     if (RExC_open_parens) {
3912                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3913                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3914                     }
3915                     /* Now we know that nxt2 is the only contents: */
3916                     oscan->flags = (U8)ARG(nxt);
3917                     OP(oscan) = CURLYN;
3918                     OP(nxt1) = NOTHING; /* was OPEN. */
3919
3920 #ifdef DEBUGGING
3921                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3922                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3923                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3924                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3925                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3926                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3927 #endif
3928                 }
3929               nogo:
3930
3931                 /* Try optimization CURLYX => CURLYM. */
3932                 if (  OP(oscan) == CURLYX && data
3933                       && !(data->flags & SF_HAS_PAR)
3934                       && !(data->flags & SF_HAS_EVAL)
3935                       && !deltanext     /* atom is fixed width */
3936                       && minnext != 0   /* CURLYM can't handle zero width */
3937                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3938                 ) {
3939                     /* XXXX How to optimize if data == 0? */
3940                     /* Optimize to a simpler form.  */
3941                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3942                     regnode *nxt2;
3943
3944                     OP(oscan) = CURLYM;
3945                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3946                             && (OP(nxt2) != WHILEM))
3947                         nxt = nxt2;
3948                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3949                     /* Need to optimize away parenths. */
3950                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3951                         /* Set the parenth number.  */
3952                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3953
3954                         oscan->flags = (U8)ARG(nxt);
3955                         if (RExC_open_parens) {
3956                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3957                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3958                         }
3959                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3960                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3961
3962 #ifdef DEBUGGING
3963                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3964                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3965                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3966                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3967 #endif
3968 #if 0
3969                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3970                             regnode *nnxt = regnext(nxt1);
3971                             if (nnxt == nxt) {
3972                                 if (reg_off_by_arg[OP(nxt1)])
3973                                     ARG_SET(nxt1, nxt2 - nxt1);
3974                                 else if (nxt2 - nxt1 < U16_MAX)
3975                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3976                                 else
3977                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3978                             }
3979                             nxt1 = nnxt;
3980                         }
3981 #endif
3982                         /* Optimize again: */
3983                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3984                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3985                     }
3986                     else
3987                         oscan->flags = 0;
3988                 }
3989                 else if ((OP(oscan) == CURLYX)
3990                          && (flags & SCF_WHILEM_VISITED_POS)
3991                          /* See the comment on a similar expression above.
3992                             However, this time it's not a subexpression
3993                             we care about, but the expression itself. */
3994                          && (maxcount == REG_INFTY)
3995                          && data && ++data->whilem_c < 16) {
3996                     /* This stays as CURLYX, we can put the count/of pair. */
3997                     /* Find WHILEM (as in regexec.c) */
3998                     regnode *nxt = oscan + NEXT_OFF(oscan);
3999
4000                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4001                         nxt += ARG(nxt);
4002                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4003                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4004                 }
4005                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4006                     pars++;
4007                 if (flags & SCF_DO_SUBSTR) {
4008                     SV *last_str = NULL;
4009                     int counted = mincount != 0;
4010
4011                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4012 #if defined(SPARC64_GCC_WORKAROUND)
4013                         I32 b = 0;
4014                         STRLEN l = 0;
4015                         const char *s = NULL;
4016                         I32 old = 0;
4017
4018                         if (pos_before >= data->last_start_min)
4019                             b = pos_before;
4020                         else
4021                             b = data->last_start_min;
4022
4023                         l = 0;
4024                         s = SvPV_const(data->last_found, l);
4025                         old = b - data->last_start_min;
4026
4027 #else
4028                         I32 b = pos_before >= data->last_start_min
4029                             ? pos_before : data->last_start_min;
4030                         STRLEN l;
4031                         const char * const s = SvPV_const(data->last_found, l);
4032                         I32 old = b - data->last_start_min;
4033 #endif
4034
4035                         if (UTF)
4036                             old = utf8_hop((U8*)s, old) - (U8*)s;
4037                         l -= old;
4038                         /* Get the added string: */
4039                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4040                         if (deltanext == 0 && pos_before == b) {
4041                             /* What was added is a constant string */
4042                             if (mincount > 1) {
4043                                 SvGROW(last_str, (mincount * l) + 1);
4044                                 repeatcpy(SvPVX(last_str) + l,
4045                                           SvPVX_const(last_str), l, mincount - 1);
4046                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4047                                 /* Add additional parts. */
4048                                 SvCUR_set(data->last_found,
4049                                           SvCUR(data->last_found) - l);
4050                                 sv_catsv(data->last_found, last_str);
4051                                 {
4052                                     SV * sv = data->last_found;
4053                                     MAGIC *mg =
4054                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4055                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4056                                     if (mg && mg->mg_len >= 0)
4057                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4058                                 }
4059                                 data->last_end += l * (mincount - 1);
4060                             }
4061                         } else {
4062                             /* start offset must point into the last copy */
4063                             data->last_start_min += minnext * (mincount - 1);
4064                             data->last_start_max += is_inf ? I32_MAX
4065                                 : (maxcount - 1) * (minnext + data->pos_delta);
4066                         }
4067                     }
4068                     /* It is counted once already... */
4069                     data->pos_min += minnext * (mincount - counted);
4070                     data->pos_delta += - counted * deltanext +
4071                         (minnext + deltanext) * maxcount - minnext * mincount;
4072                     if (mincount != maxcount) {
4073                          /* Cannot extend fixed substrings found inside
4074                             the group.  */
4075                         SCAN_COMMIT(pRExC_state,data,minlenp);
4076                         if (mincount && last_str) {
4077                             SV * const sv = data->last_found;
4078                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4079                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4080
4081                             if (mg)
4082                                 mg->mg_len = -1;
4083                             sv_setsv(sv, last_str);
4084                             data->last_end = data->pos_min;
4085                             data->last_start_min =
4086                                 data->pos_min - CHR_SVLEN(last_str);
4087                             data->last_start_max = is_inf
4088                                 ? I32_MAX
4089                                 : data->pos_min + data->pos_delta
4090                                 - CHR_SVLEN(last_str);
4091                         }
4092                         data->longest = &(data->longest_float);
4093                     }
4094                     SvREFCNT_dec(last_str);
4095                 }
4096                 if (data && (fl & SF_HAS_EVAL))
4097                     data->flags |= SF_HAS_EVAL;
4098               optimize_curly_tail:
4099                 if (OP(oscan) != CURLYX) {
4100                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4101                            && NEXT_OFF(next))
4102                         NEXT_OFF(oscan) += NEXT_OFF(next);
4103                 }
4104                 continue;
4105             default:                    /* REF, ANYOFV, and CLUMP only? */
4106                 if (flags & SCF_DO_SUBSTR) {
4107                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4108                     data->longest = &(data->longest_float);
4109                 }
4110                 is_inf = is_inf_internal = 1;
4111                 if (flags & SCF_DO_STCLASS_OR)
4112                     cl_anything(pRExC_state, data->start_class);
4113                 flags &= ~SCF_DO_STCLASS;
4114                 break;
4115             }
4116         }
4117         else if (OP(scan) == LNBREAK) {
4118             if (flags & SCF_DO_STCLASS) {
4119                 int value = 0;
4120                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4121                 if (flags & SCF_DO_STCLASS_AND) {
4122                     for (value = 0; value < 256; value++)
4123                         if (!is_VERTWS_cp(value))
4124                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4125                 }
4126                 else {
4127                     for (value = 0; value < 256; value++)
4128                         if (is_VERTWS_cp(value))
4129                             ANYOF_BITMAP_SET(data->start_class, value);
4130                 }
4131                 if (flags & SCF_DO_STCLASS_OR)
4132                     cl_and(data->start_class, and_withp);
4133                 flags &= ~SCF_DO_STCLASS;
4134             }
4135             min++;
4136             delta++;    /* Because of the 2 char string cr-lf */
4137             if (flags & SCF_DO_SUBSTR) {
4138                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4139                 data->pos_min += 1;
4140                 data->pos_delta += 1;
4141                 data->longest = &(data->longest_float);
4142             }
4143         }
4144         else if (REGNODE_SIMPLE(OP(scan))) {
4145             int value = 0;
4146
4147             if (flags & SCF_DO_SUBSTR) {
4148                 SCAN_COMMIT(pRExC_state,data,minlenp);
4149                 data->pos_min++;
4150             }
4151             min++;
4152             if (flags & SCF_DO_STCLASS) {
4153                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4154
4155                 /* Some of the logic below assumes that switching
4156                    locale on will only add false positives. */
4157                 switch (PL_regkind[OP(scan)]) {
4158                 case SANY:
4159                 default:
4160                   do_default:
4161                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4162                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4163                         cl_anything(pRExC_state, data->start_class);
4164                     break;
4165                 case REG_ANY:
4166                     if (OP(scan) == SANY)
4167                         goto do_default;
4168                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4169                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4170                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4171                         cl_anything(pRExC_state, data->start_class);
4172                     }
4173                     if (flags & SCF_DO_STCLASS_AND || !value)
4174                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4175                     break;
4176                 case ANYOF:
4177                     if (flags & SCF_DO_STCLASS_AND)
4178                         cl_and(data->start_class,
4179                                (struct regnode_charclass_class*)scan);
4180                     else
4181                         cl_or(pRExC_state, data->start_class,
4182                               (struct regnode_charclass_class*)scan);
4183                     break;
4184                 case ALNUM:
4185                     if (flags & SCF_DO_STCLASS_AND) {
4186                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4187                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4188                             if (OP(scan) == ALNUMU) {
4189                                 for (value = 0; value < 256; value++) {
4190                                     if (!isWORDCHAR_L1(value)) {
4191                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4192                                     }
4193                                 }
4194                             } else {
4195                                 for (value = 0; value < 256; value++) {
4196                                     if (!isALNUM(value)) {
4197                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4198                                     }
4199                                 }
4200                             }
4201                         }
4202                     }
4203                     else {
4204                         if (data->start_class->flags & ANYOF_LOCALE)
4205                             ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4206
4207                         /* Even if under locale, set the bits for non-locale
4208                          * in case it isn't a true locale-node.  This will
4209                          * create false positives if it truly is locale */
4210                         if (OP(scan) == ALNUMU) {
4211                             for (value = 0; value < 256; value++) {
4212                                 if (isWORDCHAR_L1(value)) {
4213                                     ANYOF_BITMAP_SET(data->start_class, value);
4214                                 }
4215                             }
4216                         } else {
4217                             for (value = 0; value < 256; value++) {
4218                                 if (isALNUM(value)) {
4219                                     ANYOF_BITMAP_SET(data->start_class, value);
4220                                 }
4221                             }
4222                         }
4223                     }
4224                     break;
4225                 case NALNUM:
4226                     if (flags & SCF_DO_STCLASS_AND) {
4227                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4228                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4229                             if (OP(scan) == NALNUMU) {
4230                                 for (value = 0; value < 256; value++) {
4231                                     if (isWORDCHAR_L1(value)) {
4232                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4233                                     }
4234                                 }
4235                             } else {
4236                                 for (value = 0; value < 256; value++) {
4237                                     if (isALNUM(value)) {
4238                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4239                                     }
4240                                 }
4241                             }
4242                         }
4243                     }
4244                     else {
4245                         if (data->start_class->flags & ANYOF_LOCALE)
4246                             ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4247
4248                         /* Even if under locale, set the bits for non-locale in
4249                          * case it isn't a true locale-node.  This will create
4250                          * false positives if it truly is locale */
4251                         if (OP(scan) == NALNUMU) {
4252                             for (value = 0; value < 256; value++) {
4253                                 if (! isWORDCHAR_L1(value)) {
4254                                     ANYOF_BITMAP_SET(data->start_class, value);
4255                                 }
4256                             }
4257                         } else {
4258                             for (value = 0; value < 256; value++) {
4259                                 if (! isALNUM(value)) {
4260                                     ANYOF_BITMAP_SET(data->start_class, value);
4261                                 }
4262                             }
4263                         }
4264                     }
4265                     break;
4266                 case SPACE:
4267                     if (flags & SCF_DO_STCLASS_AND) {
4268                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4269                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4270                             if (OP(scan) == SPACEU) {
4271                                 for (value = 0; value < 256; value++) {
4272                                     if (!isSPACE_L1(value)) {
4273                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4274                                     }
4275                                 }
4276                             } else {
4277                                 for (value = 0; value < 256; value++) {
4278                                     if (!isSPACE(value)) {
4279                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4280                                     }
4281                                 }
4282                             }
4283                         }
4284                     }
4285                     else {
4286                         if (data->start_class->flags & ANYOF_LOCALE) {
4287                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4288                         }
4289                         if (OP(scan) == SPACEU) {
4290                             for (value = 0; value < 256; value++) {
4291                                 if (isSPACE_L1(value)) {
4292                                     ANYOF_BITMAP_SET(data->start_class, value);
4293                                 }
4294                             }
4295                         } else {
4296                             for (value = 0; value < 256; value++) {
4297                                 if (isSPACE(value)) {
4298                                     ANYOF_BITMAP_SET(data->start_class, value);
4299                                 }
4300                             }
4301                         }
4302                     }
4303                     break;
4304                 case NSPACE:
4305                     if (flags & SCF_DO_STCLASS_AND) {
4306                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4307                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4308                             if (OP(scan) == NSPACEU) {
4309                                 for (value = 0; value < 256; value++) {
4310                                     if (isSPACE_L1(value)) {
4311                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4312                                     }
4313                                 }
4314                             } else {
4315                                 for (value = 0; value < 256; value++) {
4316                                     if (isSPACE(value)) {
4317                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4318                                     }
4319                                 }
4320                             }
4321                         }
4322                     }
4323                     else {
4324                         if (data->start_class->flags & ANYOF_LOCALE)
4325                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4326                         if (OP(scan) == NSPACEU) {
4327                             for (value = 0; value < 256; value++) {
4328                                 if (!isSPACE_L1(value)) {
4329                                     ANYOF_BITMAP_SET(data->start_class, value);
4330                                 }
4331                             }
4332                         }
4333                         else {
4334                             for (value = 0; value < 256; value++) {
4335                                 if (!isSPACE(value)) {
4336                                     ANYOF_BITMAP_SET(data->start_class, value);
4337                                 }
4338                             }
4339                         }
4340                     }
4341                     break;
4342                 case DIGIT:
4343                     if (flags & SCF_DO_STCLASS_AND) {
4344                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4345                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4346                             for (value = 0; value < 256; value++)
4347                                 if (!isDIGIT(value))
4348                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4349                         }
4350                     }
4351                     else {
4352                         if (data->start_class->flags & ANYOF_LOCALE)
4353                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4354                         for (value = 0; value < 256; value++)
4355                             if (isDIGIT(value))
4356                                 ANYOF_BITMAP_SET(data->start_class, value);
4357                     }
4358                     break;
4359                 case NDIGIT:
4360                     if (flags & SCF_DO_STCLASS_AND) {
4361                         if (!(data->start_class->flags & ANYOF_LOCALE))
4362                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4363                         for (value = 0; value < 256; value++)
4364                             if (isDIGIT(value))
4365                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4366                     }
4367                     else {
4368                         if (data->start_class->flags & ANYOF_LOCALE)
4369                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4370                         for (value = 0; value < 256; value++)
4371                             if (!isDIGIT(value))
4372                                 ANYOF_BITMAP_SET(data->start_class, value);
4373                     }
4374                     break;
4375                 CASE_SYNST_FNC(VERTWS);
4376                 CASE_SYNST_FNC(HORIZWS);
4377
4378                 }
4379                 if (flags & SCF_DO_STCLASS_OR)
4380                     cl_and(data->start_class, and_withp);
4381                 flags &= ~SCF_DO_STCLASS;
4382             }
4383         }
4384         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4385             data->flags |= (OP(scan) == MEOL
4386                             ? SF_BEFORE_MEOL
4387                             : SF_BEFORE_SEOL);
4388             SCAN_COMMIT(pRExC_state, data, minlenp);
4389
4390         }
4391         else if (  PL_regkind[OP(scan)] == BRANCHJ
4392                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4393                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4394                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4395             if ( OP(scan) == UNLESSM &&
4396                  scan->flags == 0 &&
4397                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4398                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4399             ) {
4400                 regnode *opt;
4401                 regnode *upto= regnext(scan);
4402                 DEBUG_PARSE_r({
4403                     SV * const mysv_val=sv_newmortal();
4404                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4405
4406                     /*DEBUG_PARSE_MSG("opfail");*/
4407                     regprop(RExC_rx, mysv_val, upto);
4408                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4409                                   SvPV_nolen_const(mysv_val),
4410                                   (IV)REG_NODE_NUM(upto),
4411                                   (IV)(upto - scan)
4412                     );
4413                 });
4414                 OP(scan) = OPFAIL;
4415                 NEXT_OFF(scan) = upto - scan;
4416                 for (opt= scan + 1; opt < upto ; opt++)
4417                     OP(opt) = OPTIMIZED;
4418                 scan= upto;
4419                 continue;
4420             }
4421             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4422                 || OP(scan) == UNLESSM )
4423             {
4424                 /* Negative Lookahead/lookbehind
4425                    In this case we can't do fixed string optimisation.
4426                 */
4427
4428                 I32 deltanext, minnext, fake = 0;
4429                 regnode *nscan;
4430                 struct regnode_charclass_class intrnl;
4431                 int f = 0;
4432
4433                 data_fake.flags = 0;
4434                 if (data) {
4435                     data_fake.whilem_c = data->whilem_c;
4436                     data_fake.last_closep = data->last_closep;
4437                 }
4438                 else
4439                     data_fake.last_closep = &fake;
4440                 data_fake.pos_delta = delta;
4441                 if ( flags & SCF_DO_STCLASS && !scan->flags
4442                      && OP(scan) == IFMATCH ) { /* Lookahead */
4443                     cl_init(pRExC_state, &intrnl);
4444                     data_fake.start_class = &intrnl;
4445                     f |= SCF_DO_STCLASS_AND;
4446                 }
4447                 if (flags & SCF_WHILEM_VISITED_POS)
4448                     f |= SCF_WHILEM_VISITED_POS;
4449                 next = regnext(scan);
4450                 nscan = NEXTOPER(NEXTOPER(scan));
4451                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4452                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4453                 if (scan->flags) {
4454                     if (deltanext) {
4455                         FAIL("Variable length lookbehind not implemented");
4456                     }
4457                     else if (minnext > (I32)U8_MAX) {
4458                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4459                     }
4460                     scan->flags = (U8)minnext;
4461                 }
4462                 if (data) {
4463                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4464                         pars++;
4465                     if (data_fake.flags & SF_HAS_EVAL)
4466                         data->flags |= SF_HAS_EVAL;
4467                     data->whilem_c = data_fake.whilem_c;
4468                 }
4469                 if (f & SCF_DO_STCLASS_AND) {
4470                     if (flags & SCF_DO_STCLASS_OR) {
4471                         /* OR before, AND after: ideally we would recurse with
4472                          * data_fake to get the AND applied by study of the
4473                          * remainder of the pattern, and then derecurse;
4474                          * *** HACK *** for now just treat as "no information".
4475                          * See [perl #56690].
4476                          */
4477                         cl_init(pRExC_state, data->start_class);
4478                     }  else {
4479                         /* AND before and after: combine and continue */
4480                         const int was = (data->start_class->flags & ANYOF_EOS);
4481
4482                         cl_and(data->start_class, &intrnl);
4483                         if (was)
4484                             data->start_class->flags |= ANYOF_EOS;
4485                     }
4486                 }
4487             }
4488 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4489             else {
4490                 /* Positive Lookahead/lookbehind
4491                    In this case we can do fixed string optimisation,
4492                    but we must be careful about it. Note in the case of
4493                    lookbehind the positions will be offset by the minimum
4494                    length of the pattern, something we won't know about
4495                    until after the recurse.
4496                 */
4497                 I32 deltanext, fake = 0;
4498                 regnode *nscan;
4499                 struct regnode_charclass_class intrnl;
4500                 int f = 0;
4501                 /* We use SAVEFREEPV so that when the full compile
4502                     is finished perl will clean up the allocated
4503                     minlens when it's all done. This way we don't
4504                     have to worry about freeing them when we know
4505                     they wont be used, which would be a pain.
4506                  */
4507                 I32 *minnextp;
4508                 Newx( minnextp, 1, I32 );
4509                 SAVEFREEPV(minnextp);
4510
4511                 if (data) {
4512                     StructCopy(data, &data_fake, scan_data_t);
4513                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4514                         f |= SCF_DO_SUBSTR;
4515                         if (scan->flags)
4516                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4517                         data_fake.last_found=newSVsv(data->last_found);
4518                     }
4519                 }
4520                 else
4521                     data_fake.last_closep = &fake;
4522                 data_fake.flags = 0;
4523                 data_fake.pos_delta = delta;
4524                 if (is_inf)
4525                     data_fake.flags |= SF_IS_INF;
4526                 if ( flags & SCF_DO_STCLASS && !scan->flags
4527                      && OP(scan) == IFMATCH ) { /* Lookahead */
4528                     cl_init(pRExC_state, &intrnl);
4529                     data_fake.start_class = &intrnl;
4530                     f |= SCF_DO_STCLASS_AND;
4531                 }
4532                 if (flags & SCF_WHILEM_VISITED_POS)
4533                     f |= SCF_WHILEM_VISITED_POS;
4534                 next = regnext(scan);
4535                 nscan = NEXTOPER(NEXTOPER(scan));
4536
4537                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4538                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4539                 if (scan->flags) {
4540                     if (deltanext) {
4541                         FAIL("Variable length lookbehind not implemented");
4542                     }
4543                     else if (*minnextp > (I32)U8_MAX) {
4544                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4545                     }
4546                     scan->flags = (U8)*minnextp;
4547                 }
4548
4549                 *minnextp += min;
4550
4551                 if (f & SCF_DO_STCLASS_AND) {
4552                     const int was = (data->start_class->flags & ANYOF_EOS);
4553
4554                     cl_and(data->start_class, &intrnl);
4555                     if (was)
4556                         data->start_class->flags |= ANYOF_EOS;
4557                 }
4558                 if (data) {
4559                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4560                         pars++;
4561                     if (data_fake.flags & SF_HAS_EVAL)
4562                         data->flags |= SF_HAS_EVAL;
4563                     data->whilem_c = data_fake.whilem_c;
4564                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4565                         if (RExC_rx->minlen<*minnextp)
4566                             RExC_rx->minlen=*minnextp;
4567                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4568                         SvREFCNT_dec(data_fake.last_found);
4569
4570                         if ( data_fake.minlen_fixed != minlenp )
4571                         {
4572                             data->offset_fixed= data_fake.offset_fixed;
4573                             data->minlen_fixed= data_fake.minlen_fixed;
4574                             data->lookbehind_fixed+= scan->flags;
4575                         }
4576                         if ( data_fake.minlen_float != minlenp )
4577                         {
4578                             data->minlen_float= data_fake.minlen_float;
4579                             data->offset_float_min=data_fake.offset_float_min;
4580                             data->offset_float_max=data_fake.offset_float_max;
4581                             data->lookbehind_float+= scan->flags;
4582                         }
4583                     }
4584                 }
4585             }
4586 #endif
4587         }
4588         else if (OP(scan) == OPEN) {
4589             if (stopparen != (I32)ARG(scan))
4590                 pars++;
4591         }
4592         else if (OP(scan) == CLOSE) {
4593             if (stopparen == (I32)ARG(scan)) {
4594                 break;
4595             }
4596             if ((I32)ARG(scan) == is_par) {
4597                 next = regnext(scan);
4598
4599                 if ( next && (OP(next) != WHILEM) && next < last)
4600                     is_par = 0;         /* Disable optimization */
4601             }
4602             if (data)
4603                 *(data->last_closep) = ARG(scan);
4604         }
4605         else if (OP(scan) == EVAL) {
4606                 if (data)
4607                     data->flags |= SF_HAS_EVAL;
4608         }
4609         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4610             if (flags & SCF_DO_SUBSTR) {
4611                 SCAN_COMMIT(pRExC_state,data,minlenp);
4612                 flags &= ~SCF_DO_SUBSTR;
4613             }
4614             if (data && OP(scan)==ACCEPT) {
4615                 data->flags |= SCF_SEEN_ACCEPT;
4616                 if (stopmin > min)
4617                     stopmin = min;
4618             }
4619         }
4620         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4621         {
4622                 if (flags & SCF_DO_SUBSTR) {
4623                     SCAN_COMMIT(pRExC_state,data,minlenp);
4624                     data->longest = &(data->longest_float);
4625                 }
4626                 is_inf = is_inf_internal = 1;
4627                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4628                     cl_anything(pRExC_state, data->start_class);
4629                 flags &= ~SCF_DO_STCLASS;
4630         }
4631         else if (OP(scan) == GPOS) {
4632             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4633                 !(delta || is_inf || (data && data->pos_delta)))
4634             {
4635                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4636                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4637                 if (RExC_rx->gofs < (U32)min)
4638                     RExC_rx->gofs = min;
4639             } else {
4640                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4641                 RExC_rx->gofs = 0;
4642             }
4643         }
4644 #ifdef TRIE_STUDY_OPT
4645 #ifdef FULL_TRIE_STUDY
4646         else if (PL_regkind[OP(scan)] == TRIE) {
4647             /* NOTE - There is similar code to this block above for handling
4648                BRANCH nodes on the initial study.  If you change stuff here
4649                check there too. */
4650             regnode *trie_node= scan;
4651             regnode *tail= regnext(scan);
4652             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4653             I32 max1 = 0, min1 = I32_MAX;
4654             struct regnode_charclass_class accum;
4655
4656             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4657                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4658             if (flags & SCF_DO_STCLASS)
4659                 cl_init_zero(pRExC_state, &accum);
4660
4661             if (!trie->jump) {
4662                 min1= trie->minlen;
4663                 max1= trie->maxlen;
4664             } else {
4665                 const regnode *nextbranch= NULL;
4666                 U32 word;
4667
4668                 for ( word=1 ; word <= trie->wordcount ; word++)
4669                 {
4670                     I32 deltanext=0, minnext=0, f = 0, fake;
4671                     struct regnode_charclass_class this_class;
4672
4673                     data_fake.flags = 0;
4674                     if (data) {
4675                         data_fake.whilem_c = data->whilem_c;
4676                         data_fake.last_closep = data->last_closep;
4677                     }
4678                     else
4679                         data_fake.last_closep = &fake;
4680                     data_fake.pos_delta = delta;
4681                     if (flags & SCF_DO_STCLASS) {
4682                         cl_init(pRExC_state, &this_class);
4683                         data_fake.start_class = &this_class;
4684                         f = SCF_DO_STCLASS_AND;
4685                     }
4686                     if (flags & SCF_WHILEM_VISITED_POS)
4687                         f |= SCF_WHILEM_VISITED_POS;
4688
4689                     if (trie->jump[word]) {
4690                         if (!nextbranch)
4691                             nextbranch = trie_node + trie->jump[0];
4692                         scan= trie_node + trie->jump[word];
4693                         /* We go from the jump point to the branch that follows
4694                            it. Note this means we need the vestigal unused branches
4695                            even though they arent otherwise used.
4696                          */
4697                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4698                             &deltanext, (regnode *)nextbranch, &data_fake,
4699                             stopparen, recursed, NULL, f,depth+1);
4700                     }
4701                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4702                         nextbranch= regnext((regnode*)nextbranch);
4703
4704                     if (min1 > (I32)(minnext + trie->minlen))
4705                         min1 = minnext + trie->minlen;
4706                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4707                         max1 = minnext + deltanext + trie->maxlen;
4708                     if (deltanext == I32_MAX)
4709                         is_inf = is_inf_internal = 1;
4710
4711                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4712                         pars++;
4713                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4714                         if ( stopmin > min + min1)
4715                             stopmin = min + min1;
4716                         flags &= ~SCF_DO_SUBSTR;
4717                         if (data)
4718                             data->flags |= SCF_SEEN_ACCEPT;
4719                     }
4720                     if (data) {
4721                         if (data_fake.flags & SF_HAS_EVAL)
4722                             data->flags |= SF_HAS_EVAL;
4723                         data->whilem_c = data_fake.whilem_c;
4724                     }
4725                     if (flags & SCF_DO_STCLASS)
4726                         cl_or(pRExC_state, &accum, &this_class);
4727                 }
4728             }
4729             if (flags & SCF_DO_SUBSTR) {
4730                 data->pos_min += min1;
4731                 data->pos_delta += max1 - min1;
4732                 if (max1 != min1 || is_inf)
4733                     data->longest = &(data->longest_float);
4734             }
4735             min += min1;
4736             delta += max1 - min1;
4737             if (flags & SCF_DO_STCLASS_OR) {
4738                 cl_or(pRExC_state, data->start_class, &accum);
4739                 if (min1) {
4740                     cl_and(data->start_class, and_withp);
4741                     flags &= ~SCF_DO_STCLASS;
4742                 }
4743             }
4744             else if (flags & SCF_DO_STCLASS_AND) {
4745                 if (min1) {
4746                     cl_and(data->start_class, &accum);
4747                     flags &= ~SCF_DO_STCLASS;
4748                 }
4749                 else {
4750                     /* Switch to OR mode: cache the old value of
4751                      * data->start_class */
4752                     INIT_AND_WITHP;
4753                     StructCopy(data->start_class, and_withp,
4754                                struct regnode_charclass_class);
4755                     flags &= ~SCF_DO_STCLASS_AND;
4756                     StructCopy(&accum, data->start_class,
4757                                struct regnode_charclass_class);
4758                     flags |= SCF_DO_STCLASS_OR;
4759                     data->start_class->flags |= ANYOF_EOS;
4760                 }
4761             }
4762             scan= tail;
4763             continue;
4764         }
4765 #else
4766         else if (PL_regkind[OP(scan)] == TRIE) {
4767             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4768             U8*bang=NULL;
4769
4770             min += trie->minlen;
4771             delta += (trie->maxlen - trie->minlen);
4772             flags &= ~SCF_DO_STCLASS; /* xxx */
4773             if (flags & SCF_DO_SUBSTR) {
4774                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4775                 data->pos_min += trie->minlen;
4776                 data->pos_delta += (trie->maxlen - trie->minlen);
4777                 if (trie->maxlen != trie->minlen)
4778                     data->longest = &(data->longest_float);
4779             }
4780             if (trie->jump) /* no more substrings -- for now /grr*/
4781                 flags &= ~SCF_DO_SUBSTR;
4782         }
4783 #endif /* old or new */
4784 #endif /* TRIE_STUDY_OPT */
4785
4786         /* Else: zero-length, ignore. */
4787         scan = regnext(scan);
4788     }
4789     if (frame) {
4790         last = frame->last;
4791         scan = frame->next;
4792         stopparen = frame->stop;
4793         frame = frame->prev;
4794         goto fake_study_recurse;
4795     }
4796
4797   finish:
4798     assert(!frame);
4799     DEBUG_STUDYDATA("pre-fin:",data,depth);
4800
4801     *scanp = scan;
4802     *deltap = is_inf_internal ? I32_MAX : delta;
4803     if (flags & SCF_DO_SUBSTR && is_inf)
4804         data->pos_delta = I32_MAX - data->pos_min;
4805     if (is_par > (I32)U8_MAX)
4806         is_par = 0;
4807     if (is_par && pars==1 && data) {
4808         data->flags |= SF_IN_PAR;
4809         data->flags &= ~SF_HAS_PAR;
4810     }
4811     else if (pars && data) {
4812         data->flags |= SF_HAS_PAR;
4813         data->flags &= ~SF_IN_PAR;
4814     }
4815     if (flags & SCF_DO_STCLASS_OR)
4816         cl_and(data->start_class, and_withp);
4817     if (flags & SCF_TRIE_RESTUDY)
4818         data->flags |=  SCF_TRIE_RESTUDY;
4819
4820     DEBUG_STUDYDATA("post-fin:",data,depth);
4821
4822     return min < stopmin ? min : stopmin;
4823 }
4824
4825 STATIC U32
4826 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4827 {
4828     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4829
4830     PERL_ARGS_ASSERT_ADD_DATA;
4831
4832     Renewc(RExC_rxi->data,
4833            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4834            char, struct reg_data);
4835     if(count)
4836         Renew(RExC_rxi->data->what, count + n, U8);
4837     else
4838         Newx(RExC_rxi->data->what, n, U8);
4839     RExC_rxi->data->count = count + n;
4840     Copy(s, RExC_rxi->data->what + count, n, U8);
4841     return count;
4842 }
4843
4844 /*XXX: todo make this not included in a non debugging perl */
4845 #ifndef PERL_IN_XSUB_RE
4846 void
4847 Perl_reginitcolors(pTHX)
4848 {
4849     dVAR;
4850     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4851     if (s) {
4852         char *t = savepv(s);
4853         int i = 0;
4854         PL_colors[0] = t;
4855         while (++i < 6) {
4856             t = strchr(t, '\t');
4857             if (t) {
4858                 *t = '\0';
4859                 PL_colors[i] = ++t;
4860             }
4861             else
4862                 PL_colors[i] = t = (char *)"";
4863         }
4864     } else {
4865         int i = 0;
4866         while (i < 6)
4867             PL_colors[i++] = (char *)"";
4868     }
4869     PL_colorset = 1;
4870 }
4871 #endif
4872
4873
4874 #ifdef TRIE_STUDY_OPT
4875 #define CHECK_RESTUDY_GOTO                                  \
4876         if (                                                \
4877               (data.flags & SCF_TRIE_RESTUDY)               \
4878               && ! restudied++                              \
4879         )     goto reStudy
4880 #else
4881 #define CHECK_RESTUDY_GOTO
4882 #endif
4883
4884 /*
4885  * pregcomp - compile a regular expression into internal code
4886  *
4887  * Decides which engine's compiler to call based on the hint currently in
4888  * scope
4889  */
4890
4891 #ifndef PERL_IN_XSUB_RE
4892
4893 /* return the currently in-scope regex engine (or the default if none)  */
4894
4895 regexp_engine const *
4896 Perl_current_re_engine(pTHX)
4897 {
4898     dVAR;
4899
4900     if (IN_PERL_COMPILETIME) {
4901         HV * const table = GvHV(PL_hintgv);
4902         SV **ptr;
4903
4904         if (!table)
4905             return &PL_core_reg_engine;
4906         ptr = hv_fetchs(table, "regcomp", FALSE);
4907         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4908             return &PL_core_reg_engine;
4909         return INT2PTR(regexp_engine*,SvIV(*ptr));
4910     }
4911     else {
4912         SV *ptr;
4913         if (!PL_curcop->cop_hints_hash)
4914             return &PL_core_reg_engine;
4915         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4916         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4917             return &PL_core_reg_engine;
4918         return INT2PTR(regexp_engine*,SvIV(ptr));
4919     }
4920 }
4921
4922
4923 REGEXP *
4924 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4925 {
4926     dVAR;
4927     regexp_engine const *eng = current_re_engine();
4928     GET_RE_DEBUG_FLAGS_DECL;
4929
4930     PERL_ARGS_ASSERT_PREGCOMP;
4931
4932     /* Dispatch a request to compile a regexp to correct regexp engine. */
4933     DEBUG_COMPILE_r({
4934         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4935                         PTR2UV(eng));
4936     });
4937     return CALLREGCOMP_ENG(eng, pattern, flags);
4938 }
4939 #endif
4940
4941 /* public(ish) entry point for the perl core's own regex compiling code.
4942  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4943  * pattern rather than a list of OPs, and uses the internal engine rather
4944  * than the current one */
4945
4946 REGEXP *
4947 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4948 {
4949     SV *pat = pattern; /* defeat constness! */
4950     PERL_ARGS_ASSERT_RE_COMPILE;
4951     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4952 #ifdef PERL_IN_XSUB_RE
4953                                 &my_reg_engine,
4954 #else
4955                                 &PL_core_reg_engine,
4956 #endif
4957                                 NULL, NULL, rx_flags, 0);
4958 }
4959
4960 /* see if there are any run-time code blocks in the pattern.
4961  * False positives are allowed */
4962
4963 static bool
4964 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4965                     U32 pm_flags, char *pat, STRLEN plen)
4966 {
4967     int n = 0;
4968     STRLEN s;
4969
4970     /* avoid infinitely recursing when we recompile the pattern parcelled up
4971      * as qr'...'. A single constant qr// string can't have have any
4972      * run-time component in it, and thus, no runtime code. (A non-qr
4973      * string, however, can, e.g. $x =~ '(?{})') */
4974     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4975         return 0;
4976
4977     for (s = 0; s < plen; s++) {
4978         if (n < pRExC_state->num_code_blocks
4979             && s == pRExC_state->code_blocks[n].start)
4980         {
4981             s = pRExC_state->code_blocks[n].end;
4982             n++;
4983             continue;
4984         }
4985         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4986          * positives here */
4987         if (pat[s] == '(' && pat[s+1] == '?' &&
4988             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4989         )
4990             return 1;
4991     }
4992     return 0;
4993 }
4994
4995 /* Handle run-time code blocks. We will already have compiled any direct
4996  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4997  * copy of it, but with any literal code blocks blanked out and
4998  * appropriate chars escaped; then feed it into
4999  *
5000  *    eval "qr'modified_pattern'"
5001  *
5002  * For example,
5003  *
5004  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5005  *
5006  * becomes
5007  *
5008  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5009  *
5010  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5011  * and merge them with any code blocks of the original regexp.
5012  *
5013  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5014  * instead, just save the qr and return FALSE; this tells our caller that
5015  * the original pattern needs upgrading to utf8.
5016  */
5017
5018 static bool
5019 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5020     char *pat, STRLEN plen)
5021 {
5022     SV *qr;
5023
5024     GET_RE_DEBUG_FLAGS_DECL;
5025
5026     if (pRExC_state->runtime_code_qr) {
5027         /* this is the second time we've been called; this should
5028          * only happen if the main pattern got upgraded to utf8
5029          * during compilation; re-use the qr we compiled first time
5030          * round (which should be utf8 too)
5031          */
5032         qr = pRExC_state->runtime_code_qr;
5033         pRExC_state->runtime_code_qr = NULL;
5034         assert(RExC_utf8 && SvUTF8(qr));
5035     }
5036     else {
5037         int n = 0;
5038         STRLEN s;
5039         char *p, *newpat;
5040         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5041         SV *sv, *qr_ref;
5042         dSP;
5043
5044         /* determine how many extra chars we need for ' and \ escaping */
5045         for (s = 0; s < plen; s++) {
5046             if (pat[s] == '\'' || pat[s] == '\\')
5047                 newlen++;
5048         }
5049
5050         Newx(newpat, newlen, char);
5051         p = newpat;
5052         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5053
5054         for (s = 0; s < plen; s++) {
5055             if (n < pRExC_state->num_code_blocks
5056                 && s == pRExC_state->code_blocks[n].start)
5057             {
5058                 /* blank out literal code block */
5059                 assert(pat[s] == '(');
5060                 while (s <= pRExC_state->code_blocks[n].end) {
5061                     *p++ = ' ';
5062                     s++;
5063                 }
5064                 s--;
5065                 n++;
5066                 continue;
5067             }
5068             if (pat[s] == '\'' || pat[s] == '\\')
5069                 *p++ = '\\';
5070             *p++ = pat[s];
5071         }
5072         *p++ = '\'';
5073         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5074             *p++ = 'x';
5075         *p++ = '\0';
5076         DEBUG_COMPILE_r({
5077             PerlIO_printf(Perl_debug_log,
5078                 "%sre-parsing pattern for runtime code:%s %s\n",
5079                 PL_colors[4],PL_colors[5],newpat);
5080         });
5081
5082         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5083         Safefree(newpat);
5084
5085         ENTER;
5086         SAVETMPS;
5087         save_re_context();
5088         PUSHSTACKi(PERLSI_REQUIRE);
5089         /* this causes the toker to collapse \\ into \ when parsing
5090          * qr''; normally only q'' does this. It also alters hints
5091          * handling */
5092         PL_reg_state.re_reparsing = TRUE;
5093         eval_sv(sv, G_SCALAR);
5094         SvREFCNT_dec(sv);
5095         SPAGAIN;
5096         qr_ref = POPs;
5097         PUTBACK;
5098         if (SvTRUE(ERRSV))
5099             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5100         assert(SvROK(qr_ref));
5101         qr = SvRV(qr_ref);
5102         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5103         /* the leaving below frees the tmp qr_ref.
5104          * Give qr a life of its own */
5105         SvREFCNT_inc(qr);
5106         POPSTACK;
5107         FREETMPS;
5108         LEAVE;
5109
5110     }
5111
5112     if (!RExC_utf8 && SvUTF8(qr)) {
5113         /* first time through; the pattern got upgraded; save the
5114          * qr for the next time through */
5115         assert(!pRExC_state->runtime_code_qr);
5116         pRExC_state->runtime_code_qr = qr;
5117         return 0;
5118     }
5119
5120
5121     /* extract any code blocks within the returned qr//  */
5122
5123
5124     /* merge the main (r1) and run-time (r2) code blocks into one */
5125     {
5126         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5127         struct reg_code_block *new_block, *dst;
5128         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5129         int i1 = 0, i2 = 0;
5130
5131         if (!r2->num_code_blocks) /* we guessed wrong */
5132             return 1;
5133
5134         Newx(new_block,
5135             r1->num_code_blocks + r2->num_code_blocks,
5136             struct reg_code_block);
5137         dst = new_block;
5138
5139         while (    i1 < r1->num_code_blocks
5140                 || i2 < r2->num_code_blocks)
5141         {
5142             struct reg_code_block *src;
5143             bool is_qr = 0;
5144
5145             if (i1 == r1->num_code_blocks) {
5146                 src = &r2->code_blocks[i2++];
5147                 is_qr = 1;
5148             }
5149             else if (i2 == r2->num_code_blocks)
5150                 src = &r1->code_blocks[i1++];
5151             else if (  r1->code_blocks[i1].start
5152                      < r2->code_blocks[i2].start)
5153             {
5154                 src = &r1->code_blocks[i1++];
5155                 assert(src->end < r2->code_blocks[i2].start);
5156             }
5157             else {
5158                 assert(  r1->code_blocks[i1].start
5159                        > r2->code_blocks[i2].start);
5160                 src = &r2->code_blocks[i2++];
5161                 is_qr = 1;
5162                 assert(src->end < r1->code_blocks[i1].start);
5163             }
5164
5165             assert(pat[src->start] == '(');
5166             assert(pat[src->end]   == ')');
5167             dst->start      = src->start;
5168             dst->end        = src->end;
5169             dst->block      = src->block;
5170             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5171                                     : src->src_regex;
5172             dst++;
5173         }
5174         r1->num_code_blocks += r2->num_code_blocks;
5175         Safefree(r1->code_blocks);
5176         r1->code_blocks = new_block;
5177     }
5178
5179     SvREFCNT_dec(qr);
5180     return 1;
5181 }
5182
5183
5184 STATIC bool
5185 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5186 {
5187     /* This is the common code for setting up the floating and fixed length
5188      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5189      * as to whether succeeded or not */
5190
5191     I32 t,ml;
5192
5193     if (! (longest_length
5194            || (eol /* Can't have SEOL and MULTI */
5195                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5196           )
5197             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5198         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5199     {
5200         return FALSE;
5201     }
5202
5203     /* copy the information about the longest from the reg_scan_data
5204         over to the program. */
5205     if (SvUTF8(sv_longest)) {
5206         *rx_utf8 = sv_longest;
5207         *rx_substr = NULL;
5208     } else {
5209         *rx_substr = sv_longest;
5210         *rx_utf8 = NULL;
5211     }
5212     /* end_shift is how many chars that must be matched that
5213         follow this item. We calculate it ahead of time as once the
5214         lookbehind offset is added in we lose the ability to correctly
5215         calculate it.*/
5216     ml = minlen ? *(minlen) : (I32)longest_length;
5217     *rx_end_shift = ml - offset
5218         - longest_length + (SvTAIL(sv_longest) != 0)
5219         + lookbehind;
5220
5221     t = (eol/* Can't have SEOL and MULTI */
5222          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5223     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5224
5225     return TRUE;
5226 }
5227
5228 /*
5229  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5230  * regular expression into internal code.
5231  * The pattern may be passed either as:
5232  *    a list of SVs (patternp plus pat_count)
5233  *    a list of OPs (expr)
5234  * If both are passed, the SV list is used, but the OP list indicates
5235  * which SVs are actually pre-compiled code blocks
5236  *
5237  * The SVs in the list have magic and qr overloading applied to them (and
5238  * the list may be modified in-place with replacement SVs in the latter
5239  * case).
5240  *
5241  * If the pattern hasn't changed from old_re, then old_re will be
5242  * returned.
5243  *
5244  * eng is the current engine. If that engine has an op_comp method, then
5245  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5246  * do the initial concatenation of arguments and pass on to the external
5247  * engine.
5248  *
5249  * If is_bare_re is not null, set it to a boolean indicating whether the
5250  * arg list reduced (after overloading) to a single bare regex which has
5251  * been returned (i.e. /$qr/).
5252  *
5253  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5254  *
5255  * pm_flags contains the PMf_* flags, typically based on those from the
5256  * pm_flags field of the related PMOP. Currently we're only interested in
5257  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5258  *
5259  * We can't allocate space until we know how big the compiled form will be,
5260  * but we can't compile it (and thus know how big it is) until we've got a
5261  * place to put the code.  So we cheat:  we compile it twice, once with code
5262  * generation turned off and size counting turned on, and once "for real".
5263  * This also means that we don't allocate space until we are sure that the
5264  * thing really will compile successfully, and we never have to move the
5265  * code and thus invalidate pointers into it.  (Note that it has to be in
5266  * one piece because free() must be able to free it all.) [NB: not true in perl]
5267  *
5268  * Beware that the optimization-preparation code in here knows about some
5269  * of the structure of the compiled regexp.  [I'll say.]
5270  */
5271
5272 REGEXP *
5273 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5274                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5275                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5276 {
5277     dVAR;
5278     REGEXP *rx;
5279     struct regexp *r;
5280     regexp_internal *ri;
5281     STRLEN plen;
5282     char  * VOL exp;
5283     char* xend;
5284     regnode *scan;
5285     I32 flags;
5286     I32 minlen = 0;
5287     U32 rx_flags;
5288     SV * VOL pat;
5289
5290     /* these are all flags - maybe they should be turned
5291      * into a single int with different bit masks */
5292     I32 sawlookahead = 0;
5293     I32 sawplus = 0;
5294     I32 sawopen = 0;
5295     bool used_setjump = FALSE;
5296     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5297     bool code_is_utf8 = 0;
5298     bool VOL recompile = 0;
5299     bool runtime_code = 0;
5300     U8 jump_ret = 0;
5301     dJMPENV;
5302     scan_data_t data;
5303     RExC_state_t RExC_state;
5304     RExC_state_t * const pRExC_state = &RExC_state;
5305 #ifdef TRIE_STUDY_OPT
5306     int restudied;
5307     RExC_state_t copyRExC_state;
5308 #endif
5309     GET_RE_DEBUG_FLAGS_DECL;
5310
5311     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5312
5313     DEBUG_r(if (!PL_colorset) reginitcolors());
5314
5315 #ifndef PERL_IN_XSUB_RE
5316     /* Initialize these here instead of as-needed, as is quick and avoids
5317      * having to test them each time otherwise */
5318     if (! PL_AboveLatin1) {
5319         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5320         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5321         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5322
5323         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5324         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5325
5326         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5327         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5328
5329         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5330         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5331
5332         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5333
5334         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5335         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5336
5337         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5338
5339         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5340         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5341
5342         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5343         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5344
5345         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5346         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5347
5348         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5349         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5350
5351         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5352         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5353
5354         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5355         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5356
5357         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5358         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5359
5360         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5361
5362         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5363         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5364
5365         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5366         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5367
5368         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5369     }
5370 #endif
5371
5372     pRExC_state->code_blocks = NULL;
5373     pRExC_state->num_code_blocks = 0;
5374
5375     if (is_bare_re)
5376         *is_bare_re = FALSE;
5377
5378     if (expr && (expr->op_type == OP_LIST ||
5379                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5380
5381         /* is the source UTF8, and how many code blocks are there? */
5382         OP *o;
5383         int ncode = 0;
5384
5385         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5386             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5387                 code_is_utf8 = 1;
5388             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5389                 /* count of DO blocks */
5390                 ncode++;
5391         }
5392         if (ncode) {
5393             pRExC_state->num_code_blocks = ncode;
5394             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5395         }
5396     }
5397
5398     if (pat_count) {
5399         /* handle a list of SVs */
5400
5401         SV **svp;
5402
5403         /* apply magic and RE overloading to each arg */
5404         for (svp = patternp; svp < patternp + pat_count; svp++) {
5405             SV *rx = *svp;
5406             SvGETMAGIC(rx);
5407             if (SvROK(rx) && SvAMAGIC(rx)) {
5408                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5409                 if (sv) {
5410                     if (SvROK(sv))
5411                         sv = SvRV(sv);
5412                     if (SvTYPE(sv) != SVt_REGEXP)
5413                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5414                     *svp = sv;
5415                 }
5416             }
5417         }
5418
5419         if (pat_count > 1) {
5420             /* concat multiple args and find any code block indexes */
5421
5422             OP *o = NULL;
5423             int n = 0;
5424             bool utf8 = 0;
5425             STRLEN orig_patlen = 0;
5426
5427             if (pRExC_state->num_code_blocks) {
5428                 o = cLISTOPx(expr)->op_first;
5429                 assert(o->op_type == OP_PUSHMARK);
5430                 o = o->op_sibling;
5431             }
5432
5433             pat = newSVpvn("", 0);
5434             SAVEFREESV(pat);
5435
5436             /* determine if the pattern is going to be utf8 (needed
5437              * in advance to align code block indices correctly).
5438              * XXX This could fail to be detected for an arg with
5439              * overloading but not concat overloading; but the main effect
5440              * in this obscure case is to need a 'use re eval' for a
5441              * literal code block */
5442             for (svp = patternp; svp < patternp + pat_count; svp++) {
5443                 if (SvUTF8(*svp))
5444                     utf8 = 1;
5445             }
5446             if (utf8)
5447                 SvUTF8_on(pat);
5448
5449             for (svp = patternp; svp < patternp + pat_count; svp++) {
5450                 SV *sv, *msv = *svp;
5451                 SV *rx;
5452                 bool code = 0;
5453                 if (o) {
5454                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5455                         assert(n < pRExC_state->num_code_blocks);
5456                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5457                         pRExC_state->code_blocks[n].block = o;
5458                         pRExC_state->code_blocks[n].src_regex = NULL;
5459                         n++;
5460                         code = 1;
5461                         o = o->op_sibling; /* skip CONST */
5462                         assert(o);
5463                     }
5464                     o = o->op_sibling;;
5465                 }
5466
5467                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5468                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5469                 {
5470                     sv_setsv(pat, sv);
5471                     /* overloading involved: all bets are off over literal
5472                      * code. Pretend we haven't seen it */
5473                     pRExC_state->num_code_blocks -= n;
5474                     n = 0;
5475                     rx = NULL;
5476
5477                 }
5478                 else  {
5479                     while (SvAMAGIC(msv)
5480                             && (sv = AMG_CALLunary(msv, string_amg))
5481                             && sv != msv
5482                             &&  !(   SvROK(msv)
5483                                   && SvROK(sv)
5484                                   && SvRV(msv) == SvRV(sv))
5485                     ) {
5486                         msv = sv;
5487                         SvGETMAGIC(msv);
5488                     }
5489                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5490                         msv = SvRV(msv);
5491                     orig_patlen = SvCUR(pat);
5492                     sv_catsv_nomg(pat, msv);
5493                     rx = msv;
5494                     if (code)
5495                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5496                 }
5497
5498                 /* extract any code blocks within any embedded qr//'s */
5499                 if (rx && SvTYPE(rx) == SVt_REGEXP
5500                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5501                 {
5502
5503                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5504                     if (ri->num_code_blocks) {
5505                         int i;
5506                         /* the presence of an embedded qr// with code means
5507                          * we should always recompile: the text of the
5508                          * qr// may not have changed, but it may be a
5509                          * different closure than last time */
5510                         recompile = 1;
5511                         Renew(pRExC_state->code_blocks,
5512                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5513                             struct reg_code_block);
5514                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5515                         for (i=0; i < ri->num_code_blocks; i++) {
5516                             struct reg_code_block *src, *dst;
5517                             STRLEN offset =  orig_patlen
5518                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5519                             assert(n < pRExC_state->num_code_blocks);
5520                             src = &ri->code_blocks[i];
5521                             dst = &pRExC_state->code_blocks[n];
5522                             dst->start      = src->start + offset;
5523                             dst->end        = src->end   + offset;
5524                             dst->block      = src->block;
5525                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5526                                                     src->src_regex
5527                                                         ? src->src_regex
5528                                                         : (REGEXP*)rx);
5529                             n++;
5530                         }
5531                     }
5532                 }
5533             }
5534             SvSETMAGIC(pat);
5535         }
5536         else {
5537             SV *sv;
5538             pat = *patternp;
5539             while (SvAMAGIC(pat)
5540                     && (sv = AMG_CALLunary(pat, string_amg))
5541                     && sv != pat)
5542             {
5543                 pat = sv;
5544                 SvGETMAGIC(pat);
5545             }
5546         }
5547
5548         /* handle bare regex: foo =~ $re */
5549         {
5550             SV *re = pat;
5551             if (SvROK(re))
5552                 re = SvRV(re);
5553             if (SvTYPE(re) == SVt_REGEXP) {
5554                 if (is_bare_re)
5555                     *is_bare_re = TRUE;
5556                 SvREFCNT_inc(re);
5557                 Safefree(pRExC_state->code_blocks);
5558                 return (REGEXP*)re;
5559             }
5560         }
5561     }
5562     else {
5563         /* not a list of SVs, so must be a list of OPs */
5564         assert(expr);
5565         if (expr->op_type == OP_LIST) {
5566             int i = -1;
5567             bool is_code = 0;
5568             OP *o;
5569
5570             pat = newSVpvn("", 0);
5571             SAVEFREESV(pat);
5572             if (code_is_utf8)
5573                 SvUTF8_on(pat);
5574
5575             /* given a list of CONSTs and DO blocks in expr, append all
5576              * the CONSTs to pat, and record the start and end of each
5577              * code block in code_blocks[] (each DO{} op is followed by an
5578              * OP_CONST containing the corresponding literal '(?{...})
5579              * text)
5580              */
5581             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5582                 if (o->op_type == OP_CONST) {
5583                     sv_catsv(pat, cSVOPo_sv);
5584                     if (is_code) {
5585                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5586                         is_code = 0;
5587                     }
5588                 }
5589                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5590                     assert(i+1 < pRExC_state->num_code_blocks);
5591                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5592                     pRExC_state->code_blocks[i].block = o;
5593                     pRExC_state->code_blocks[i].src_regex = NULL;
5594                     is_code = 1;
5595                 }
5596             }
5597         }
5598         else {
5599             assert(expr->op_type == OP_CONST);
5600             pat = cSVOPx_sv(expr);
5601         }
5602     }
5603
5604     exp = SvPV_nomg(pat, plen);
5605
5606     if (!eng->op_comp) {
5607         if ((SvUTF8(pat) && IN_BYTES)
5608                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5609         {
5610             /* make a temporary copy; either to convert to bytes,
5611              * or to avoid repeating get-magic / overloaded stringify */
5612             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5613                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5614         }
5615         Safefree(pRExC_state->code_blocks);
5616         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5617     }
5618
5619     /* ignore the utf8ness if the pattern is 0 length */
5620     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5621     RExC_uni_semantics = 0;
5622     RExC_contains_locale = 0;
5623     pRExC_state->runtime_code_qr = NULL;
5624
5625     /****************** LONG JUMP TARGET HERE***********************/
5626     /* Longjmp back to here if have to switch in midstream to utf8 */
5627     if (! RExC_orig_utf8) {
5628         JMPENV_PUSH(jump_ret);
5629         used_setjump = TRUE;
5630     }
5631
5632     if (jump_ret == 0) {    /* First time through */
5633         xend = exp + plen;
5634
5635         DEBUG_COMPILE_r({
5636             SV *dsv= sv_newmortal();
5637             RE_PV_QUOTED_DECL(s, RExC_utf8,
5638                 dsv, exp, plen, 60);
5639             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5640                            PL_colors[4],PL_colors[5],s);
5641         });
5642     }
5643     else {  /* longjumped back */
5644         U8 *src, *dst;
5645         int n=0;
5646         STRLEN s = 0, d = 0;
5647         bool do_end = 0;
5648
5649         /* If the cause for the longjmp was other than changing to utf8, pop
5650          * our own setjmp, and longjmp to the correct handler */
5651         if (jump_ret != UTF8_LONGJMP) {
5652             JMPENV_POP;
5653             JMPENV_JUMP(jump_ret);
5654         }
5655
5656         GET_RE_DEBUG_FLAGS;
5657
5658         /* It's possible to write a regexp in ascii that represents Unicode
5659         codepoints outside of the byte range, such as via \x{100}. If we
5660         detect such a sequence we have to convert the entire pattern to utf8
5661         and then recompile, as our sizing calculation will have been based
5662         on 1 byte == 1 character, but we will need to use utf8 to encode
5663         at least some part of the pattern, and therefore must convert the whole
5664         thing.
5665         -- dmq */
5666         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5667             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5668
5669         /* upgrade pattern to UTF8, and if there are code blocks,
5670          * recalculate the indices.
5671          * This is essentially an unrolled Perl_bytes_to_utf8() */
5672
5673         src = (U8*)SvPV_nomg(pat, plen);
5674         Newx(dst, plen * 2 + 1, U8);
5675
5676         while (s < plen) {
5677             const UV uv = NATIVE_TO_ASCII(src[s]);
5678             if (UNI_IS_INVARIANT(uv))
5679                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5680             else {
5681                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5682                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5683             }
5684             if (n < pRExC_state->num_code_blocks) {
5685                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5686                     pRExC_state->code_blocks[n].start = d;
5687                     assert(dst[d] == '(');
5688                     do_end = 1;
5689                 }
5690                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5691                     pRExC_state->code_blocks[n].end = d;
5692                     assert(dst[d] == ')');
5693                     do_end = 0;
5694                     n++;
5695                 }
5696             }
5697             s++;
5698             d++;
5699         }
5700         dst[d] = '\0';
5701         plen = d;
5702         exp = (char*) dst;
5703         xend = exp + plen;
5704         SAVEFREEPV(exp);
5705         RExC_orig_utf8 = RExC_utf8 = 1;
5706     }
5707
5708     /* return old regex if pattern hasn't changed */
5709
5710     if (   old_re
5711         && !recompile
5712         && !!RX_UTF8(old_re) == !!RExC_utf8
5713         && RX_PRECOMP(old_re)
5714         && RX_PRELEN(old_re) == plen
5715         && memEQ(RX_PRECOMP(old_re), exp, plen))
5716     {
5717         /* with runtime code, always recompile */
5718         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5719                                             exp, plen);
5720         if (!runtime_code) {
5721             if (used_setjump) {
5722                 JMPENV_POP;
5723             }
5724             Safefree(pRExC_state->code_blocks);
5725             return old_re;
5726         }
5727     }
5728     else if ((pm_flags & PMf_USE_RE_EVAL)
5729                 /* this second condition covers the non-regex literal case,
5730                  * i.e.  $foo =~ '(?{})'. */
5731                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5732                     && (PL_hints & HINT_RE_EVAL))
5733     )
5734         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5735                             exp, plen);
5736
5737 #ifdef TRIE_STUDY_OPT
5738     restudied = 0;
5739 #endif
5740
5741     rx_flags = orig_rx_flags;
5742
5743     if (initial_charset == REGEX_LOCALE_CHARSET) {
5744         RExC_contains_locale = 1;
5745     }
5746     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5747
5748         /* Set to use unicode semantics if the pattern is in utf8 and has the
5749          * 'depends' charset specified, as it means unicode when utf8  */
5750         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5751     }
5752
5753     RExC_precomp = exp;
5754     RExC_flags = rx_flags;
5755     RExC_pm_flags = pm_flags;
5756
5757     if (runtime_code) {
5758         if (PL_tainting && PL_tainted)
5759             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5760
5761         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5762             /* whoops, we have a non-utf8 pattern, whilst run-time code
5763              * got compiled as utf8. Try again with a utf8 pattern */
5764              JMPENV_JUMP(UTF8_LONGJMP);
5765         }
5766     }
5767     assert(!pRExC_state->runtime_code_qr);
5768
5769     RExC_sawback = 0;
5770
5771     RExC_seen = 0;
5772     RExC_in_lookbehind = 0;
5773     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5774     RExC_extralen = 0;
5775     RExC_override_recoding = 0;
5776     RExC_in_multi_char_class = 0;
5777
5778     /* First pass: determine size, legality. */
5779     RExC_parse = exp;
5780     RExC_start = exp;
5781     RExC_end = xend;
5782     RExC_naughty = 0;
5783     RExC_npar = 1;
5784     RExC_nestroot = 0;
5785     RExC_size = 0L;
5786     RExC_emit = &PL_regdummy;
5787     RExC_whilem_seen = 0;
5788     RExC_open_parens = NULL;
5789     RExC_close_parens = NULL;
5790     RExC_opend = NULL;
5791     RExC_paren_names = NULL;
5792 #ifdef DEBUGGING
5793     RExC_paren_name_list = NULL;
5794 #endif
5795     RExC_recurse = NULL;
5796     RExC_recurse_count = 0;
5797     pRExC_state->code_index = 0;
5798
5799 #if 0 /* REGC() is (currently) a NOP at the first pass.
5800        * Clever compilers notice this and complain. --jhi */
5801     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5802 #endif
5803     DEBUG_PARSE_r(
5804         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5805         RExC_lastnum=0;
5806         RExC_lastparse=NULL;
5807     );
5808     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5809         RExC_precomp = NULL;
5810         Safefree(pRExC_state->code_blocks);
5811         return(NULL);
5812     }
5813
5814     /* Here, finished first pass.  Get rid of any added setjmp */
5815     if (used_setjump) {
5816         JMPENV_POP;
5817     }
5818
5819     DEBUG_PARSE_r({
5820         PerlIO_printf(Perl_debug_log,
5821             "Required size %"IVdf" nodes\n"
5822             "Starting second pass (creation)\n",
5823             (IV)RExC_size);
5824         RExC_lastnum=0;
5825         RExC_lastparse=NULL;
5826     });
5827
5828     /* The first pass could have found things that force Unicode semantics */
5829     if ((RExC_utf8 || RExC_uni_semantics)
5830          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5831     {
5832         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5833     }
5834
5835     /* Small enough for pointer-storage convention?
5836        If extralen==0, this means that we will not need long jumps. */
5837     if (RExC_size >= 0x10000L && RExC_extralen)
5838         RExC_size += RExC_extralen;
5839     else
5840         RExC_extralen = 0;
5841     if (RExC_whilem_seen > 15)
5842         RExC_whilem_seen = 15;
5843
5844     /* Allocate space and zero-initialize. Note, the two step process
5845        of zeroing when in debug mode, thus anything assigned has to
5846        happen after that */
5847     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5848     r = (struct regexp*)SvANY(rx);
5849     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5850          char, regexp_internal);
5851     if ( r == NULL || ri == NULL )
5852         FAIL("Regexp out of space");
5853 #ifdef DEBUGGING
5854     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5855     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5856 #else
5857     /* bulk initialize base fields with 0. */
5858     Zero(ri, sizeof(regexp_internal), char);
5859 #endif
5860
5861     /* non-zero initialization begins here */
5862     RXi_SET( r, ri );
5863     r->engine= eng;
5864     r->extflags = rx_flags;
5865     if (pm_flags & PMf_IS_QR) {
5866         ri->code_blocks = pRExC_state->code_blocks;
5867         ri->num_code_blocks = pRExC_state->num_code_blocks;
5868     }
5869     else
5870         SAVEFREEPV(pRExC_state->code_blocks);
5871
5872     {
5873         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5874         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5875
5876         /* The caret is output if there are any defaults: if not all the STD
5877          * flags are set, or if no character set specifier is needed */
5878         bool has_default =
5879                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5880                     || ! has_charset);
5881         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5882         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5883                             >> RXf_PMf_STD_PMMOD_SHIFT);
5884         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5885         char *p;
5886         /* Allocate for the worst case, which is all the std flags are turned
5887          * on.  If more precision is desired, we could do a population count of
5888          * the flags set.  This could be done with a small lookup table, or by
5889          * shifting, masking and adding, or even, when available, assembly
5890          * language for a machine-language population count.
5891          * We never output a minus, as all those are defaults, so are
5892          * covered by the caret */
5893         const STRLEN wraplen = plen + has_p + has_runon
5894             + has_default       /* If needs a caret */
5895
5896                 /* If needs a character set specifier */
5897             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5898             + (sizeof(STD_PAT_MODS) - 1)
5899             + (sizeof("(?:)") - 1);
5900
5901         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5902         SvPOK_on(rx);
5903         if (RExC_utf8)
5904             SvFLAGS(rx) |= SVf_UTF8;
5905         *p++='('; *p++='?';
5906
5907         /* If a default, cover it using the caret */
5908         if (has_default) {
5909             *p++= DEFAULT_PAT_MOD;
5910         }
5911         if (has_charset) {
5912             STRLEN len;
5913             const char* const name = get_regex_charset_name(r->extflags, &len);
5914             Copy(name, p, len, char);
5915             p += len;
5916         }
5917         if (has_p)
5918             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5919         {
5920             char ch;
5921             while((ch = *fptr++)) {
5922                 if(reganch & 1)
5923                     *p++ = ch;
5924                 reganch >>= 1;
5925             }
5926         }
5927
5928         *p++ = ':';
5929         Copy(RExC_precomp, p, plen, char);
5930         assert ((RX_WRAPPED(rx) - p) < 16);
5931         r->pre_prefix = p - RX_WRAPPED(rx);
5932         p += plen;
5933         if (has_runon)
5934             *p++ = '\n';
5935         *p++ = ')';
5936         *p = 0;
5937         SvCUR_set(rx, p - SvPVX_const(rx));
5938     }
5939
5940     r->intflags = 0;
5941     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5942
5943     if (RExC_seen & REG_SEEN_RECURSE) {
5944         Newxz(RExC_open_parens, RExC_npar,regnode *);
5945         SAVEFREEPV(RExC_open_parens);
5946         Newxz(RExC_close_parens,RExC_npar,regnode *);
5947         SAVEFREEPV(RExC_close_parens);
5948     }
5949
5950     /* Useful during FAIL. */
5951 #ifdef RE_TRACK_PATTERN_OFFSETS
5952     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5953     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5954                           "%s %"UVuf" bytes for offset annotations.\n",
5955                           ri->u.offsets ? "Got" : "Couldn't get",
5956                           (UV)((2*RExC_size+1) * sizeof(U32))));
5957 #endif
5958     SetProgLen(ri,RExC_size);
5959     RExC_rx_sv = rx;
5960     RExC_rx = r;
5961     RExC_rxi = ri;
5962
5963     /* Second pass: emit code. */
5964     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5965     RExC_pm_flags = pm_flags;
5966     RExC_parse = exp;
5967     RExC_end = xend;
5968     RExC_naughty = 0;
5969     RExC_npar = 1;
5970     RExC_emit_start = ri->program;
5971     RExC_emit = ri->program;
5972     RExC_emit_bound = ri->program + RExC_size + 1;
5973     pRExC_state->code_index = 0;
5974
5975     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5976     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5977         ReREFCNT_dec(rx);
5978         return(NULL);
5979     }
5980     /* XXXX To minimize changes to RE engine we always allocate
5981        3-units-long substrs field. */
5982     Newx(r->substrs, 1, struct reg_substr_data);
5983     if (RExC_recurse_count) {
5984         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5985         SAVEFREEPV(RExC_recurse);
5986     }
5987
5988 reStudy:
5989     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5990     Zero(r->substrs, 1, struct reg_substr_data);
5991
5992 #ifdef TRIE_STUDY_OPT
5993     if (!restudied) {
5994         StructCopy(&zero_scan_data, &data, scan_data_t);
5995         copyRExC_state = RExC_state;
5996     } else {
5997         U32 seen=RExC_seen;
5998         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5999
6000         RExC_state = copyRExC_state;
6001         if (seen & REG_TOP_LEVEL_BRANCHES)
6002             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6003         else
6004             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6005         if (data.last_found) {
6006             SvREFCNT_dec(data.longest_fixed);
6007             SvREFCNT_dec(data.longest_float);
6008             SvREFCNT_dec(data.last_found);
6009         }
6010         StructCopy(&zero_scan_data, &data, scan_data_t);
6011     }
6012 #else
6013     StructCopy(&zero_scan_data, &data, scan_data_t);
6014 #endif
6015
6016     /* Dig out information for optimizations. */
6017     r->extflags = RExC_flags; /* was pm_op */
6018     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6019
6020     if (UTF)
6021         SvUTF8_on(rx);  /* Unicode in it? */
6022     ri->regstclass = NULL;
6023     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6024         r->intflags |= PREGf_NAUGHTY;
6025     scan = ri->program + 1;             /* First BRANCH. */
6026
6027     /* testing for BRANCH here tells us whether there is "must appear"
6028        data in the pattern. If there is then we can use it for optimisations */
6029     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6030         I32 fake;
6031         STRLEN longest_float_length, longest_fixed_length;
6032         struct regnode_charclass_class ch_class; /* pointed to by data */
6033         int stclass_flag;
6034         I32 last_close = 0; /* pointed to by data */
6035         regnode *first= scan;
6036         regnode *first_next= regnext(first);
6037         /*
6038          * Skip introductions and multiplicators >= 1
6039          * so that we can extract the 'meat' of the pattern that must
6040          * match in the large if() sequence following.
6041          * NOTE that EXACT is NOT covered here, as it is normally
6042          * picked up by the optimiser separately.
6043          *
6044          * This is unfortunate as the optimiser isnt handling lookahead
6045          * properly currently.
6046          *
6047          */
6048         while ((OP(first) == OPEN && (sawopen = 1)) ||
6049                /* An OR of *one* alternative - should not happen now. */
6050             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6051             /* for now we can't handle lookbehind IFMATCH*/
6052             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6053             (OP(first) == PLUS) ||
6054             (OP(first) == MINMOD) ||
6055                /* An {n,m} with n>0 */
6056             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6057             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6058         {
6059                 /*
6060                  * the only op that could be a regnode is PLUS, all the rest
6061                  * will be regnode_1 or regnode_2.
6062                  *
6063                  */
6064                 if (OP(first) == PLUS)
6065                     sawplus = 1;
6066                 else
6067                     first += regarglen[OP(first)];
6068
6069                 first = NEXTOPER(first);
6070                 first_next= regnext(first);
6071         }
6072
6073         /* Starting-point info. */
6074       again:
6075         DEBUG_PEEP("first:",first,0);
6076         /* Ignore EXACT as we deal with it later. */
6077         if (PL_regkind[OP(first)] == EXACT) {
6078             if (OP(first) == EXACT)
6079                 NOOP;   /* Empty, get anchored substr later. */
6080             else
6081                 ri->regstclass = first;
6082         }
6083 #ifdef TRIE_STCLASS
6084         else if (PL_regkind[OP(first)] == TRIE &&
6085                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6086         {
6087             regnode *trie_op;
6088             /* this can happen only on restudy */
6089             if ( OP(first) == TRIE ) {
6090                 struct regnode_1 *trieop = (struct regnode_1 *)
6091                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6092                 StructCopy(first,trieop,struct regnode_1);
6093                 trie_op=(regnode *)trieop;
6094             } else {
6095                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6096                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6097                 StructCopy(first,trieop,struct regnode_charclass);
6098                 trie_op=(regnode *)trieop;
6099             }
6100             OP(trie_op)+=2;
6101             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6102             ri->regstclass = trie_op;
6103         }
6104 #endif
6105         else if (REGNODE_SIMPLE(OP(first)))
6106             ri->regstclass = first;
6107         else if (PL_regkind[OP(first)] == BOUND ||
6108                  PL_regkind[OP(first)] == NBOUND)
6109             ri->regstclass = first;
6110         else if (PL_regkind[OP(first)] == BOL) {
6111             r->extflags |= (OP(first) == MBOL
6112                            ? RXf_ANCH_MBOL
6113                            : (OP(first) == SBOL
6114                               ? RXf_ANCH_SBOL
6115                               : RXf_ANCH_BOL));
6116             first = NEXTOPER(first);
6117             goto again;
6118         }
6119         else if (OP(first) == GPOS) {
6120             r->extflags |= RXf_ANCH_GPOS;
6121             first = NEXTOPER(first);
6122             goto again;
6123         }
6124         else if ((!sawopen || !RExC_sawback) &&
6125             (OP(first) == STAR &&
6126             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6127             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6128         {
6129             /* turn .* into ^.* with an implied $*=1 */
6130             const int type =
6131                 (OP(NEXTOPER(first)) == REG_ANY)
6132                     ? RXf_ANCH_MBOL
6133                     : RXf_ANCH_SBOL;
6134             r->extflags |= type;
6135             r->intflags |= PREGf_IMPLICIT;
6136             first = NEXTOPER(first);
6137             goto again;
6138         }
6139         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6140             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6141             /* x+ must match at the 1st pos of run of x's */
6142             r->intflags |= PREGf_SKIP;
6143
6144         /* Scan is after the zeroth branch, first is atomic matcher. */
6145 #ifdef TRIE_STUDY_OPT
6146         DEBUG_PARSE_r(
6147             if (!restudied)
6148                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6149                               (IV)(first - scan + 1))
6150         );
6151 #else
6152         DEBUG_PARSE_r(
6153             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6154                 (IV)(first - scan + 1))
6155         );
6156 #endif
6157
6158
6159         /*
6160         * If there's something expensive in the r.e., find the
6161         * longest literal string that must appear and make it the
6162         * regmust.  Resolve ties in favor of later strings, since
6163         * the regstart check works with the beginning of the r.e.
6164         * and avoiding duplication strengthens checking.  Not a
6165         * strong reason, but sufficient in the absence of others.
6166         * [Now we resolve ties in favor of the earlier string if
6167         * it happens that c_offset_min has been invalidated, since the
6168         * earlier string may buy us something the later one won't.]
6169         */
6170
6171         data.longest_fixed = newSVpvs("");
6172         data.longest_float = newSVpvs("");
6173         data.last_found = newSVpvs("");
6174         data.longest = &(data.longest_fixed);
6175         first = scan;
6176         if (!ri->regstclass) {
6177             cl_init(pRExC_state, &ch_class);
6178             data.start_class = &ch_class;
6179             stclass_flag = SCF_DO_STCLASS_AND;
6180         } else                          /* XXXX Check for BOUND? */
6181             stclass_flag = 0;
6182         data.last_closep = &last_close;
6183
6184         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6185             &data, -1, NULL, NULL,
6186             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6187
6188
6189         CHECK_RESTUDY_GOTO;
6190
6191
6192         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6193              && data.last_start_min == 0 && data.last_end > 0
6194              && !RExC_seen_zerolen
6195              && !(RExC_seen & REG_SEEN_VERBARG)
6196              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6197             r->extflags |= RXf_CHECK_ALL;
6198         scan_commit(pRExC_state, &data,&minlen,0);
6199         SvREFCNT_dec(data.last_found);
6200
6201         longest_float_length = CHR_SVLEN(data.longest_float);
6202
6203         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6204                    && data.offset_fixed == data.offset_float_min
6205                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6206             && S_setup_longest (aTHX_ pRExC_state,
6207                                     data.longest_float,
6208                                     &(r->float_utf8),
6209                                     &(r->float_substr),
6210                                     &(r->float_end_shift),
6211                                     data.lookbehind_float,
6212                                     data.offset_float_min,
6213                                     data.minlen_float,
6214                                     longest_float_length,
6215                                     data.flags & SF_FL_BEFORE_EOL,
6216                                     data.flags & SF_FL_BEFORE_MEOL))
6217         {
6218             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6219             r->float_max_offset = data.offset_float_max;
6220             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6221                 r->float_max_offset -= data.lookbehind_float;
6222         }
6223         else {
6224             r->float_substr = r->float_utf8 = NULL;
6225             SvREFCNT_dec(data.longest_float);
6226             longest_float_length = 0;
6227         }
6228
6229         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6230
6231         if (S_setup_longest (aTHX_ pRExC_state,
6232                                 data.longest_fixed,
6233                                 &(r->anchored_utf8),
6234                                 &(r->anchored_substr),
6235                                 &(r->anchored_end_shift),
6236                                 data.lookbehind_fixed,
6237                                 data.offset_fixed,
6238                                 data.minlen_fixed,
6239                                 longest_fixed_length,
6240                                 data.flags & SF_FIX_BEFORE_EOL,
6241                                 data.flags & SF_FIX_BEFORE_MEOL))
6242         {
6243             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6244         }
6245         else {
6246             r->anchored_substr = r->anchored_utf8 = NULL;
6247             SvREFCNT_dec(data.longest_fixed);
6248             longest_fixed_length = 0;
6249         }
6250
6251         if (ri->regstclass
6252             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6253             ri->regstclass = NULL;
6254
6255         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6256             && stclass_flag
6257             && !(data.start_class->flags & ANYOF_EOS)
6258             && !cl_is_anything(data.start_class))
6259         {
6260             const U32 n = add_data(pRExC_state, 1, "f");
6261             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6262
6263             Newx(RExC_rxi->data->data[n], 1,
6264                 struct regnode_charclass_class);
6265             StructCopy(data.start_class,
6266                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6267                        struct regnode_charclass_class);
6268             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6269             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6270             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6271                       regprop(r, sv, (regnode*)data.start_class);
6272                       PerlIO_printf(Perl_debug_log,
6273                                     "synthetic stclass \"%s\".\n",
6274                                     SvPVX_const(sv));});
6275         }
6276
6277         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6278         if (longest_fixed_length > longest_float_length) {
6279             r->check_end_shift = r->anchored_end_shift;
6280             r->check_substr = r->anchored_substr;
6281             r->check_utf8 = r->anchored_utf8;
6282             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6283             if (r->extflags & RXf_ANCH_SINGLE)
6284                 r->extflags |= RXf_NOSCAN;
6285         }
6286         else {
6287             r->check_end_shift = r->float_end_shift;
6288             r->check_substr = r->float_substr;
6289             r->check_utf8 = r->float_utf8;
6290             r->check_offset_min = r->float_min_offset;
6291             r->check_offset_max = r->float_max_offset;
6292         }
6293         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6294            This should be changed ASAP!  */
6295         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6296             r->extflags |= RXf_USE_INTUIT;
6297             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6298                 r->extflags |= RXf_INTUIT_TAIL;
6299         }
6300         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6301         if ( (STRLEN)minlen < longest_float_length )
6302             minlen= longest_float_length;
6303         if ( (STRLEN)minlen < longest_fixed_length )
6304             minlen= longest_fixed_length;
6305         */
6306     }
6307     else {
6308         /* Several toplevels. Best we can is to set minlen. */
6309         I32 fake;
6310         struct regnode_charclass_class ch_class;
6311         I32 last_close = 0;
6312
6313         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6314
6315         scan = ri->program + 1;
6316         cl_init(pRExC_state, &ch_class);
6317         data.start_class = &ch_class;
6318         data.last_closep = &last_close;
6319
6320
6321         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6322             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6323
6324         CHECK_RESTUDY_GOTO;
6325
6326         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6327                 = r->float_substr = r->float_utf8 = NULL;
6328
6329         if (!(data.start_class->flags & ANYOF_EOS)
6330             && !cl_is_anything(data.start_class))
6331         {
6332             const U32 n = add_data(pRExC_state, 1, "f");
6333             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6334
6335             Newx(RExC_rxi->data->data[n], 1,
6336                 struct regnode_charclass_class);
6337             StructCopy(data.start_class,
6338                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6339                        struct regnode_charclass_class);
6340             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6341             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6342             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6343                       regprop(r, sv, (regnode*)data.start_class);
6344                       PerlIO_printf(Perl_debug_log,
6345                                     "synthetic stclass \"%s\".\n",
6346                                     SvPVX_const(sv));});
6347         }
6348     }
6349
6350     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6351        the "real" pattern. */
6352     DEBUG_OPTIMISE_r({
6353         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6354                       (IV)minlen, (IV)r->minlen);
6355     });
6356     r->minlenret = minlen;
6357     if (r->minlen < minlen)
6358         r->minlen = minlen;
6359
6360     if (RExC_seen & REG_SEEN_GPOS)
6361         r->extflags |= RXf_GPOS_SEEN;
6362     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6363         r->extflags |= RXf_LOOKBEHIND_SEEN;
6364     if (pRExC_state->num_code_blocks)
6365         r->extflags |= RXf_EVAL_SEEN;
6366     if (RExC_seen & REG_SEEN_CANY)
6367         r->extflags |= RXf_CANY_SEEN;
6368     if (RExC_seen & REG_SEEN_VERBARG)
6369     {
6370         r->intflags |= PREGf_VERBARG_SEEN;
6371         r->extflags |= RXf_MODIFIES_VARS;
6372     }
6373     if (RExC_seen & REG_SEEN_CUTGROUP)
6374         r->intflags |= PREGf_CUTGROUP_SEEN;
6375     if (pm_flags & PMf_USE_RE_EVAL)
6376         r->intflags |= PREGf_USE_RE_EVAL;
6377     if (RExC_paren_names)
6378         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6379     else
6380         RXp_PAREN_NAMES(r) = NULL;
6381
6382 #ifdef STUPID_PATTERN_CHECKS
6383     if (RX_PRELEN(rx) == 0)
6384         r->extflags |= RXf_NULL;
6385     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6386         r->extflags |= RXf_WHITE;
6387     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6388         r->extflags |= RXf_START_ONLY;
6389 #else
6390     {
6391         regnode *first = ri->program + 1;
6392         U8 fop = OP(first);
6393
6394         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6395             r->extflags |= RXf_NULL;
6396         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6397             r->extflags |= RXf_START_ONLY;
6398         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6399                              && OP(regnext(first)) == END)
6400             r->extflags |= RXf_WHITE;
6401     }
6402 #endif
6403 #ifdef DEBUGGING
6404     if (RExC_paren_names) {
6405         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6406         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6407     } else
6408 #endif
6409         ri->name_list_idx = 0;
6410
6411     if (RExC_recurse_count) {
6412         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6413             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6414             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6415         }
6416     }
6417     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6418     /* assume we don't need to swap parens around before we match */
6419
6420     DEBUG_DUMP_r({
6421         PerlIO_printf(Perl_debug_log,"Final program:\n");
6422         regdump(r);
6423     });
6424 #ifdef RE_TRACK_PATTERN_OFFSETS
6425     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6426         const U32 len = ri->u.offsets[0];
6427         U32 i;
6428         GET_RE_DEBUG_FLAGS_DECL;
6429         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6430         for (i = 1; i <= len; i++) {
6431             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6432                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6433                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6434             }
6435         PerlIO_printf(Perl_debug_log, "\n");
6436     });
6437 #endif
6438     return rx;
6439 }
6440
6441
6442 SV*
6443 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6444                     const U32 flags)
6445 {
6446     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6447
6448     PERL_UNUSED_ARG(value);
6449
6450     if (flags & RXapif_FETCH) {
6451         return reg_named_buff_fetch(rx, key, flags);
6452     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6453         Perl_croak_no_modify(aTHX);
6454         return NULL;
6455     } else if (flags & RXapif_EXISTS) {
6456         return reg_named_buff_exists(rx, key, flags)
6457             ? &PL_sv_yes
6458             : &PL_sv_no;
6459     } else if (flags & RXapif_REGNAMES) {
6460         return reg_named_buff_all(rx, flags);
6461     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6462         return reg_named_buff_scalar(rx, flags);
6463     } else {
6464         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6465         return NULL;
6466     }
6467 }
6468
6469 SV*
6470 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6471                          const U32 flags)
6472 {
6473     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6474     PERL_UNUSED_ARG(lastkey);
6475
6476     if (flags & RXapif_FIRSTKEY)
6477         return reg_named_buff_firstkey(rx, flags);
6478     else if (flags & RXapif_NEXTKEY)
6479         return reg_named_buff_nextkey(rx, flags);
6480     else {
6481         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6482         return NULL;
6483     }
6484 }
6485
6486 SV*
6487 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6488                           const U32 flags)
6489 {
6490     AV *retarray = NULL;
6491     SV *ret;
6492     struct regexp *const rx = (struct regexp *)SvANY(r);
6493
6494     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6495
6496     if (flags & RXapif_ALL)
6497         retarray=newAV();
6498
6499     if (rx && RXp_PAREN_NAMES(rx)) {
6500         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6501         if (he_str) {
6502             IV i;
6503             SV* sv_dat=HeVAL(he_str);
6504             I32 *nums=(I32*)SvPVX(sv_dat);
6505             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6506                 if ((I32)(rx->nparens) >= nums[i]
6507                     && rx->offs[nums[i]].start != -1
6508                     && rx->offs[nums[i]].end != -1)
6509                 {
6510                     ret = newSVpvs("");
6511                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6512                     if (!retarray)
6513                         return ret;
6514                 } else {
6515                     if (retarray)
6516                         ret = newSVsv(&PL_sv_undef);
6517                 }
6518                 if (retarray)
6519                     av_push(retarray, ret);
6520             }
6521             if (retarray)
6522                 return newRV_noinc(MUTABLE_SV(retarray));
6523         }
6524     }
6525     return NULL;
6526 }
6527
6528 bool
6529 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6530                            const U32 flags)
6531 {
6532     struct regexp *const rx = (struct regexp *)SvANY(r);
6533
6534     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6535
6536     if (rx && RXp_PAREN_NAMES(rx)) {
6537         if (flags & RXapif_ALL) {
6538             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6539         } else {
6540             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6541             if (sv) {
6542                 SvREFCNT_dec(sv);
6543                 return TRUE;
6544             } else {
6545                 return FALSE;
6546             }
6547         }
6548     } else {
6549         return FALSE;
6550     }
6551 }
6552
6553 SV*
6554 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6555 {
6556     struct regexp *const rx = (struct regexp *)SvANY(r);
6557
6558     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6559
6560     if ( rx && RXp_PAREN_NAMES(rx) ) {
6561         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6562
6563         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6564     } else {
6565         return FALSE;
6566     }
6567 }
6568
6569 SV*
6570 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6571 {
6572     struct regexp *const rx = (struct regexp *)SvANY(r);
6573     GET_RE_DEBUG_FLAGS_DECL;
6574
6575     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6576
6577     if (rx && RXp_PAREN_NAMES(rx)) {
6578         HV *hv = RXp_PAREN_NAMES(rx);
6579         HE *temphe;
6580         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6581             IV i;
6582             IV parno = 0;
6583             SV* sv_dat = HeVAL(temphe);
6584             I32 *nums = (I32*)SvPVX(sv_dat);
6585             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6586                 if ((I32)(rx->lastparen) >= nums[i] &&
6587                     rx->offs[nums[i]].start != -1 &&
6588                     rx->offs[nums[i]].end != -1)
6589                 {
6590                     parno = nums[i];
6591                     break;
6592                 }
6593             }
6594             if (parno || flags & RXapif_ALL) {
6595                 return newSVhek(HeKEY_hek(temphe));
6596             }
6597         }
6598     }
6599     return NULL;
6600 }
6601
6602 SV*
6603 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6604 {
6605     SV *ret;
6606     AV *av;
6607     I32 length;
6608     struct regexp *const rx = (struct regexp *)SvANY(r);
6609
6610     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6611
6612     if (rx && RXp_PAREN_NAMES(rx)) {
6613         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6614             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6615         } else if (flags & RXapif_ONE) {
6616             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6617             av = MUTABLE_AV(SvRV(ret));
6618             length = av_len(av);
6619             SvREFCNT_dec(ret);
6620             return newSViv(length + 1);
6621         } else {
6622             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6623             return NULL;
6624         }
6625     }
6626     return &PL_sv_undef;
6627 }
6628
6629 SV*
6630 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6631 {
6632     struct regexp *const rx = (struct regexp *)SvANY(r);
6633     AV *av = newAV();
6634
6635     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6636
6637     if (rx && RXp_PAREN_NAMES(rx)) {
6638         HV *hv= RXp_PAREN_NAMES(rx);
6639         HE *temphe;
6640         (void)hv_iterinit(hv);
6641         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6642             IV i;
6643             IV parno = 0;
6644             SV* sv_dat = HeVAL(temphe);
6645             I32 *nums = (I32*)SvPVX(sv_dat);
6646             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6647                 if ((I32)(rx->lastparen) >= nums[i] &&
6648                     rx->offs[nums[i]].start != -1 &&
6649                     rx->offs[nums[i]].end != -1)
6650                 {
6651                     parno = nums[i];
6652                     break;
6653                 }
6654             }
6655             if (parno || flags & RXapif_ALL) {
6656                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6657             }
6658         }
6659     }
6660
6661     return newRV_noinc(MUTABLE_SV(av));
6662 }
6663
6664 void
6665 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6666                              SV * const sv)
6667 {
6668     struct regexp *const rx = (struct regexp *)SvANY(r);
6669     char *s = NULL;
6670     I32 i = 0;
6671     I32 s1, t1;
6672     I32 n = paren;
6673
6674     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6675
6676     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6677            || n == RX_BUFF_IDX_CARET_FULLMATCH
6678            || n == RX_BUFF_IDX_CARET_POSTMATCH
6679          )
6680          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6681     )
6682         goto ret_undef;
6683
6684     if (!rx->subbeg)
6685         goto ret_undef;
6686
6687     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6688         /* no need to distinguish between them any more */
6689         n = RX_BUFF_IDX_FULLMATCH;
6690
6691     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6692         && rx->offs[0].start != -1)
6693     {
6694         /* $`, ${^PREMATCH} */
6695         i = rx->offs[0].start;
6696         s = rx->subbeg;
6697     }
6698     else
6699     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6700         && rx->offs[0].end != -1)
6701     {
6702         /* $', ${^POSTMATCH} */
6703         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6704         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6705     }
6706     else
6707     if ( 0 <= n && n <= (I32)rx->nparens &&
6708         (s1 = rx->offs[n].start) != -1 &&
6709         (t1 = rx->offs[n].end) != -1)
6710     {
6711         /* $&, ${^MATCH},  $1 ... */
6712         i = t1 - s1;
6713         s = rx->subbeg + s1 - rx->suboffset;
6714     } else {
6715         goto ret_undef;
6716     }
6717
6718     assert(s >= rx->subbeg);
6719     assert(rx->sublen >= (s - rx->subbeg) + i );
6720     if (i >= 0) {
6721         const int oldtainted = PL_tainted;
6722         TAINT_NOT;
6723         sv_setpvn(sv, s, i);
6724         PL_tainted = oldtainted;
6725         if ( (rx->extflags & RXf_CANY_SEEN)
6726             ? (RXp_MATCH_UTF8(rx)
6727                         && (!i || is_utf8_string((U8*)s, i)))
6728             : (RXp_MATCH_UTF8(rx)) )
6729         {
6730             SvUTF8_on(sv);
6731         }
6732         else
6733             SvUTF8_off(sv);
6734         if (PL_tainting) {
6735             if (RXp_MATCH_TAINTED(rx)) {
6736                 if (SvTYPE(sv) >= SVt_PVMG) {
6737                     MAGIC* const mg = SvMAGIC(sv);
6738                     MAGIC* mgt;
6739                     PL_tainted = 1;
6740                     SvMAGIC_set(sv, mg->mg_moremagic);
6741                     SvTAINT(sv);
6742                     if ((mgt = SvMAGIC(sv))) {
6743                         mg->mg_moremagic = mgt;
6744                         SvMAGIC_set(sv, mg);
6745                     }
6746                 } else {
6747                     PL_tainted = 1;
6748                     SvTAINT(sv);
6749                 }
6750             } else
6751                 SvTAINTED_off(sv);
6752         }
6753     } else {
6754       ret_undef:
6755         sv_setsv(sv,&PL_sv_undef);
6756         return;
6757     }
6758 }
6759
6760 void
6761 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6762                                                          SV const * const value)
6763 {
6764     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6765
6766     PERL_UNUSED_ARG(rx);
6767     PERL_UNUSED_ARG(paren);
6768     PERL_UNUSED_ARG(value);
6769
6770     if (!PL_localizing)
6771         Perl_croak_no_modify(aTHX);
6772 }
6773
6774 I32
6775 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6776                               const I32 paren)
6777 {
6778     struct regexp *const rx = (struct regexp *)SvANY(r);
6779     I32 i;
6780     I32 s1, t1;
6781
6782     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6783
6784     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6785     switch (paren) {
6786       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6787          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6788             goto warn_undef;
6789         /*FALLTHROUGH*/
6790
6791       case RX_BUFF_IDX_PREMATCH:       /* $` */
6792         if (rx->offs[0].start != -1) {
6793                         i = rx->offs[0].start;
6794                         if (i > 0) {
6795                                 s1 = 0;
6796                                 t1 = i;
6797                                 goto getlen;
6798                         }
6799             }
6800         return 0;
6801
6802       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6803          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6804             goto warn_undef;
6805       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6806             if (rx->offs[0].end != -1) {
6807                         i = rx->sublen - rx->offs[0].end;
6808                         if (i > 0) {
6809                                 s1 = rx->offs[0].end;
6810                                 t1 = rx->sublen;
6811                                 goto getlen;
6812                         }
6813             }
6814         return 0;
6815
6816       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6817          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6818             goto warn_undef;
6819         /*FALLTHROUGH*/
6820
6821       /* $& / ${^MATCH}, $1, $2, ... */
6822       default:
6823             if (paren <= (I32)rx->nparens &&
6824             (s1 = rx->offs[paren].start) != -1 &&
6825             (t1 = rx->offs[paren].end) != -1)
6826             {
6827             i = t1 - s1;
6828             goto getlen;
6829         } else {
6830           warn_undef:
6831             if (ckWARN(WARN_UNINITIALIZED))
6832                 report_uninit((const SV *)sv);
6833             return 0;
6834         }
6835     }
6836   getlen:
6837     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6838         const char * const s = rx->subbeg - rx->suboffset + s1;
6839         const U8 *ep;
6840         STRLEN el;
6841
6842         i = t1 - s1;
6843         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6844                         i = el;
6845     }
6846     return i;
6847 }
6848
6849 SV*
6850 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6851 {
6852     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6853         PERL_UNUSED_ARG(rx);
6854         if (0)
6855             return NULL;
6856         else
6857             return newSVpvs("Regexp");
6858 }
6859
6860 /* Scans the name of a named buffer from the pattern.
6861  * If flags is REG_RSN_RETURN_NULL returns null.
6862  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6863  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6864  * to the parsed name as looked up in the RExC_paren_names hash.
6865  * If there is an error throws a vFAIL().. type exception.
6866  */
6867
6868 #define REG_RSN_RETURN_NULL    0
6869 #define REG_RSN_RETURN_NAME    1
6870 #define REG_RSN_RETURN_DATA    2
6871
6872 STATIC SV*
6873 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6874 {
6875     char *name_start = RExC_parse;
6876
6877     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6878
6879     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6880          /* skip IDFIRST by using do...while */
6881         if (UTF)
6882             do {
6883                 RExC_parse += UTF8SKIP(RExC_parse);
6884             } while (isALNUM_utf8((U8*)RExC_parse));
6885         else
6886             do {
6887                 RExC_parse++;
6888             } while (isALNUM(*RExC_parse));
6889     } else {
6890         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6891         vFAIL("Group name must start with a non-digit word character");
6892     }
6893     if ( flags ) {
6894         SV* sv_name
6895             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6896                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6897         if ( flags == REG_RSN_RETURN_NAME)
6898             return sv_name;
6899         else if (flags==REG_RSN_RETURN_DATA) {
6900             HE *he_str = NULL;
6901             SV *sv_dat = NULL;
6902             if ( ! sv_name )      /* should not happen*/
6903                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6904             if (RExC_paren_names)
6905                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6906             if ( he_str )
6907                 sv_dat = HeVAL(he_str);
6908             if ( ! sv_dat )
6909                 vFAIL("Reference to nonexistent named group");
6910             return sv_dat;
6911         }
6912         else {
6913             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6914                        (unsigned long) flags);
6915         }
6916         assert(0); /* NOT REACHED */
6917     }
6918     return NULL;
6919 }
6920
6921 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6922     int rem=(int)(RExC_end - RExC_parse);                       \
6923     int cut;                                                    \
6924     int num;                                                    \
6925     int iscut=0;                                                \
6926     if (rem>10) {                                               \
6927         rem=10;                                                 \
6928         iscut=1;                                                \
6929     }                                                           \
6930     cut=10-rem;                                                 \
6931     if (RExC_lastparse!=RExC_parse)                             \
6932         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6933             rem, RExC_parse,                                    \
6934             cut + 4,                                            \
6935             iscut ? "..." : "<"                                 \
6936         );                                                      \
6937     else                                                        \
6938         PerlIO_printf(Perl_debug_log,"%16s","");                \
6939                                                                 \
6940     if (SIZE_ONLY)                                              \
6941        num = RExC_size + 1;                                     \
6942     else                                                        \
6943        num=REG_NODE_NUM(RExC_emit);                             \
6944     if (RExC_lastnum!=num)                                      \
6945        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6946     else                                                        \
6947        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6948     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6949         (int)((depth*2)), "",                                   \
6950         (funcname)                                              \
6951     );                                                          \
6952     RExC_lastnum=num;                                           \
6953     RExC_lastparse=RExC_parse;                                  \
6954 })
6955
6956
6957
6958 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6959     DEBUG_PARSE_MSG((funcname));                            \
6960     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6961 })
6962 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6963     DEBUG_PARSE_MSG((funcname));                            \
6964     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6965 })
6966
6967 /* This section of code defines the inversion list object and its methods.  The
6968  * interfaces are highly subject to change, so as much as possible is static to
6969  * this file.  An inversion list is here implemented as a malloc'd C UV array
6970  * with some added info that is placed as UVs at the beginning in a header
6971  * portion.  An inversion list for Unicode is an array of code points, sorted
6972  * by ordinal number.  The zeroth element is the first code point in the list.
6973  * The 1th element is the first element beyond that not in the list.  In other
6974  * words, the first range is
6975  *  invlist[0]..(invlist[1]-1)
6976  * The other ranges follow.  Thus every element whose index is divisible by two
6977  * marks the beginning of a range that is in the list, and every element not
6978  * divisible by two marks the beginning of a range not in the list.  A single
6979  * element inversion list that contains the single code point N generally
6980  * consists of two elements
6981  *  invlist[0] == N
6982  *  invlist[1] == N+1
6983  * (The exception is when N is the highest representable value on the
6984  * machine, in which case the list containing just it would be a single
6985  * element, itself.  By extension, if the last range in the list extends to
6986  * infinity, then the first element of that range will be in the inversion list
6987  * at a position that is divisible by two, and is the final element in the
6988  * list.)
6989  * Taking the complement (inverting) an inversion list is quite simple, if the
6990  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6991  * This implementation reserves an element at the beginning of each inversion
6992  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6993  * actual beginning of the list is either that element if 0, or the next one if
6994  * 1.
6995  *
6996  * More about inversion lists can be found in "Unicode Demystified"
6997  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6998  * More will be coming when functionality is added later.
6999  *
7000  * The inversion list data structure is currently implemented as an SV pointing
7001  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7002  * array of UV whose memory management is automatically handled by the existing
7003  * facilities for SV's.
7004  *
7005  * Some of the methods should always be private to the implementation, and some
7006  * should eventually be made public */
7007
7008 /* The header definitions are in F<inline_invlist.c> */
7009
7010 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7011 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7012
7013 #define INVLIST_INITIAL_LEN 10
7014
7015 PERL_STATIC_INLINE UV*
7016 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7017 {
7018     /* Returns a pointer to the first element in the inversion list's array.
7019      * This is called upon initialization of an inversion list.  Where the
7020      * array begins depends on whether the list has the code point U+0000
7021      * in it or not.  The other parameter tells it whether the code that
7022      * follows this call is about to put a 0 in the inversion list or not.
7023      * The first element is either the element with 0, if 0, or the next one,
7024      * if 1 */
7025
7026     UV* zero = get_invlist_zero_addr(invlist);
7027
7028     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7029
7030     /* Must be empty */
7031     assert(! *_get_invlist_len_addr(invlist));
7032
7033     /* 1^1 = 0; 1^0 = 1 */
7034     *zero = 1 ^ will_have_0;
7035     return zero + *zero;
7036 }
7037
7038 PERL_STATIC_INLINE UV*
7039 S_invlist_array(pTHX_ SV* const invlist)
7040 {
7041     /* Returns the pointer to the inversion list's array.  Every time the
7042      * length changes, this needs to be called in case malloc or realloc moved
7043      * it */
7044
7045     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7046
7047     /* Must not be empty.  If these fail, you probably didn't check for <len>
7048      * being non-zero before trying to get the array */
7049     assert(*_get_invlist_len_addr(invlist));
7050     assert(*get_invlist_zero_addr(invlist) == 0
7051            || *get_invlist_zero_addr(invlist) == 1);
7052
7053     /* The array begins either at the element reserved for zero if the
7054      * list contains 0 (that element will be set to 0), or otherwise the next
7055      * element (in which case the reserved element will be set to 1). */
7056     return (UV *) (get_invlist_zero_addr(invlist)
7057                    + *get_invlist_zero_addr(invlist));
7058 }
7059
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7062 {
7063     /* Sets the current number of elements stored in the inversion list */
7064
7065     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7066
7067     *_get_invlist_len_addr(invlist) = len;
7068
7069     assert(len <= SvLEN(invlist));
7070
7071     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7072     /* If the list contains U+0000, that element is part of the header,
7073      * and should not be counted as part of the array.  It will contain
7074      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7075      * subtract:
7076      *  SvCUR_set(invlist,
7077      *            TO_INTERNAL_SIZE(len
7078      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7079      * But, this is only valid if len is not 0.  The consequences of not doing
7080      * this is that the memory allocation code may think that 1 more UV is
7081      * being used than actually is, and so might do an unnecessary grow.  That
7082      * seems worth not bothering to make this the precise amount.
7083      *
7084      * Note that when inverting, SvCUR shouldn't change */
7085 }
7086
7087 PERL_STATIC_INLINE IV*
7088 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7089 {
7090     /* Return the address of the UV that is reserved to hold the cached index
7091      * */
7092
7093     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7094
7095     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7096 }
7097
7098 PERL_STATIC_INLINE IV
7099 S_invlist_previous_index(pTHX_ SV* const invlist)
7100 {
7101     /* Returns cached index of previous search */
7102
7103     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7104
7105     return *get_invlist_previous_index_addr(invlist);
7106 }
7107
7108 PERL_STATIC_INLINE void
7109 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7110 {
7111     /* Caches <index> for later retrieval */
7112
7113     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7114
7115     assert(index == 0 || index < (int) _invlist_len(invlist));
7116
7117     *get_invlist_previous_index_addr(invlist) = index;
7118 }
7119
7120 PERL_STATIC_INLINE UV
7121 S_invlist_max(pTHX_ SV* const invlist)
7122 {
7123     /* Returns the maximum number of elements storable in the inversion list's
7124      * array, without having to realloc() */
7125
7126     PERL_ARGS_ASSERT_INVLIST_MAX;
7127
7128     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7129 }
7130
7131 PERL_STATIC_INLINE UV*
7132 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7133 {
7134     /* Return the address of the UV that is reserved to hold 0 if the inversion
7135      * list contains 0.  This has to be the last element of the heading, as the
7136      * list proper starts with either it if 0, or the next element if not.
7137      * (But we force it to contain either 0 or 1) */
7138
7139     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7140
7141     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7142 }
7143
7144 #ifndef PERL_IN_XSUB_RE
7145 SV*
7146 Perl__new_invlist(pTHX_ IV initial_size)
7147 {
7148
7149     /* Return a pointer to a newly constructed inversion list, with enough
7150      * space to store 'initial_size' elements.  If that number is negative, a
7151      * system default is used instead */
7152
7153     SV* new_list;
7154
7155     if (initial_size < 0) {
7156         initial_size = INVLIST_INITIAL_LEN;
7157     }
7158
7159     /* Allocate the initial space */
7160     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7161     invlist_set_len(new_list, 0);
7162
7163     /* Force iterinit() to be used to get iteration to work */
7164     *get_invlist_iter_addr(new_list) = UV_MAX;
7165
7166     /* This should force a segfault if a method doesn't initialize this
7167      * properly */
7168     *get_invlist_zero_addr(new_list) = UV_MAX;
7169
7170     *get_invlist_previous_index_addr(new_list) = 0;
7171     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7172 #if HEADER_LENGTH != 5
7173 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7174 #endif
7175
7176     return new_list;
7177 }
7178 #endif
7179
7180 STATIC SV*
7181 S__new_invlist_C_array(pTHX_ UV* list)
7182 {
7183     /* Return a pointer to a newly constructed inversion list, initialized to
7184      * point to <list>, which has to be in the exact correct inversion list
7185      * form, including internal fields.  Thus this is a dangerous routine that
7186      * should not be used in the wrong hands */
7187
7188     SV* invlist = newSV_type(SVt_PV);
7189
7190     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7191
7192     SvPV_set(invlist, (char *) list);
7193     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7194                                shouldn't touch it */
7195     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7196
7197     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7198         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7199     }
7200
7201     return invlist;
7202 }
7203
7204 STATIC void
7205 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7206 {
7207     /* Grow the maximum size of an inversion list */
7208
7209     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7210
7211     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7212 }
7213
7214 PERL_STATIC_INLINE void
7215 S_invlist_trim(pTHX_ SV* const invlist)
7216 {
7217     PERL_ARGS_ASSERT_INVLIST_TRIM;
7218
7219     /* Change the length of the inversion list to how many entries it currently
7220      * has */
7221
7222     SvPV_shrink_to_cur((SV *) invlist);
7223 }
7224
7225 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7226
7227 STATIC void
7228 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7229 {
7230    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7231     * the end of the inversion list.  The range must be above any existing
7232     * ones. */
7233
7234     UV* array;
7235     UV max = invlist_max(invlist);
7236     UV len = _invlist_len(invlist);
7237
7238     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7239
7240     if (len == 0) { /* Empty lists must be initialized */
7241         array = _invlist_array_init(invlist, start == 0);
7242     }
7243     else {
7244         /* Here, the existing list is non-empty. The current max entry in the
7245          * list is generally the first value not in the set, except when the
7246          * set extends to the end of permissible values, in which case it is
7247          * the first entry in that final set, and so this call is an attempt to
7248          * append out-of-order */
7249
7250         UV final_element = len - 1;
7251         array = invlist_array(invlist);
7252         if (array[final_element] > start
7253             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7254         {
7255             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7256                        array[final_element], start,
7257                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7258         }
7259
7260         /* Here, it is a legal append.  If the new range begins with the first
7261          * value not in the set, it is extending the set, so the new first
7262          * value not in the set is one greater than the newly extended range.
7263          * */
7264         if (array[final_element] == start) {
7265             if (end != UV_MAX) {
7266                 array[final_element] = end + 1;
7267             }
7268             else {
7269                 /* But if the end is the maximum representable on the machine,
7270                  * just let the range that this would extend to have no end */
7271                 invlist_set_len(invlist, len - 1);
7272             }
7273             return;
7274         }
7275     }
7276
7277     /* Here the new range doesn't extend any existing set.  Add it */
7278
7279     len += 2;   /* Includes an element each for the start and end of range */
7280
7281     /* If overflows the existing space, extend, which may cause the array to be
7282      * moved */
7283     if (max < len) {
7284         invlist_extend(invlist, len);
7285         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7286                                            failure in invlist_array() */
7287         array = invlist_array(invlist);
7288     }
7289     else {
7290         invlist_set_len(invlist, len);
7291     }
7292
7293     /* The next item on the list starts the range, the one after that is
7294      * one past the new range.  */
7295     array[len - 2] = start;
7296     if (end != UV_MAX) {
7297         array[len - 1] = end + 1;
7298     }
7299     else {
7300         /* But if the end is the maximum representable on the machine, just let
7301          * the range have no end */
7302         invlist_set_len(invlist, len - 1);
7303     }
7304 }
7305
7306 #ifndef PERL_IN_XSUB_RE
7307
7308 IV
7309 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7310 {
7311     /* Searches the inversion list for the entry that contains the input code
7312      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7313      * return value is the index into the list's array of the range that
7314      * contains <cp> */
7315
7316     IV low = 0;
7317     IV mid;
7318     IV high = _invlist_len(invlist);
7319     const IV highest_element = high - 1;
7320     const UV* array;
7321
7322     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7323
7324     /* If list is empty, return failure. */
7325     if (high == 0) {
7326         return -1;
7327     }
7328
7329     /* If the code point is before the first element, return failure.  (We
7330      * can't combine this with the test above, because we can't get the array
7331      * unless we know the list is non-empty) */
7332     array = invlist_array(invlist);
7333
7334     mid = invlist_previous_index(invlist);
7335     assert(mid >=0 && mid <= highest_element);
7336
7337     /* <mid> contains the cache of the result of the previous call to this
7338      * function (0 the first time).  See if this call is for the same result,
7339      * or if it is for mid-1.  This is under the theory that calls to this
7340      * function will often be for related code points that are near each other.
7341      * And benchmarks show that caching gives better results.  We also test
7342      * here if the code point is within the bounds of the list.  These tests
7343      * replace others that would have had to be made anyway to make sure that
7344      * the array bounds were not exceeded, and give us extra information at the
7345      * same time */
7346     if (cp >= array[mid]) {
7347         if (cp >= array[highest_element]) {
7348             return highest_element;
7349         }
7350
7351         /* Here, array[mid] <= cp < array[highest_element].  This means that
7352          * the final element is not the answer, so can exclude it; it also
7353          * means that <mid> is not the final element, so can refer to 'mid + 1'
7354          * safely */
7355         if (cp < array[mid + 1]) {
7356             return mid;
7357         }
7358         high--;
7359         low = mid + 1;
7360     }
7361     else { /* cp < aray[mid] */
7362         if (cp < array[0]) { /* Fail if outside the array */
7363             return -1;
7364         }
7365         high = mid;
7366         if (cp >= array[mid - 1]) {
7367             goto found_entry;
7368         }
7369     }
7370
7371     /* Binary search.  What we are looking for is <i> such that
7372      *  array[i] <= cp < array[i+1]
7373      * The loop below converges on the i+1.  Note that there may not be an
7374      * (i+1)th element in the array, and things work nonetheless */
7375     while (low < high) {
7376         mid = (low + high) / 2;
7377         assert(mid <= highest_element);
7378         if (array[mid] <= cp) { /* cp >= array[mid] */
7379             low = mid + 1;
7380
7381             /* We could do this extra test to exit the loop early.
7382             if (cp < array[low]) {
7383                 return mid;
7384             }
7385             */
7386         }
7387         else { /* cp < array[mid] */
7388             high = mid;
7389         }
7390     }
7391
7392   found_entry:
7393     high--;
7394     invlist_set_previous_index(invlist, high);
7395     return high;
7396 }
7397
7398 void
7399 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7400 {
7401     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7402      * but is used when the swash has an inversion list.  This makes this much
7403      * faster, as it uses a binary search instead of a linear one.  This is
7404      * intimately tied to that function, and perhaps should be in utf8.c,
7405      * except it is intimately tied to inversion lists as well.  It assumes
7406      * that <swatch> is all 0's on input */
7407
7408     UV current = start;
7409     const IV len = _invlist_len(invlist);
7410     IV i;
7411     const UV * array;
7412
7413     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7414
7415     if (len == 0) { /* Empty inversion list */
7416         return;
7417     }
7418
7419     array = invlist_array(invlist);
7420
7421     /* Find which element it is */
7422     i = _invlist_search(invlist, start);
7423
7424     /* We populate from <start> to <end> */
7425     while (current < end) {
7426         UV upper;
7427
7428         /* The inversion list gives the results for every possible code point
7429          * after the first one in the list.  Only those ranges whose index is
7430          * even are ones that the inversion list matches.  For the odd ones,
7431          * and if the initial code point is not in the list, we have to skip
7432          * forward to the next element */
7433         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7434             i++;
7435             if (i >= len) { /* Finished if beyond the end of the array */
7436                 return;
7437             }
7438             current = array[i];
7439             if (current >= end) {   /* Finished if beyond the end of what we
7440                                        are populating */
7441                 if (LIKELY(end < UV_MAX)) {
7442                     return;
7443                 }
7444
7445                 /* We get here when the upper bound is the maximum
7446                  * representable on the machine, and we are looking for just
7447                  * that code point.  Have to special case it */
7448                 i = len;
7449                 goto join_end_of_list;
7450             }
7451         }
7452         assert(current >= start);
7453
7454         /* The current range ends one below the next one, except don't go past
7455          * <end> */
7456         i++;
7457         upper = (i < len && array[i] < end) ? array[i] : end;
7458
7459         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7460          * for each code point in it */
7461         for (; current < upper; current++) {
7462             const STRLEN offset = (STRLEN)(current - start);
7463             swatch[offset >> 3] |= 1 << (offset & 7);
7464         }
7465
7466     join_end_of_list:
7467
7468         /* Quit if at the end of the list */
7469         if (i >= len) {
7470
7471             /* But first, have to deal with the highest possible code point on
7472              * the platform.  The previous code assumes that <end> is one
7473              * beyond where we want to populate, but that is impossible at the
7474              * platform's infinity, so have to handle it specially */
7475             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7476             {
7477                 const STRLEN offset = (STRLEN)(end - start);
7478                 swatch[offset >> 3] |= 1 << (offset & 7);
7479             }
7480             return;
7481         }
7482
7483         /* Advance to the next range, which will be for code points not in the
7484          * inversion list */
7485         current = array[i];
7486     }
7487
7488     return;
7489 }
7490
7491 void
7492 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7493 {
7494     /* Take the union of two inversion lists and point <output> to it.  *output
7495      * should be defined upon input, and if it points to one of the two lists,
7496      * the reference count to that list will be decremented.  The first list,
7497      * <a>, may be NULL, in which case a copy of the second list is returned.
7498      * If <complement_b> is TRUE, the union is taken of the complement
7499      * (inversion) of <b> instead of b itself.
7500      *
7501      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7502      * Richard Gillam, published by Addison-Wesley, and explained at some
7503      * length there.  The preface says to incorporate its examples into your
7504      * code at your own risk.
7505      *
7506      * The algorithm is like a merge sort.
7507      *
7508      * XXX A potential performance improvement is to keep track as we go along
7509      * if only one of the inputs contributes to the result, meaning the other
7510      * is a subset of that one.  In that case, we can skip the final copy and
7511      * return the larger of the input lists, but then outside code might need
7512      * to keep track of whether to free the input list or not */
7513
7514     UV* array_a;    /* a's array */
7515     UV* array_b;
7516     UV len_a;       /* length of a's array */
7517     UV len_b;
7518
7519     SV* u;                      /* the resulting union */
7520     UV* array_u;
7521     UV len_u;
7522
7523     UV i_a = 0;             /* current index into a's array */
7524     UV i_b = 0;
7525     UV i_u = 0;
7526
7527     /* running count, as explained in the algorithm source book; items are
7528      * stopped accumulating and are output when the count changes to/from 0.
7529      * The count is incremented when we start a range that's in the set, and
7530      * decremented when we start a range that's not in the set.  So its range
7531      * is 0 to 2.  Only when the count is zero is something not in the set.
7532      */
7533     UV count = 0;
7534
7535     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7536     assert(a != b);
7537
7538     /* If either one is empty, the union is the other one */
7539     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7540         if (*output == a) {
7541             if (a != NULL) {
7542                 SvREFCNT_dec(a);
7543             }
7544         }
7545         if (*output != b) {
7546             *output = invlist_clone(b);
7547             if (complement_b) {
7548                 _invlist_invert(*output);
7549             }
7550         } /* else *output already = b; */
7551         return;
7552     }
7553     else if ((len_b = _invlist_len(b)) == 0) {
7554         if (*output == b) {
7555             SvREFCNT_dec(b);
7556         }
7557
7558         /* The complement of an empty list is a list that has everything in it,
7559          * so the union with <a> includes everything too */
7560         if (complement_b) {
7561             if (a == *output) {
7562                 SvREFCNT_dec(a);
7563             }
7564             *output = _new_invlist(1);
7565             _append_range_to_invlist(*output, 0, UV_MAX);
7566         }
7567         else if (*output != a) {
7568             *output = invlist_clone(a);
7569         }
7570         /* else *output already = a; */
7571         return;
7572     }
7573
7574     /* Here both lists exist and are non-empty */
7575     array_a = invlist_array(a);
7576     array_b = invlist_array(b);
7577
7578     /* If are to take the union of 'a' with the complement of b, set it
7579      * up so are looking at b's complement. */
7580     if (complement_b) {
7581
7582         /* To complement, we invert: if the first element is 0, remove it.  To
7583          * do this, we just pretend the array starts one later, and clear the
7584          * flag as we don't have to do anything else later */
7585         if (array_b[0] == 0) {
7586             array_b++;
7587             len_b--;
7588             complement_b = FALSE;
7589         }
7590         else {
7591
7592             /* But if the first element is not zero, we unshift a 0 before the
7593              * array.  The data structure reserves a space for that 0 (which
7594              * should be a '1' right now), so physical shifting is unneeded,
7595              * but temporarily change that element to 0.  Before exiting the
7596              * routine, we must restore the element to '1' */
7597             array_b--;
7598             len_b++;
7599             array_b[0] = 0;
7600         }
7601     }
7602
7603     /* Size the union for the worst case: that the sets are completely
7604      * disjoint */
7605     u = _new_invlist(len_a + len_b);
7606
7607     /* Will contain U+0000 if either component does */
7608     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7609                                       || (len_b > 0 && array_b[0] == 0));
7610
7611     /* Go through each list item by item, stopping when exhausted one of
7612      * them */
7613     while (i_a < len_a && i_b < len_b) {
7614         UV cp;      /* The element to potentially add to the union's array */
7615         bool cp_in_set;   /* is it in the the input list's set or not */
7616
7617         /* We need to take one or the other of the two inputs for the union.
7618          * Since we are merging two sorted lists, we take the smaller of the
7619          * next items.  In case of a tie, we take the one that is in its set
7620          * first.  If we took one not in the set first, it would decrement the
7621          * count, possibly to 0 which would cause it to be output as ending the
7622          * range, and the next time through we would take the same number, and
7623          * output it again as beginning the next range.  By doing it the
7624          * opposite way, there is no possibility that the count will be
7625          * momentarily decremented to 0, and thus the two adjoining ranges will
7626          * be seamlessly merged.  (In a tie and both are in the set or both not
7627          * in the set, it doesn't matter which we take first.) */
7628         if (array_a[i_a] < array_b[i_b]
7629             || (array_a[i_a] == array_b[i_b]
7630                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7631         {
7632             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7633             cp= array_a[i_a++];
7634         }
7635         else {
7636             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7637             cp= array_b[i_b++];
7638         }
7639
7640         /* Here, have chosen which of the two inputs to look at.  Only output
7641          * if the running count changes to/from 0, which marks the
7642          * beginning/end of a range in that's in the set */
7643         if (cp_in_set) {
7644             if (count == 0) {
7645                 array_u[i_u++] = cp;
7646             }
7647             count++;
7648         }
7649         else {
7650             count--;
7651             if (count == 0) {
7652                 array_u[i_u++] = cp;
7653             }
7654         }
7655     }
7656
7657     /* Here, we are finished going through at least one of the lists, which
7658      * means there is something remaining in at most one.  We check if the list
7659      * that hasn't been exhausted is positioned such that we are in the middle
7660      * of a range in its set or not.  (i_a and i_b point to the element beyond
7661      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7662      * is potentially more to output.
7663      * There are four cases:
7664      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7665      *     in the union is entirely from the non-exhausted set.
7666      *  2) Both were in their sets, count is 2.  Nothing further should
7667      *     be output, as everything that remains will be in the exhausted
7668      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7669      *     that
7670      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7671      *     Nothing further should be output because the union includes
7672      *     everything from the exhausted set.  Not decrementing ensures that.
7673      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7674      *     decrementing to 0 insures that we look at the remainder of the
7675      *     non-exhausted set */
7676     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7677         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7678     {
7679         count--;
7680     }
7681
7682     /* The final length is what we've output so far, plus what else is about to
7683      * be output.  (If 'count' is non-zero, then the input list we exhausted
7684      * has everything remaining up to the machine's limit in its set, and hence
7685      * in the union, so there will be no further output. */
7686     len_u = i_u;
7687     if (count == 0) {
7688         /* At most one of the subexpressions will be non-zero */
7689         len_u += (len_a - i_a) + (len_b - i_b);
7690     }
7691
7692     /* Set result to final length, which can change the pointer to array_u, so
7693      * re-find it */
7694     if (len_u != _invlist_len(u)) {
7695         invlist_set_len(u, len_u);
7696         invlist_trim(u);
7697         array_u = invlist_array(u);
7698     }
7699
7700     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7701      * the other) ended with everything above it not in its set.  That means
7702      * that the remaining part of the union is precisely the same as the
7703      * non-exhausted list, so can just copy it unchanged.  (If both list were
7704      * exhausted at the same time, then the operations below will be both 0.)
7705      */
7706     if (count == 0) {
7707         IV copy_count; /* At most one will have a non-zero copy count */
7708         if ((copy_count = len_a - i_a) > 0) {
7709             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7710         }
7711         else if ((copy_count = len_b - i_b) > 0) {
7712             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7713         }
7714     }
7715
7716     /*  We may be removing a reference to one of the inputs */
7717     if (a == *output || b == *output) {
7718         SvREFCNT_dec(*output);
7719     }
7720
7721     /* If we've changed b, restore it */
7722     if (complement_b) {
7723         array_b[0] = 1;
7724     }
7725
7726     *output = u;
7727     return;
7728 }
7729
7730 void
7731 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7732 {
7733     /* Take the intersection of two inversion lists and point <i> to it.  *i
7734      * should be defined upon input, and if it points to one of the two lists,
7735      * the reference count to that list will be decremented.
7736      * If <complement_b> is TRUE, the result will be the intersection of <a>
7737      * and the complement (or inversion) of <b> instead of <b> directly.
7738      *
7739      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7740      * Richard Gillam, published by Addison-Wesley, and explained at some
7741      * length there.  The preface says to incorporate its examples into your
7742      * code at your own risk.  In fact, it had bugs
7743      *
7744      * The algorithm is like a merge sort, and is essentially the same as the
7745      * union above
7746      */
7747
7748     UV* array_a;                /* a's array */
7749     UV* array_b;
7750     UV len_a;   /* length of a's array */
7751     UV len_b;
7752
7753     SV* r;                   /* the resulting intersection */
7754     UV* array_r;
7755     UV len_r;
7756
7757     UV i_a = 0;             /* current index into a's array */
7758     UV i_b = 0;
7759     UV i_r = 0;
7760
7761     /* running count, as explained in the algorithm source book; items are
7762      * stopped accumulating and are output when the count changes to/from 2.
7763      * The count is incremented when we start a range that's in the set, and
7764      * decremented when we start a range that's not in the set.  So its range
7765      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7766      */
7767     UV count = 0;
7768
7769     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7770     assert(a != b);
7771
7772     /* Special case if either one is empty */
7773     len_a = _invlist_len(a);
7774     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7775
7776         if (len_a != 0 && complement_b) {
7777
7778             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7779              * be empty.  Here, also we are using 'b's complement, which hence
7780              * must be every possible code point.  Thus the intersection is
7781              * simply 'a'. */
7782             if (*i != a) {
7783                 *i = invlist_clone(a);
7784
7785                 if (*i == b) {
7786                     SvREFCNT_dec(b);
7787                 }
7788             }
7789             /* else *i is already 'a' */
7790             return;
7791         }
7792
7793         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7794          * intersection must be empty */
7795         if (*i == a) {
7796             SvREFCNT_dec(a);
7797         }
7798         else if (*i == b) {
7799             SvREFCNT_dec(b);
7800         }
7801         *i = _new_invlist(0);
7802         return;
7803     }
7804
7805     /* Here both lists exist and are non-empty */
7806     array_a = invlist_array(a);
7807     array_b = invlist_array(b);
7808
7809     /* If are to take the intersection of 'a' with the complement of b, set it
7810      * up so are looking at b's complement. */
7811     if (complement_b) {
7812
7813         /* To complement, we invert: if the first element is 0, remove it.  To
7814          * do this, we just pretend the array starts one later, and clear the
7815          * flag as we don't have to do anything else later */
7816         if (array_b[0] == 0) {
7817             array_b++;
7818             len_b--;
7819             complement_b = FALSE;
7820         }
7821         else {
7822
7823             /* But if the first element is not zero, we unshift a 0 before the
7824              * array.  The data structure reserves a space for that 0 (which
7825              * should be a '1' right now), so physical shifting is unneeded,
7826              * but temporarily change that element to 0.  Before exiting the
7827              * routine, we must restore the element to '1' */
7828             array_b--;
7829             len_b++;
7830             array_b[0] = 0;
7831         }
7832     }
7833
7834     /* Size the intersection for the worst case: that the intersection ends up
7835      * fragmenting everything to be completely disjoint */
7836     r= _new_invlist(len_a + len_b);
7837
7838     /* Will contain U+0000 iff both components do */
7839     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7840                                      && len_b > 0 && array_b[0] == 0);
7841
7842     /* Go through each list item by item, stopping when exhausted one of
7843      * them */
7844     while (i_a < len_a && i_b < len_b) {
7845         UV cp;      /* The element to potentially add to the intersection's
7846                        array */
7847         bool cp_in_set; /* Is it in the input list's set or not */
7848
7849         /* We need to take one or the other of the two inputs for the
7850          * intersection.  Since we are merging two sorted lists, we take the
7851          * smaller of the next items.  In case of a tie, we take the one that
7852          * is not in its set first (a difference from the union algorithm).  If
7853          * we took one in the set first, it would increment the count, possibly
7854          * to 2 which would cause it to be output as starting a range in the
7855          * intersection, and the next time through we would take that same
7856          * number, and output it again as ending the set.  By doing it the
7857          * opposite of this, there is no possibility that the count will be
7858          * momentarily incremented to 2.  (In a tie and both are in the set or
7859          * both not in the set, it doesn't matter which we take first.) */
7860         if (array_a[i_a] < array_b[i_b]
7861             || (array_a[i_a] == array_b[i_b]
7862                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7863         {
7864             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7865             cp= array_a[i_a++];
7866         }
7867         else {
7868             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7869             cp= array_b[i_b++];
7870         }
7871
7872         /* Here, have chosen which of the two inputs to look at.  Only output
7873          * if the running count changes to/from 2, which marks the
7874          * beginning/end of a range that's in the intersection */
7875         if (cp_in_set) {
7876             count++;
7877             if (count == 2) {
7878                 array_r[i_r++] = cp;
7879             }
7880         }
7881         else {
7882             if (count == 2) {
7883                 array_r[i_r++] = cp;
7884             }
7885             count--;
7886         }
7887     }
7888
7889     /* Here, we are finished going through at least one of the lists, which
7890      * means there is something remaining in at most one.  We check if the list
7891      * that has been exhausted is positioned such that we are in the middle
7892      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7893      * the ones we care about.)  There are four cases:
7894      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7895      *     nothing left in the intersection.
7896      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7897      *     above 2.  What should be output is exactly that which is in the
7898      *     non-exhausted set, as everything it has is also in the intersection
7899      *     set, and everything it doesn't have can't be in the intersection
7900      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7901      *     gets incremented to 2.  Like the previous case, the intersection is
7902      *     everything that remains in the non-exhausted set.
7903      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7904      *     remains 1.  And the intersection has nothing more. */
7905     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7906         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7907     {
7908         count++;
7909     }
7910
7911     /* The final length is what we've output so far plus what else is in the
7912      * intersection.  At most one of the subexpressions below will be non-zero */
7913     len_r = i_r;
7914     if (count >= 2) {
7915         len_r += (len_a - i_a) + (len_b - i_b);
7916     }
7917
7918     /* Set result to final length, which can change the pointer to array_r, so
7919      * re-find it */
7920     if (len_r != _invlist_len(r)) {
7921         invlist_set_len(r, len_r);
7922         invlist_trim(r);
7923         array_r = invlist_array(r);
7924     }
7925
7926     /* Finish outputting any remaining */
7927     if (count >= 2) { /* At most one will have a non-zero copy count */
7928         IV copy_count;
7929         if ((copy_count = len_a - i_a) > 0) {
7930             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7931         }
7932         else if ((copy_count = len_b - i_b) > 0) {
7933             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7934         }
7935     }
7936
7937     /*  We may be removing a reference to one of the inputs */
7938     if (a == *i || b == *i) {
7939         SvREFCNT_dec(*i);
7940     }
7941
7942     /* If we've changed b, restore it */
7943     if (complement_b) {
7944         array_b[0] = 1;
7945     }
7946
7947     *i = r;
7948     return;
7949 }
7950
7951 SV*
7952 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7953 {
7954     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7955      * set.  A pointer to the inversion list is returned.  This may actually be
7956      * a new list, in which case the passed in one has been destroyed.  The
7957      * passed in inversion list can be NULL, in which case a new one is created
7958      * with just the one range in it */
7959
7960     SV* range_invlist;
7961     UV len;
7962
7963     if (invlist == NULL) {
7964         invlist = _new_invlist(2);
7965         len = 0;
7966     }
7967     else {
7968         len = _invlist_len(invlist);
7969     }
7970
7971     /* If comes after the final entry, can just append it to the end */
7972     if (len == 0
7973         || start >= invlist_array(invlist)
7974                                     [_invlist_len(invlist) - 1])
7975     {
7976         _append_range_to_invlist(invlist, start, end);
7977         return invlist;
7978     }
7979
7980     /* Here, can't just append things, create and return a new inversion list
7981      * which is the union of this range and the existing inversion list */
7982     range_invlist = _new_invlist(2);
7983     _append_range_to_invlist(range_invlist, start, end);
7984
7985     _invlist_union(invlist, range_invlist, &invlist);
7986
7987     /* The temporary can be freed */
7988     SvREFCNT_dec(range_invlist);
7989
7990     return invlist;
7991 }
7992
7993 #endif
7994
7995 PERL_STATIC_INLINE SV*
7996 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7997     return _add_range_to_invlist(invlist, cp, cp);
7998 }
7999
8000 #ifndef PERL_IN_XSUB_RE
8001 void
8002 Perl__invlist_invert(pTHX_ SV* const invlist)
8003 {
8004     /* Complement the input inversion list.  This adds a 0 if the list didn't
8005      * have a zero; removes it otherwise.  As described above, the data
8006      * structure is set up so that this is very efficient */
8007
8008     UV* len_pos = _get_invlist_len_addr(invlist);
8009
8010     PERL_ARGS_ASSERT__INVLIST_INVERT;
8011
8012     /* The inverse of matching nothing is matching everything */
8013     if (*len_pos == 0) {
8014         _append_range_to_invlist(invlist, 0, UV_MAX);
8015         return;
8016     }
8017
8018     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8019      * zero element was a 0, so it is being removed, so the length decrements
8020      * by 1; and vice-versa.  SvCUR is unaffected */
8021     if (*get_invlist_zero_addr(invlist) ^= 1) {
8022         (*len_pos)--;
8023     }
8024     else {
8025         (*len_pos)++;
8026     }
8027 }
8028
8029 void
8030 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8031 {
8032     /* Complement the input inversion list (which must be a Unicode property,
8033      * all of which don't match above the Unicode maximum code point.)  And
8034      * Perl has chosen to not have the inversion match above that either.  This
8035      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8036      */
8037
8038     UV len;
8039     UV* array;
8040
8041     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8042
8043     _invlist_invert(invlist);
8044
8045     len = _invlist_len(invlist);
8046
8047     if (len != 0) { /* If empty do nothing */
8048         array = invlist_array(invlist);
8049         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8050             /* Add 0x110000.  First, grow if necessary */
8051             len++;
8052             if (invlist_max(invlist) < len) {
8053                 invlist_extend(invlist, len);
8054                 array = invlist_array(invlist);
8055             }
8056             invlist_set_len(invlist, len);
8057             array[len - 1] = PERL_UNICODE_MAX + 1;
8058         }
8059         else {  /* Remove the 0x110000 */
8060             invlist_set_len(invlist, len - 1);
8061         }
8062     }
8063
8064     return;
8065 }
8066 #endif
8067
8068 PERL_STATIC_INLINE SV*
8069 S_invlist_clone(pTHX_ SV* const invlist)
8070 {
8071
8072     /* Return a new inversion list that is a copy of the input one, which is
8073      * unchanged */
8074
8075     /* Need to allocate extra space to accommodate Perl's addition of a
8076      * trailing NUL to SvPV's, since it thinks they are always strings */
8077     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8078     STRLEN length = SvCUR(invlist);
8079
8080     PERL_ARGS_ASSERT_INVLIST_CLONE;
8081
8082     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8083     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8084
8085     return new_invlist;
8086 }
8087
8088 PERL_STATIC_INLINE UV*
8089 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8090 {
8091     /* Return the address of the UV that contains the current iteration
8092      * position */
8093
8094     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8095
8096     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8097 }
8098
8099 PERL_STATIC_INLINE UV*
8100 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8101 {
8102     /* Return the address of the UV that contains the version id. */
8103
8104     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8105
8106     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8107 }
8108
8109 PERL_STATIC_INLINE void
8110 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8111 {
8112     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8113
8114     *get_invlist_iter_addr(invlist) = 0;
8115 }
8116
8117 STATIC bool
8118 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8119 {
8120     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8121      * This call sets in <*start> and <*end>, the next range in <invlist>.
8122      * Returns <TRUE> if successful and the next call will return the next
8123      * range; <FALSE> if was already at the end of the list.  If the latter,
8124      * <*start> and <*end> are unchanged, and the next call to this function
8125      * will start over at the beginning of the list */
8126
8127     UV* pos = get_invlist_iter_addr(invlist);
8128     UV len = _invlist_len(invlist);
8129     UV *array;
8130
8131     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8132
8133     if (*pos >= len) {
8134         *pos = UV_MAX;  /* Force iternit() to be required next time */
8135         return FALSE;
8136     }
8137
8138     array = invlist_array(invlist);
8139
8140     *start = array[(*pos)++];
8141
8142     if (*pos >= len) {
8143         *end = UV_MAX;
8144     }
8145     else {
8146         *end = array[(*pos)++] - 1;
8147     }
8148
8149     return TRUE;
8150 }
8151
8152 PERL_STATIC_INLINE UV
8153 S_invlist_highest(pTHX_ SV* const invlist)
8154 {
8155     /* Returns the highest code point that matches an inversion list.  This API
8156      * has an ambiguity, as it returns 0 under either the highest is actually
8157      * 0, or if the list is empty.  If this distinction matters to you, check
8158      * for emptiness before calling this function */
8159
8160     UV len = _invlist_len(invlist);
8161     UV *array;
8162
8163     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8164
8165     if (len == 0) {
8166         return 0;
8167     }
8168
8169     array = invlist_array(invlist);
8170
8171     /* The last element in the array in the inversion list always starts a
8172      * range that goes to infinity.  That range may be for code points that are
8173      * matched in the inversion list, or it may be for ones that aren't
8174      * matched.  In the latter case, the highest code point in the set is one
8175      * less than the beginning of this range; otherwise it is the final element
8176      * of this range: infinity */
8177     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8178            ? UV_MAX
8179            : array[len - 1] - 1;
8180 }
8181
8182 #ifndef PERL_IN_XSUB_RE
8183 SV *
8184 Perl__invlist_contents(pTHX_ SV* const invlist)
8185 {
8186     /* Get the contents of an inversion list into a string SV so that they can
8187      * be printed out.  It uses the format traditionally done for debug tracing
8188      */
8189
8190     UV start, end;
8191     SV* output = newSVpvs("\n");
8192
8193     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8194
8195     invlist_iterinit(invlist);
8196     while (invlist_iternext(invlist, &start, &end)) {
8197         if (end == UV_MAX) {
8198             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8199         }
8200         else if (end != start) {
8201             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8202                     start,       end);
8203         }
8204         else {
8205             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8206         }
8207     }
8208
8209     return output;
8210 }
8211 #endif
8212
8213 #if 0
8214 void
8215 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8216 {
8217     /* Dumps out the ranges in an inversion list.  The string 'header'
8218      * if present is output on a line before the first range */
8219
8220     UV start, end;
8221
8222     if (header && strlen(header)) {
8223         PerlIO_printf(Perl_debug_log, "%s\n", header);
8224     }
8225     invlist_iterinit(invlist);
8226     while (invlist_iternext(invlist, &start, &end)) {
8227         if (end == UV_MAX) {
8228             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8229         }
8230         else {
8231             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8232         }
8233     }
8234 }
8235 #endif
8236
8237 #if 0
8238 bool
8239 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8240 {
8241     /* Return a boolean as to if the two passed in inversion lists are
8242      * identical.  The final argument, if TRUE, says to take the complement of
8243      * the second inversion list before doing the comparison */
8244
8245     UV* array_a = invlist_array(a);
8246     UV* array_b = invlist_array(b);
8247     UV len_a = _invlist_len(a);
8248     UV len_b = _invlist_len(b);
8249
8250     UV i = 0;               /* current index into the arrays */
8251     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8252
8253     PERL_ARGS_ASSERT__INVLISTEQ;
8254
8255     /* If are to compare 'a' with the complement of b, set it
8256      * up so are looking at b's complement. */
8257     if (complement_b) {
8258
8259         /* The complement of nothing is everything, so <a> would have to have
8260          * just one element, starting at zero (ending at infinity) */
8261         if (len_b == 0) {
8262             return (len_a == 1 && array_a[0] == 0);
8263         }
8264         else if (array_b[0] == 0) {
8265
8266             /* Otherwise, to complement, we invert.  Here, the first element is
8267              * 0, just remove it.  To do this, we just pretend the array starts
8268              * one later, and clear the flag as we don't have to do anything
8269              * else later */
8270
8271             array_b++;
8272             len_b--;
8273             complement_b = FALSE;
8274         }
8275         else {
8276
8277             /* But if the first element is not zero, we unshift a 0 before the
8278              * array.  The data structure reserves a space for that 0 (which
8279              * should be a '1' right now), so physical shifting is unneeded,
8280              * but temporarily change that element to 0.  Before exiting the
8281              * routine, we must restore the element to '1' */
8282             array_b--;
8283             len_b++;
8284             array_b[0] = 0;
8285         }
8286     }
8287
8288     /* Make sure that the lengths are the same, as well as the final element
8289      * before looping through the remainder.  (Thus we test the length, final,
8290      * and first elements right off the bat) */
8291     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8292         retval = FALSE;
8293     }
8294     else for (i = 0; i < len_a - 1; i++) {
8295         if (array_a[i] != array_b[i]) {
8296             retval = FALSE;
8297             break;
8298         }
8299     }
8300
8301     if (complement_b) {
8302         array_b[0] = 1;
8303     }
8304     return retval;
8305 }
8306 #endif
8307
8308 #undef HEADER_LENGTH
8309 #undef INVLIST_INITIAL_LENGTH
8310 #undef TO_INTERNAL_SIZE
8311 #undef FROM_INTERNAL_SIZE
8312 #undef INVLIST_LEN_OFFSET
8313 #undef INVLIST_ZERO_OFFSET
8314 #undef INVLIST_ITER_OFFSET
8315 #undef INVLIST_VERSION_ID
8316
8317 /* End of inversion list object */
8318
8319 /*
8320  - reg - regular expression, i.e. main body or parenthesized thing
8321  *
8322  * Caller must absorb opening parenthesis.
8323  *
8324  * Combining parenthesis handling with the base level of regular expression
8325  * is a trifle forced, but the need to tie the tails of the branches to what
8326  * follows makes it hard to avoid.
8327  */
8328 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8329 #ifdef DEBUGGING
8330 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8331 #else
8332 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8333 #endif
8334
8335 STATIC regnode *
8336 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8337     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8338 {
8339     dVAR;
8340     regnode *ret;               /* Will be the head of the group. */
8341     regnode *br;
8342     regnode *lastbr;
8343     regnode *ender = NULL;
8344     I32 parno = 0;
8345     I32 flags;
8346     U32 oregflags = RExC_flags;
8347     bool have_branch = 0;
8348     bool is_open = 0;
8349     I32 freeze_paren = 0;
8350     I32 after_freeze = 0;
8351
8352     /* for (?g), (?gc), and (?o) warnings; warning
8353        about (?c) will warn about (?g) -- japhy    */
8354
8355 #define WASTED_O  0x01
8356 #define WASTED_G  0x02
8357 #define WASTED_C  0x04
8358 #define WASTED_GC (0x02|0x04)
8359     I32 wastedflags = 0x00;
8360
8361     char * parse_start = RExC_parse; /* MJD */
8362     char * const oregcomp_parse = RExC_parse;
8363
8364     GET_RE_DEBUG_FLAGS_DECL;
8365
8366     PERL_ARGS_ASSERT_REG;
8367     DEBUG_PARSE("reg ");
8368
8369     *flagp = 0;                         /* Tentatively. */
8370
8371
8372     /* Make an OPEN node, if parenthesized. */
8373     if (paren) {
8374         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8375             char *start_verb = RExC_parse;
8376             STRLEN verb_len = 0;
8377             char *start_arg = NULL;
8378             unsigned char op = 0;
8379             int argok = 1;
8380             int internal_argval = 0; /* internal_argval is only useful if !argok */
8381             while ( *RExC_parse && *RExC_parse != ')' ) {
8382                 if ( *RExC_parse == ':' ) {
8383                     start_arg = RExC_parse + 1;
8384                     break;
8385                 }
8386                 RExC_parse++;
8387             }
8388             ++start_verb;
8389             verb_len = RExC_parse - start_verb;
8390             if ( start_arg ) {
8391                 RExC_parse++;
8392                 while ( *RExC_parse && *RExC_parse != ')' )
8393                     RExC_parse++;
8394                 if ( *RExC_parse != ')' )
8395                     vFAIL("Unterminated verb pattern argument");
8396                 if ( RExC_parse == start_arg )
8397                     start_arg = NULL;
8398             } else {
8399                 if ( *RExC_parse != ')' )
8400                     vFAIL("Unterminated verb pattern");
8401             }
8402
8403             switch ( *start_verb ) {
8404             case 'A':  /* (*ACCEPT) */
8405                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8406                     op = ACCEPT;
8407                     internal_argval = RExC_nestroot;
8408                 }
8409                 break;
8410             case 'C':  /* (*COMMIT) */
8411                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8412                     op = COMMIT;
8413                 break;
8414             case 'F':  /* (*FAIL) */
8415                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8416                     op = OPFAIL;
8417                     argok = 0;
8418                 }
8419                 break;
8420             case ':':  /* (*:NAME) */
8421             case 'M':  /* (*MARK:NAME) */
8422                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8423                     op = MARKPOINT;
8424                     argok = -1;
8425                 }
8426                 break;
8427             case 'P':  /* (*PRUNE) */
8428                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8429                     op = PRUNE;
8430                 break;
8431             case 'S':   /* (*SKIP) */
8432                 if ( memEQs(start_verb,verb_len,"SKIP") )
8433                     op = SKIP;
8434                 break;
8435             case 'T':  /* (*THEN) */
8436                 /* [19:06] <TimToady> :: is then */
8437                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8438                     op = CUTGROUP;
8439                     RExC_seen |= REG_SEEN_CUTGROUP;
8440                 }
8441                 break;
8442             }
8443             if ( ! op ) {
8444                 RExC_parse++;
8445                 vFAIL3("Unknown verb pattern '%.*s'",
8446                     verb_len, start_verb);
8447             }
8448             if ( argok ) {
8449                 if ( start_arg && internal_argval ) {
8450                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8451                         verb_len, start_verb);
8452                 } else if ( argok < 0 && !start_arg ) {
8453                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8454                         verb_len, start_verb);
8455                 } else {
8456                     ret = reganode(pRExC_state, op, internal_argval);
8457                     if ( ! internal_argval && ! SIZE_ONLY ) {
8458                         if (start_arg) {
8459                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8460                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8461                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8462                             ret->flags = 0;
8463                         } else {
8464                             ret->flags = 1;
8465                         }
8466                     }
8467                 }
8468                 if (!internal_argval)
8469                     RExC_seen |= REG_SEEN_VERBARG;
8470             } else if ( start_arg ) {
8471                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8472                         verb_len, start_verb);
8473             } else {
8474                 ret = reg_node(pRExC_state, op);
8475             }
8476             nextchar(pRExC_state);
8477             return ret;
8478         } else
8479         if (*RExC_parse == '?') { /* (?...) */
8480             bool is_logical = 0;
8481             const char * const seqstart = RExC_parse;
8482             bool has_use_defaults = FALSE;
8483
8484             RExC_parse++;
8485             paren = *RExC_parse++;
8486             ret = NULL;                 /* For look-ahead/behind. */
8487             switch (paren) {
8488
8489             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8490                 paren = *RExC_parse++;
8491                 if ( paren == '<')         /* (?P<...>) named capture */
8492                     goto named_capture;
8493                 else if (paren == '>') {   /* (?P>name) named recursion */
8494                     goto named_recursion;
8495                 }
8496                 else if (paren == '=') {   /* (?P=...)  named backref */
8497                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8498                        you change this make sure you change that */
8499                     char* name_start = RExC_parse;
8500                     U32 num = 0;
8501                     SV *sv_dat = reg_scan_name(pRExC_state,
8502                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8503                     if (RExC_parse == name_start || *RExC_parse != ')')
8504                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8505
8506                     if (!SIZE_ONLY) {
8507                         num = add_data( pRExC_state, 1, "S" );
8508                         RExC_rxi->data->data[num]=(void*)sv_dat;
8509                         SvREFCNT_inc_simple_void(sv_dat);
8510                     }
8511                     RExC_sawback = 1;
8512                     ret = reganode(pRExC_state,
8513                                    ((! FOLD)
8514                                      ? NREF
8515                                      : (ASCII_FOLD_RESTRICTED)
8516                                        ? NREFFA
8517                                        : (AT_LEAST_UNI_SEMANTICS)
8518                                          ? NREFFU
8519                                          : (LOC)
8520                                            ? NREFFL
8521                                            : NREFF),
8522                                     num);
8523                     *flagp |= HASWIDTH;
8524
8525                     Set_Node_Offset(ret, parse_start+1);
8526                     Set_Node_Cur_Length(ret); /* MJD */
8527
8528                     nextchar(pRExC_state);
8529                     return ret;
8530                 }
8531                 RExC_parse++;
8532                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8533                 /*NOTREACHED*/
8534             case '<':           /* (?<...) */
8535                 if (*RExC_parse == '!')
8536                     paren = ',';
8537                 else if (*RExC_parse != '=')
8538               named_capture:
8539                 {               /* (?<...>) */
8540                     char *name_start;
8541                     SV *svname;
8542                     paren= '>';
8543             case '\'':          /* (?'...') */
8544                     name_start= RExC_parse;
8545                     svname = reg_scan_name(pRExC_state,
8546                         SIZE_ONLY ?  /* reverse test from the others */
8547                         REG_RSN_RETURN_NAME :
8548                         REG_RSN_RETURN_NULL);
8549                     if (RExC_parse == name_start) {
8550                         RExC_parse++;
8551                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8552                         /*NOTREACHED*/
8553                     }
8554                     if (*RExC_parse != paren)
8555                         vFAIL2("Sequence (?%c... not terminated",
8556                             paren=='>' ? '<' : paren);
8557                     if (SIZE_ONLY) {
8558                         HE *he_str;
8559                         SV *sv_dat = NULL;
8560                         if (!svname) /* shouldn't happen */
8561                             Perl_croak(aTHX_
8562                                 "panic: reg_scan_name returned NULL");
8563                         if (!RExC_paren_names) {
8564                             RExC_paren_names= newHV();
8565                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8566 #ifdef DEBUGGING
8567                             RExC_paren_name_list= newAV();
8568                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8569 #endif
8570                         }
8571                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8572                         if ( he_str )
8573                             sv_dat = HeVAL(he_str);
8574                         if ( ! sv_dat ) {
8575                             /* croak baby croak */
8576                             Perl_croak(aTHX_
8577                                 "panic: paren_name hash element allocation failed");
8578                         } else if ( SvPOK(sv_dat) ) {
8579                             /* (?|...) can mean we have dupes so scan to check
8580                                its already been stored. Maybe a flag indicating
8581                                we are inside such a construct would be useful,
8582                                but the arrays are likely to be quite small, so
8583                                for now we punt -- dmq */
8584                             IV count = SvIV(sv_dat);
8585                             I32 *pv = (I32*)SvPVX(sv_dat);
8586                             IV i;
8587                             for ( i = 0 ; i < count ; i++ ) {
8588                                 if ( pv[i] == RExC_npar ) {
8589                                     count = 0;
8590                                     break;
8591                                 }
8592                             }
8593                             if ( count ) {
8594                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8595                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8596                                 pv[count] = RExC_npar;
8597                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8598                             }
8599                         } else {
8600                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8601                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8602                             SvIOK_on(sv_dat);
8603                             SvIV_set(sv_dat, 1);
8604                         }
8605 #ifdef DEBUGGING
8606                         /* Yes this does cause a memory leak in debugging Perls */
8607                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8608                             SvREFCNT_dec(svname);
8609 #endif
8610
8611                         /*sv_dump(sv_dat);*/
8612                     }
8613                     nextchar(pRExC_state);
8614                     paren = 1;
8615                     goto capturing_parens;
8616                 }
8617                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8618                 RExC_in_lookbehind++;
8619                 RExC_parse++;
8620             case '=':           /* (?=...) */
8621                 RExC_seen_zerolen++;
8622                 break;
8623             case '!':           /* (?!...) */
8624                 RExC_seen_zerolen++;
8625                 if (*RExC_parse == ')') {
8626                     ret=reg_node(pRExC_state, OPFAIL);
8627                     nextchar(pRExC_state);
8628                     return ret;
8629                 }
8630                 break;
8631             case '|':           /* (?|...) */
8632                 /* branch reset, behave like a (?:...) except that
8633                    buffers in alternations share the same numbers */
8634                 paren = ':';
8635                 after_freeze = freeze_paren = RExC_npar;
8636                 break;
8637             case ':':           /* (?:...) */
8638             case '>':           /* (?>...) */
8639                 break;
8640             case '$':           /* (?$...) */
8641             case '@':           /* (?@...) */
8642                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8643                 break;
8644             case '#':           /* (?#...) */
8645                 while (*RExC_parse && *RExC_parse != ')')
8646                     RExC_parse++;
8647                 if (*RExC_parse != ')')
8648                     FAIL("Sequence (?#... not terminated");
8649                 nextchar(pRExC_state);
8650                 *flagp = TRYAGAIN;
8651                 return NULL;
8652             case '0' :           /* (?0) */
8653             case 'R' :           /* (?R) */
8654                 if (*RExC_parse != ')')
8655                     FAIL("Sequence (?R) not terminated");
8656                 ret = reg_node(pRExC_state, GOSTART);
8657                 *flagp |= POSTPONED;
8658                 nextchar(pRExC_state);
8659                 return ret;
8660                 /*notreached*/
8661             { /* named and numeric backreferences */
8662                 I32 num;
8663             case '&':            /* (?&NAME) */
8664                 parse_start = RExC_parse - 1;
8665               named_recursion:
8666                 {
8667                     SV *sv_dat = reg_scan_name(pRExC_state,
8668                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8669                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8670                 }
8671                 goto gen_recurse_regop;
8672                 assert(0); /* NOT REACHED */
8673             case '+':
8674                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8675                     RExC_parse++;
8676                     vFAIL("Illegal pattern");
8677                 }
8678                 goto parse_recursion;
8679                 /* NOT REACHED*/
8680             case '-': /* (?-1) */
8681                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8682                     RExC_parse--; /* rewind to let it be handled later */
8683                     goto parse_flags;
8684                 }
8685                 /*FALLTHROUGH */
8686             case '1': case '2': case '3': case '4': /* (?1) */
8687             case '5': case '6': case '7': case '8': case '9':
8688                 RExC_parse--;
8689               parse_recursion:
8690                 num = atoi(RExC_parse);
8691                 parse_start = RExC_parse - 1; /* MJD */
8692                 if (*RExC_parse == '-')
8693                     RExC_parse++;
8694                 while (isDIGIT(*RExC_parse))
8695                         RExC_parse++;
8696                 if (*RExC_parse!=')')
8697                     vFAIL("Expecting close bracket");
8698
8699               gen_recurse_regop:
8700                 if ( paren == '-' ) {
8701                     /*
8702                     Diagram of capture buffer numbering.
8703                     Top line is the normal capture buffer numbers
8704                     Bottom line is the negative indexing as from
8705                     the X (the (?-2))
8706
8707                     +   1 2    3 4 5 X          6 7
8708                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8709                     -   5 4    3 2 1 X          x x
8710
8711                     */
8712                     num = RExC_npar + num;
8713                     if (num < 1)  {
8714                         RExC_parse++;
8715                         vFAIL("Reference to nonexistent group");
8716                     }
8717                 } else if ( paren == '+' ) {
8718                     num = RExC_npar + num - 1;
8719                 }
8720
8721                 ret = reganode(pRExC_state, GOSUB, num);
8722                 if (!SIZE_ONLY) {
8723                     if (num > (I32)RExC_rx->nparens) {
8724                         RExC_parse++;
8725                         vFAIL("Reference to nonexistent group");
8726                     }
8727                     ARG2L_SET( ret, RExC_recurse_count++);
8728                     RExC_emit++;
8729                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8730                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8731                 } else {
8732                     RExC_size++;
8733                 }
8734                 RExC_seen |= REG_SEEN_RECURSE;
8735                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8736                 Set_Node_Offset(ret, parse_start); /* MJD */
8737
8738                 *flagp |= POSTPONED;
8739                 nextchar(pRExC_state);
8740                 return ret;
8741             } /* named and numeric backreferences */
8742             assert(0); /* NOT REACHED */
8743
8744             case '?':           /* (??...) */
8745                 is_logical = 1;
8746                 if (*RExC_parse != '{') {
8747                     RExC_parse++;
8748                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8749                     /*NOTREACHED*/
8750                 }
8751                 *flagp |= POSTPONED;
8752                 paren = *RExC_parse++;
8753                 /* FALL THROUGH */
8754             case '{':           /* (?{...}) */
8755             {
8756                 U32 n = 0;
8757                 struct reg_code_block *cb;
8758
8759                 RExC_seen_zerolen++;
8760
8761                 if (   !pRExC_state->num_code_blocks
8762                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8763                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8764                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8765                             - RExC_start)
8766                 ) {
8767                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8768                         FAIL("panic: Sequence (?{...}): no code block found\n");
8769                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8770                 }
8771                 /* this is a pre-compiled code block (?{...}) */
8772                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8773                 RExC_parse = RExC_start + cb->end;
8774                 if (!SIZE_ONLY) {
8775                     OP *o = cb->block;
8776                     if (cb->src_regex) {
8777                         n = add_data(pRExC_state, 2, "rl");
8778                         RExC_rxi->data->data[n] =
8779                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8780                         RExC_rxi->data->data[n+1] = (void*)o;
8781                     }
8782                     else {
8783                         n = add_data(pRExC_state, 1,
8784                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8785                         RExC_rxi->data->data[n] = (void*)o;
8786                     }
8787                 }
8788                 pRExC_state->code_index++;
8789                 nextchar(pRExC_state);
8790
8791                 if (is_logical) {
8792                     regnode *eval;
8793                     ret = reg_node(pRExC_state, LOGICAL);
8794                     eval = reganode(pRExC_state, EVAL, n);
8795                     if (!SIZE_ONLY) {
8796                         ret->flags = 2;
8797                         /* for later propagation into (??{}) return value */
8798                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8799                     }
8800                     REGTAIL(pRExC_state, ret, eval);
8801                     /* deal with the length of this later - MJD */
8802                     return ret;
8803                 }
8804                 ret = reganode(pRExC_state, EVAL, n);
8805                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8806                 Set_Node_Offset(ret, parse_start);
8807                 return ret;
8808             }
8809             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8810             {
8811                 int is_define= 0;
8812                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8813                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8814                         || RExC_parse[1] == '<'
8815                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8816                         I32 flag;
8817
8818                         ret = reg_node(pRExC_state, LOGICAL);
8819                         if (!SIZE_ONLY)
8820                             ret->flags = 1;
8821                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8822                         goto insert_if;
8823                     }
8824                 }
8825                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8826                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8827                 {
8828                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8829                     char *name_start= RExC_parse++;
8830                     U32 num = 0;
8831                     SV *sv_dat=reg_scan_name(pRExC_state,
8832                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8833                     if (RExC_parse == name_start || *RExC_parse != ch)
8834                         vFAIL2("Sequence (?(%c... not terminated",
8835                             (ch == '>' ? '<' : ch));
8836                     RExC_parse++;
8837                     if (!SIZE_ONLY) {
8838                         num = add_data( pRExC_state, 1, "S" );
8839                         RExC_rxi->data->data[num]=(void*)sv_dat;
8840                         SvREFCNT_inc_simple_void(sv_dat);
8841                     }
8842                     ret = reganode(pRExC_state,NGROUPP,num);
8843                     goto insert_if_check_paren;
8844                 }
8845                 else if (RExC_parse[0] == 'D' &&
8846                          RExC_parse[1] == 'E' &&
8847                          RExC_parse[2] == 'F' &&
8848                          RExC_parse[3] == 'I' &&
8849                          RExC_parse[4] == 'N' &&
8850                          RExC_parse[5] == 'E')
8851                 {
8852                     ret = reganode(pRExC_state,DEFINEP,0);
8853                     RExC_parse +=6 ;
8854                     is_define = 1;
8855                     goto insert_if_check_paren;
8856                 }
8857                 else if (RExC_parse[0] == 'R') {
8858                     RExC_parse++;
8859                     parno = 0;
8860                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8861                         parno = atoi(RExC_parse++);
8862                         while (isDIGIT(*RExC_parse))
8863                             RExC_parse++;
8864                     } else if (RExC_parse[0] == '&') {
8865                         SV *sv_dat;
8866                         RExC_parse++;
8867                         sv_dat = reg_scan_name(pRExC_state,
8868                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8869                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8870                     }
8871                     ret = reganode(pRExC_state,INSUBP,parno);
8872                     goto insert_if_check_paren;
8873                 }
8874                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8875                     /* (?(1)...) */
8876                     char c;
8877                     parno = atoi(RExC_parse++);
8878
8879                     while (isDIGIT(*RExC_parse))
8880                         RExC_parse++;
8881                     ret = reganode(pRExC_state, GROUPP, parno);
8882
8883                  insert_if_check_paren:
8884                     if ((c = *nextchar(pRExC_state)) != ')')
8885                         vFAIL("Switch condition not recognized");
8886                   insert_if:
8887                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8888                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8889                     if (br == NULL)
8890                         br = reganode(pRExC_state, LONGJMP, 0);
8891                     else
8892                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8893                     c = *nextchar(pRExC_state);
8894                     if (flags&HASWIDTH)
8895                         *flagp |= HASWIDTH;
8896                     if (c == '|') {
8897                         if (is_define)
8898                             vFAIL("(?(DEFINE)....) does not allow branches");
8899                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8900                         regbranch(pRExC_state, &flags, 1,depth+1);
8901                         REGTAIL(pRExC_state, ret, lastbr);
8902                         if (flags&HASWIDTH)
8903                             *flagp |= HASWIDTH;
8904                         c = *nextchar(pRExC_state);
8905                     }
8906                     else
8907                         lastbr = NULL;
8908                     if (c != ')')
8909                         vFAIL("Switch (?(condition)... contains too many branches");
8910                     ender = reg_node(pRExC_state, TAIL);
8911                     REGTAIL(pRExC_state, br, ender);
8912                     if (lastbr) {
8913                         REGTAIL(pRExC_state, lastbr, ender);
8914                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8915                     }
8916                     else
8917                         REGTAIL(pRExC_state, ret, ender);
8918                     RExC_size++; /* XXX WHY do we need this?!!
8919                                     For large programs it seems to be required
8920                                     but I can't figure out why. -- dmq*/
8921                     return ret;
8922                 }
8923                 else {
8924                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8925                 }
8926             }
8927             case 0:
8928                 RExC_parse--; /* for vFAIL to print correctly */
8929                 vFAIL("Sequence (? incomplete");
8930                 break;
8931             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8932                                        that follow */
8933                 has_use_defaults = TRUE;
8934                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8935                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8936                                                 ? REGEX_UNICODE_CHARSET
8937                                                 : REGEX_DEPENDS_CHARSET);
8938                 goto parse_flags;
8939             default:
8940                 --RExC_parse;
8941                 parse_flags:      /* (?i) */
8942             {
8943                 U32 posflags = 0, negflags = 0;
8944                 U32 *flagsp = &posflags;
8945                 char has_charset_modifier = '\0';
8946                 regex_charset cs = get_regex_charset(RExC_flags);
8947                 if (cs == REGEX_DEPENDS_CHARSET
8948                     && (RExC_utf8 || RExC_uni_semantics))
8949                 {
8950                     cs = REGEX_UNICODE_CHARSET;
8951                 }
8952
8953                 while (*RExC_parse) {
8954                     /* && strchr("iogcmsx", *RExC_parse) */
8955                     /* (?g), (?gc) and (?o) are useless here
8956                        and must be globally applied -- japhy */
8957                     switch (*RExC_parse) {
8958                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8959                     case LOCALE_PAT_MOD:
8960                         if (has_charset_modifier) {
8961                             goto excess_modifier;
8962                         }
8963                         else if (flagsp == &negflags) {
8964                             goto neg_modifier;
8965                         }
8966                         cs = REGEX_LOCALE_CHARSET;
8967                         has_charset_modifier = LOCALE_PAT_MOD;
8968                         RExC_contains_locale = 1;
8969                         break;
8970                     case UNICODE_PAT_MOD:
8971                         if (has_charset_modifier) {
8972                             goto excess_modifier;
8973                         }
8974                         else if (flagsp == &negflags) {
8975                             goto neg_modifier;
8976                         }
8977                         cs = REGEX_UNICODE_CHARSET;
8978                         has_charset_modifier = UNICODE_PAT_MOD;
8979                         break;
8980                     case ASCII_RESTRICT_PAT_MOD:
8981                         if (flagsp == &negflags) {
8982                             goto neg_modifier;
8983                         }
8984                         if (has_charset_modifier) {
8985                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8986                                 goto excess_modifier;
8987                             }
8988                             /* Doubled modifier implies more restricted */
8989                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8990                         }
8991                         else {
8992                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8993                         }
8994                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8995                         break;
8996                     case DEPENDS_PAT_MOD:
8997                         if (has_use_defaults) {
8998                             goto fail_modifiers;
8999                         }
9000                         else if (flagsp == &negflags) {
9001                             goto neg_modifier;
9002                         }
9003                         else if (has_charset_modifier) {
9004                             goto excess_modifier;
9005                         }
9006
9007                         /* The dual charset means unicode semantics if the
9008                          * pattern (or target, not known until runtime) are
9009                          * utf8, or something in the pattern indicates unicode
9010                          * semantics */
9011                         cs = (RExC_utf8 || RExC_uni_semantics)
9012                              ? REGEX_UNICODE_CHARSET
9013                              : REGEX_DEPENDS_CHARSET;
9014                         has_charset_modifier = DEPENDS_PAT_MOD;
9015                         break;
9016                     excess_modifier:
9017                         RExC_parse++;
9018                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9019                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9020                         }
9021                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9022                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9023                         }
9024                         else {
9025                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9026                         }
9027                         /*NOTREACHED*/
9028                     neg_modifier:
9029                         RExC_parse++;
9030                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9031                         /*NOTREACHED*/
9032                     case ONCE_PAT_MOD: /* 'o' */
9033                     case GLOBAL_PAT_MOD: /* 'g' */
9034                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9035                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9036                             if (! (wastedflags & wflagbit) ) {
9037                                 wastedflags |= wflagbit;
9038                                 vWARN5(
9039                                     RExC_parse + 1,
9040                                     "Useless (%s%c) - %suse /%c modifier",
9041                                     flagsp == &negflags ? "?-" : "?",
9042                                     *RExC_parse,
9043                                     flagsp == &negflags ? "don't " : "",
9044                                     *RExC_parse
9045                                 );
9046                             }
9047                         }
9048                         break;
9049
9050                     case CONTINUE_PAT_MOD: /* 'c' */
9051                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9052                             if (! (wastedflags & WASTED_C) ) {
9053                                 wastedflags |= WASTED_GC;
9054                                 vWARN3(
9055                                     RExC_parse + 1,
9056                                     "Useless (%sc) - %suse /gc modifier",
9057                                     flagsp == &negflags ? "?-" : "?",
9058                                     flagsp == &negflags ? "don't " : ""
9059                                 );
9060                             }
9061                         }
9062                         break;
9063                     case KEEPCOPY_PAT_MOD: /* 'p' */
9064                         if (flagsp == &negflags) {
9065                             if (SIZE_ONLY)
9066                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9067                         } else {
9068                             *flagsp |= RXf_PMf_KEEPCOPY;
9069                         }
9070                         break;
9071                     case '-':
9072                         /* A flag is a default iff it is following a minus, so
9073                          * if there is a minus, it means will be trying to
9074                          * re-specify a default which is an error */
9075                         if (has_use_defaults || flagsp == &negflags) {
9076             fail_modifiers:
9077                             RExC_parse++;
9078                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9079                             /*NOTREACHED*/
9080                         }
9081                         flagsp = &negflags;
9082                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9083                         break;
9084                     case ':':
9085                         paren = ':';
9086                         /*FALLTHROUGH*/
9087                     case ')':
9088                         RExC_flags |= posflags;
9089                         RExC_flags &= ~negflags;
9090                         set_regex_charset(&RExC_flags, cs);
9091                         if (paren != ':') {
9092                             oregflags |= posflags;
9093                             oregflags &= ~negflags;
9094                             set_regex_charset(&oregflags, cs);
9095                         }
9096                         nextchar(pRExC_state);
9097                         if (paren != ':') {
9098                             *flagp = TRYAGAIN;
9099                             return NULL;
9100                         } else {
9101                             ret = NULL;
9102                             goto parse_rest;
9103                         }
9104                         /*NOTREACHED*/
9105                     default:
9106                         RExC_parse++;
9107                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9108                         /*NOTREACHED*/
9109                     }
9110                     ++RExC_parse;
9111                 }
9112             }} /* one for the default block, one for the switch */
9113         }
9114         else {                  /* (...) */
9115           capturing_parens:
9116             parno = RExC_npar;
9117             RExC_npar++;
9118
9119             ret = reganode(pRExC_state, OPEN, parno);
9120             if (!SIZE_ONLY ){
9121                 if (!RExC_nestroot)
9122                     RExC_nestroot = parno;
9123                 if (RExC_seen & REG_SEEN_RECURSE
9124                     && !RExC_open_parens[parno-1])
9125                 {
9126                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9127                         "Setting open paren #%"IVdf" to %d\n",
9128                         (IV)parno, REG_NODE_NUM(ret)));
9129                     RExC_open_parens[parno-1]= ret;
9130                 }
9131             }
9132             Set_Node_Length(ret, 1); /* MJD */
9133             Set_Node_Offset(ret, RExC_parse); /* MJD */
9134             is_open = 1;
9135         }
9136     }
9137     else                        /* ! paren */
9138         ret = NULL;
9139
9140    parse_rest:
9141     /* Pick up the branches, linking them together. */
9142     parse_start = RExC_parse;   /* MJD */
9143     br = regbranch(pRExC_state, &flags, 1,depth+1);
9144
9145     /*     branch_len = (paren != 0); */
9146
9147     if (br == NULL)
9148         return(NULL);
9149     if (*RExC_parse == '|') {
9150         if (!SIZE_ONLY && RExC_extralen) {
9151             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9152         }
9153         else {                  /* MJD */
9154             reginsert(pRExC_state, BRANCH, br, depth+1);
9155             Set_Node_Length(br, paren != 0);
9156             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9157         }
9158         have_branch = 1;
9159         if (SIZE_ONLY)
9160             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9161     }
9162     else if (paren == ':') {
9163         *flagp |= flags&SIMPLE;
9164     }
9165     if (is_open) {                              /* Starts with OPEN. */
9166         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9167     }
9168     else if (paren != '?')              /* Not Conditional */
9169         ret = br;
9170     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9171     lastbr = br;
9172     while (*RExC_parse == '|') {
9173         if (!SIZE_ONLY && RExC_extralen) {
9174             ender = reganode(pRExC_state, LONGJMP,0);
9175             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9176         }
9177         if (SIZE_ONLY)
9178             RExC_extralen += 2;         /* Account for LONGJMP. */
9179         nextchar(pRExC_state);
9180         if (freeze_paren) {
9181             if (RExC_npar > after_freeze)
9182                 after_freeze = RExC_npar;
9183             RExC_npar = freeze_paren;
9184         }
9185         br = regbranch(pRExC_state, &flags, 0, depth+1);
9186
9187         if (br == NULL)
9188             return(NULL);
9189         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9190         lastbr = br;
9191         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9192     }
9193
9194     if (have_branch || paren != ':') {
9195         /* Make a closing node, and hook it on the end. */
9196         switch (paren) {
9197         case ':':
9198             ender = reg_node(pRExC_state, TAIL);
9199             break;
9200         case 1:
9201             ender = reganode(pRExC_state, CLOSE, parno);
9202             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9203                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9204                         "Setting close paren #%"IVdf" to %d\n",
9205                         (IV)parno, REG_NODE_NUM(ender)));
9206                 RExC_close_parens[parno-1]= ender;
9207                 if (RExC_nestroot == parno)
9208                     RExC_nestroot = 0;
9209             }
9210             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9211             Set_Node_Length(ender,1); /* MJD */
9212             break;
9213         case '<':
9214         case ',':
9215         case '=':
9216         case '!':
9217             *flagp &= ~HASWIDTH;
9218             /* FALL THROUGH */
9219         case '>':
9220             ender = reg_node(pRExC_state, SUCCEED);
9221             break;
9222         case 0:
9223             ender = reg_node(pRExC_state, END);
9224             if (!SIZE_ONLY) {
9225                 assert(!RExC_opend); /* there can only be one! */
9226                 RExC_opend = ender;
9227             }
9228             break;
9229         }
9230         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9231             SV * const mysv_val1=sv_newmortal();
9232             SV * const mysv_val2=sv_newmortal();
9233             DEBUG_PARSE_MSG("lsbr");
9234             regprop(RExC_rx, mysv_val1, lastbr);
9235             regprop(RExC_rx, mysv_val2, ender);
9236             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9237                           SvPV_nolen_const(mysv_val1),
9238                           (IV)REG_NODE_NUM(lastbr),
9239                           SvPV_nolen_const(mysv_val2),
9240                           (IV)REG_NODE_NUM(ender),
9241                           (IV)(ender - lastbr)
9242             );
9243         });
9244         REGTAIL(pRExC_state, lastbr, ender);
9245
9246         if (have_branch && !SIZE_ONLY) {
9247             char is_nothing= 1;
9248             if (depth==1)
9249                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9250
9251             /* Hook the tails of the branches to the closing node. */
9252             for (br = ret; br; br = regnext(br)) {
9253                 const U8 op = PL_regkind[OP(br)];
9254                 if (op == BRANCH) {
9255                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9256                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9257                         is_nothing= 0;
9258                 }
9259                 else if (op == BRANCHJ) {
9260                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9261                     /* for now we always disable this optimisation * /
9262                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9263                     */
9264                         is_nothing= 0;
9265                 }
9266             }
9267             if (is_nothing) {
9268                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9269                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9270                     SV * const mysv_val1=sv_newmortal();
9271                     SV * const mysv_val2=sv_newmortal();
9272                     DEBUG_PARSE_MSG("NADA");
9273                     regprop(RExC_rx, mysv_val1, ret);
9274                     regprop(RExC_rx, mysv_val2, ender);
9275                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9276                                   SvPV_nolen_const(mysv_val1),
9277                                   (IV)REG_NODE_NUM(ret),
9278                                   SvPV_nolen_const(mysv_val2),
9279                                   (IV)REG_NODE_NUM(ender),
9280                                   (IV)(ender - ret)
9281                     );
9282                 });
9283                 OP(br)= NOTHING;
9284                 if (OP(ender) == TAIL) {
9285                     NEXT_OFF(br)= 0;
9286                     RExC_emit= br + 1;
9287                 } else {
9288                     regnode *opt;
9289                     for ( opt= br + 1; opt < ender ; opt++ )
9290                         OP(opt)= OPTIMIZED;
9291                     NEXT_OFF(br)= ender - br;
9292                 }
9293             }
9294         }
9295     }
9296
9297     {
9298         const char *p;
9299         static const char parens[] = "=!<,>";
9300
9301         if (paren && (p = strchr(parens, paren))) {
9302             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9303             int flag = (p - parens) > 1;
9304
9305             if (paren == '>')
9306                 node = SUSPEND, flag = 0;
9307             reginsert(pRExC_state, node,ret, depth+1);
9308             Set_Node_Cur_Length(ret);
9309             Set_Node_Offset(ret, parse_start + 1);
9310             ret->flags = flag;
9311             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9312         }
9313     }
9314
9315     /* Check for proper termination. */
9316     if (paren) {
9317         RExC_flags = oregflags;
9318         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9319             RExC_parse = oregcomp_parse;
9320             vFAIL("Unmatched (");
9321         }
9322     }
9323     else if (!paren && RExC_parse < RExC_end) {
9324         if (*RExC_parse == ')') {
9325             RExC_parse++;
9326             vFAIL("Unmatched )");
9327         }
9328         else
9329             FAIL("Junk on end of regexp");      /* "Can't happen". */
9330         assert(0); /* NOTREACHED */
9331     }
9332
9333     if (RExC_in_lookbehind) {
9334         RExC_in_lookbehind--;
9335     }
9336     if (after_freeze > RExC_npar)
9337         RExC_npar = after_freeze;
9338     return(ret);
9339 }
9340
9341 /*
9342  - regbranch - one alternative of an | operator
9343  *
9344  * Implements the concatenation operator.
9345  */
9346 STATIC regnode *
9347 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9348 {
9349     dVAR;
9350     regnode *ret;
9351     regnode *chain = NULL;
9352     regnode *latest;
9353     I32 flags = 0, c = 0;
9354     GET_RE_DEBUG_FLAGS_DECL;
9355
9356     PERL_ARGS_ASSERT_REGBRANCH;
9357
9358     DEBUG_PARSE("brnc");
9359
9360     if (first)
9361         ret = NULL;
9362     else {
9363         if (!SIZE_ONLY && RExC_extralen)
9364             ret = reganode(pRExC_state, BRANCHJ,0);
9365         else {
9366             ret = reg_node(pRExC_state, BRANCH);
9367             Set_Node_Length(ret, 1);
9368         }
9369     }
9370
9371     if (!first && SIZE_ONLY)
9372         RExC_extralen += 1;                     /* BRANCHJ */
9373
9374     *flagp = WORST;                     /* Tentatively. */
9375
9376     RExC_parse--;
9377     nextchar(pRExC_state);
9378     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9379         flags &= ~TRYAGAIN;
9380         latest = regpiece(pRExC_state, &flags,depth+1);
9381         if (latest == NULL) {
9382             if (flags & TRYAGAIN)
9383                 continue;
9384             return(NULL);
9385         }
9386         else if (ret == NULL)
9387             ret = latest;
9388         *flagp |= flags&(HASWIDTH|POSTPONED);
9389         if (chain == NULL)      /* First piece. */
9390             *flagp |= flags&SPSTART;
9391         else {
9392             RExC_naughty++;
9393             REGTAIL(pRExC_state, chain, latest);
9394         }
9395         chain = latest;
9396         c++;
9397     }
9398     if (chain == NULL) {        /* Loop ran zero times. */
9399         chain = reg_node(pRExC_state, NOTHING);
9400         if (ret == NULL)
9401             ret = chain;
9402     }
9403     if (c == 1) {
9404         *flagp |= flags&SIMPLE;
9405     }
9406
9407     return ret;
9408 }
9409
9410 /*
9411  - regpiece - something followed by possible [*+?]
9412  *
9413  * Note that the branching code sequences used for ? and the general cases
9414  * of * and + are somewhat optimized:  they use the same NOTHING node as
9415  * both the endmarker for their branch list and the body of the last branch.
9416  * It might seem that this node could be dispensed with entirely, but the
9417  * endmarker role is not redundant.
9418  */
9419 STATIC regnode *
9420 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9421 {
9422     dVAR;
9423     regnode *ret;
9424     char op;
9425     char *next;
9426     I32 flags;
9427     const char * const origparse = RExC_parse;
9428     I32 min;
9429     I32 max = REG_INFTY;
9430 #ifdef RE_TRACK_PATTERN_OFFSETS
9431     char *parse_start;
9432 #endif
9433     const char *maxpos = NULL;
9434
9435     /* Save the original in case we change the emitted regop to a FAIL. */
9436     regnode * const orig_emit = RExC_emit;
9437
9438     GET_RE_DEBUG_FLAGS_DECL;
9439
9440     PERL_ARGS_ASSERT_REGPIECE;
9441
9442     DEBUG_PARSE("piec");
9443
9444     ret = regatom(pRExC_state, &flags,depth+1);
9445     if (ret == NULL) {
9446         if (flags & TRYAGAIN)
9447             *flagp |= TRYAGAIN;
9448         return(NULL);
9449     }
9450
9451     op = *RExC_parse;
9452
9453     if (op == '{' && regcurly(RExC_parse)) {
9454         maxpos = NULL;
9455 #ifdef RE_TRACK_PATTERN_OFFSETS
9456         parse_start = RExC_parse; /* MJD */
9457 #endif
9458         next = RExC_parse + 1;
9459         while (isDIGIT(*next) || *next == ',') {
9460             if (*next == ',') {
9461                 if (maxpos)
9462                     break;
9463                 else
9464                     maxpos = next;
9465             }
9466             next++;
9467         }
9468         if (*next == '}') {             /* got one */
9469             if (!maxpos)
9470                 maxpos = next;
9471             RExC_parse++;
9472             min = atoi(RExC_parse);
9473             if (*maxpos == ',')
9474                 maxpos++;
9475             else
9476                 maxpos = RExC_parse;
9477             max = atoi(maxpos);
9478             if (!max && *maxpos != '0')
9479                 max = REG_INFTY;                /* meaning "infinity" */
9480             else if (max >= REG_INFTY)
9481                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9482             RExC_parse = next;
9483             nextchar(pRExC_state);
9484             if (max < min) {    /* If can't match, warn and optimize to fail
9485                                    unconditionally */
9486                 if (SIZE_ONLY) {
9487                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9488
9489                     /* We can't back off the size because we have to reserve
9490                      * enough space for all the things we are about to throw
9491                      * away, but we can shrink it by the ammount we are about
9492                      * to re-use here */
9493                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9494                 }
9495                 else {
9496                     RExC_emit = orig_emit;
9497                 }
9498                 ret = reg_node(pRExC_state, OPFAIL);
9499                 return ret;
9500             }
9501
9502         do_curly:
9503             if ((flags&SIMPLE)) {
9504                 RExC_naughty += 2 + RExC_naughty / 2;
9505                 reginsert(pRExC_state, CURLY, ret, depth+1);
9506                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9507                 Set_Node_Cur_Length(ret);
9508             }
9509             else {
9510                 regnode * const w = reg_node(pRExC_state, WHILEM);
9511
9512                 w->flags = 0;
9513                 REGTAIL(pRExC_state, ret, w);
9514                 if (!SIZE_ONLY && RExC_extralen) {
9515                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9516                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9517                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9518                 }
9519                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9520                                 /* MJD hk */
9521                 Set_Node_Offset(ret, parse_start+1);
9522                 Set_Node_Length(ret,
9523                                 op == '{' ? (RExC_parse - parse_start) : 1);
9524
9525                 if (!SIZE_ONLY && RExC_extralen)
9526                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9527                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9528                 if (SIZE_ONLY)
9529                     RExC_whilem_seen++, RExC_extralen += 3;
9530                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9531             }
9532             ret->flags = 0;
9533
9534             if (min > 0)
9535                 *flagp = WORST;
9536             if (max > 0)
9537                 *flagp |= HASWIDTH;
9538             if (!SIZE_ONLY) {
9539                 ARG1_SET(ret, (U16)min);
9540                 ARG2_SET(ret, (U16)max);
9541             }
9542
9543             goto nest_check;
9544         }
9545     }
9546
9547     if (!ISMULT1(op)) {
9548         *flagp = flags;
9549         return(ret);
9550     }
9551
9552 #if 0                           /* Now runtime fix should be reliable. */
9553
9554     /* if this is reinstated, don't forget to put this back into perldiag:
9555
9556             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9557
9558            (F) The part of the regexp subject to either the * or + quantifier
9559            could match an empty string. The {#} shows in the regular
9560            expression about where the problem was discovered.
9561
9562     */
9563
9564     if (!(flags&HASWIDTH) && op != '?')
9565       vFAIL("Regexp *+ operand could be empty");
9566 #endif
9567
9568 #ifdef RE_TRACK_PATTERN_OFFSETS
9569     parse_start = RExC_parse;
9570 #endif
9571     nextchar(pRExC_state);
9572
9573     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9574
9575     if (op == '*' && (flags&SIMPLE)) {
9576         reginsert(pRExC_state, STAR, ret, depth+1);
9577         ret->flags = 0;
9578         RExC_naughty += 4;
9579     }
9580     else if (op == '*') {
9581         min = 0;
9582         goto do_curly;
9583     }
9584     else if (op == '+' && (flags&SIMPLE)) {
9585         reginsert(pRExC_state, PLUS, ret, depth+1);
9586         ret->flags = 0;
9587         RExC_naughty += 3;
9588     }
9589     else if (op == '+') {
9590         min = 1;
9591         goto do_curly;
9592     }
9593     else if (op == '?') {
9594         min = 0; max = 1;
9595         goto do_curly;
9596     }
9597   nest_check:
9598     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9599         ckWARN3reg(RExC_parse,
9600                    "%.*s matches null string many times",
9601                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9602                    origparse);
9603     }
9604
9605     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9606         nextchar(pRExC_state);
9607         reginsert(pRExC_state, MINMOD, ret, depth+1);
9608         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9609     }
9610 #ifndef REG_ALLOW_MINMOD_SUSPEND
9611     else
9612 #endif
9613     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9614         regnode *ender;
9615         nextchar(pRExC_state);
9616         ender = reg_node(pRExC_state, SUCCEED);
9617         REGTAIL(pRExC_state, ret, ender);
9618         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9619         ret->flags = 0;
9620         ender = reg_node(pRExC_state, TAIL);
9621         REGTAIL(pRExC_state, ret, ender);
9622         /*ret= ender;*/
9623     }
9624
9625     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9626         RExC_parse++;
9627         vFAIL("Nested quantifiers");
9628     }
9629
9630     return(ret);
9631 }
9632
9633 STATIC bool
9634 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9635 {
9636
9637  /* This is expected to be called by a parser routine that has recognized '\N'
9638    and needs to handle the rest. RExC_parse is expected to point at the first
9639    char following the N at the time of the call.  On successful return,
9640    RExC_parse has been updated to point to just after the sequence identified
9641    by this routine, and <*flagp> has been updated.
9642
9643    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9644    character class.
9645
9646    \N may begin either a named sequence, or if outside a character class, mean
9647    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9648    attempted to decide which, and in the case of a named sequence, converted it
9649    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9650    where c1... are the characters in the sequence.  For single-quoted regexes,
9651    the tokenizer passes the \N sequence through unchanged; this code will not
9652    attempt to determine this nor expand those, instead raising a syntax error.
9653    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9654    or there is no '}', it signals that this \N occurrence means to match a
9655    non-newline.
9656
9657    Only the \N{U+...} form should occur in a character class, for the same
9658    reason that '.' inside a character class means to just match a period: it
9659    just doesn't make sense.
9660
9661    The function raises an error (via vFAIL), and doesn't return for various
9662    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9663    success; it returns FALSE otherwise.
9664
9665    If <valuep> is non-null, it means the caller can accept an input sequence
9666    consisting of a just a single code point; <*valuep> is set to that value
9667    if the input is such.
9668
9669    If <node_p> is non-null it signifies that the caller can accept any other
9670    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9671    is set as follows:
9672     1) \N means not-a-NL: points to a newly created REG_ANY node;
9673     2) \N{}:              points to a new NOTHING node;
9674     3) otherwise:         points to a new EXACT node containing the resolved
9675                           string.
9676    Note that FALSE is returned for single code point sequences if <valuep> is
9677    null.
9678  */
9679
9680     char * endbrace;    /* '}' following the name */
9681     char* p;
9682     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9683                            stream */
9684     bool has_multiple_chars; /* true if the input stream contains a sequence of
9685                                 more than one character */
9686
9687     GET_RE_DEBUG_FLAGS_DECL;
9688
9689     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9690
9691     GET_RE_DEBUG_FLAGS;
9692
9693     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9694
9695     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9696      * modifier.  The other meaning does not */
9697     p = (RExC_flags & RXf_PMf_EXTENDED)
9698         ? regwhite( pRExC_state, RExC_parse )
9699         : RExC_parse;
9700
9701     /* Disambiguate between \N meaning a named character versus \N meaning
9702      * [^\n].  The former is assumed when it can't be the latter. */
9703     if (*p != '{' || regcurly(p)) {
9704         RExC_parse = p;
9705         if (! node_p) {
9706             /* no bare \N in a charclass */
9707             if (in_char_class) {
9708                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9709             }
9710             return FALSE;
9711         }
9712         nextchar(pRExC_state);
9713         *node_p = reg_node(pRExC_state, REG_ANY);
9714         *flagp |= HASWIDTH|SIMPLE;
9715         RExC_naughty++;
9716         RExC_parse--;
9717         Set_Node_Length(*node_p, 1); /* MJD */
9718         return TRUE;
9719     }
9720
9721     /* Here, we have decided it should be a named character or sequence */
9722
9723     /* The test above made sure that the next real character is a '{', but
9724      * under the /x modifier, it could be separated by space (or a comment and
9725      * \n) and this is not allowed (for consistency with \x{...} and the
9726      * tokenizer handling of \N{NAME}). */
9727     if (*RExC_parse != '{') {
9728         vFAIL("Missing braces on \\N{}");
9729     }
9730
9731     RExC_parse++;       /* Skip past the '{' */
9732
9733     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9734         || ! (endbrace == RExC_parse            /* nothing between the {} */
9735               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9736                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9737     {
9738         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9739         vFAIL("\\N{NAME} must be resolved by the lexer");
9740     }
9741
9742     if (endbrace == RExC_parse) {   /* empty: \N{} */
9743         bool ret = TRUE;
9744         if (node_p) {
9745             *node_p = reg_node(pRExC_state,NOTHING);
9746         }
9747         else if (in_char_class) {
9748             if (SIZE_ONLY && in_char_class) {
9749                 ckWARNreg(RExC_parse,
9750                         "Ignoring zero length \\N{} in character class"
9751                 );
9752             }
9753             ret = FALSE;
9754         }
9755         else {
9756             return FALSE;
9757         }
9758         nextchar(pRExC_state);
9759         return ret;
9760     }
9761
9762     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9763     RExC_parse += 2;    /* Skip past the 'U+' */
9764
9765     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9766
9767     /* Code points are separated by dots.  If none, there is only one code
9768      * point, and is terminated by the brace */
9769     has_multiple_chars = (endchar < endbrace);
9770
9771     if (valuep && (! has_multiple_chars || in_char_class)) {
9772         /* We only pay attention to the first char of
9773         multichar strings being returned in char classes. I kinda wonder
9774         if this makes sense as it does change the behaviour
9775         from earlier versions, OTOH that behaviour was broken
9776         as well. XXX Solution is to recharacterize as
9777         [rest-of-class]|multi1|multi2... */
9778
9779         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9780         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9781             | PERL_SCAN_DISALLOW_PREFIX
9782             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9783
9784         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9785
9786         /* The tokenizer should have guaranteed validity, but it's possible to
9787          * bypass it by using single quoting, so check */
9788         if (length_of_hex == 0
9789             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9790         {
9791             RExC_parse += length_of_hex;        /* Includes all the valid */
9792             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9793                             ? UTF8SKIP(RExC_parse)
9794                             : 1;
9795             /* Guard against malformed utf8 */
9796             if (RExC_parse >= endchar) {
9797                 RExC_parse = endchar;
9798             }
9799             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9800         }
9801
9802         if (in_char_class && has_multiple_chars) {
9803             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9804         }
9805
9806         RExC_parse = endbrace + 1;
9807     }
9808     else if (! node_p || ! has_multiple_chars) {
9809
9810         /* Here, the input is legal, but not according to the caller's
9811          * options.  We fail without advancing the parse, so that the
9812          * caller can try again */
9813         RExC_parse = p;
9814         return FALSE;
9815     }
9816     else {
9817
9818         /* What is done here is to convert this to a sub-pattern of the form
9819          * (?:\x{char1}\x{char2}...)
9820          * and then call reg recursively.  That way, it retains its atomicness,
9821          * while not having to worry about special handling that some code
9822          * points may have.  toke.c has converted the original Unicode values
9823          * to native, so that we can just pass on the hex values unchanged.  We
9824          * do have to set a flag to keep recoding from happening in the
9825          * recursion */
9826
9827         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9828         STRLEN len;
9829         char *orig_end = RExC_end;
9830         I32 flags;
9831
9832         while (RExC_parse < endbrace) {
9833
9834             /* Convert to notation the rest of the code understands */
9835             sv_catpv(substitute_parse, "\\x{");
9836             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9837             sv_catpv(substitute_parse, "}");
9838
9839             /* Point to the beginning of the next character in the sequence. */
9840             RExC_parse = endchar + 1;
9841             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9842         }
9843         sv_catpv(substitute_parse, ")");
9844
9845         RExC_parse = SvPV(substitute_parse, len);
9846
9847         /* Don't allow empty number */
9848         if (len < 8) {
9849             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9850         }
9851         RExC_end = RExC_parse + len;
9852
9853         /* The values are Unicode, and therefore not subject to recoding */
9854         RExC_override_recoding = 1;
9855
9856         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9857         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9858
9859         RExC_parse = endbrace;
9860         RExC_end = orig_end;
9861         RExC_override_recoding = 0;
9862
9863         nextchar(pRExC_state);
9864     }
9865
9866     return TRUE;
9867 }
9868
9869
9870 /*
9871  * reg_recode
9872  *
9873  * It returns the code point in utf8 for the value in *encp.
9874  *    value: a code value in the source encoding
9875  *    encp:  a pointer to an Encode object
9876  *
9877  * If the result from Encode is not a single character,
9878  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9879  */
9880 STATIC UV
9881 S_reg_recode(pTHX_ const char value, SV **encp)
9882 {
9883     STRLEN numlen = 1;
9884     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9885     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9886     const STRLEN newlen = SvCUR(sv);
9887     UV uv = UNICODE_REPLACEMENT;
9888
9889     PERL_ARGS_ASSERT_REG_RECODE;
9890
9891     if (newlen)
9892         uv = SvUTF8(sv)
9893              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9894              : *(U8*)s;
9895
9896     if (!newlen || numlen != newlen) {
9897         uv = UNICODE_REPLACEMENT;
9898         *encp = NULL;
9899     }
9900     return uv;
9901 }
9902
9903 PERL_STATIC_INLINE U8
9904 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9905 {
9906     U8 op;
9907
9908     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9909
9910     if (! FOLD) {
9911         return EXACT;
9912     }
9913
9914     op = get_regex_charset(RExC_flags);
9915     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9916         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9917                  been, so there is no hole */
9918     }
9919
9920     return op + EXACTF;
9921 }
9922
9923 PERL_STATIC_INLINE void
9924 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9925 {
9926     /* This knows the details about sizing an EXACTish node, setting flags for
9927      * it (by setting <*flagp>, and potentially populating it with a single
9928      * character.
9929      *
9930      * If <len> (the length in bytes) is non-zero, this function assumes that
9931      * the node has already been populated, and just does the sizing.  In this
9932      * case <code_point> should be the final code point that has already been
9933      * placed into the node.  This value will be ignored except that under some
9934      * circumstances <*flagp> is set based on it.
9935      *
9936      * If <len> is zero, the function assumes that the node is to contain only
9937      * the single character given by <code_point> and calculates what <len>
9938      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9939      * additionally will populate the node's STRING with <code_point>, if <len>
9940      * is 0.  In both cases <*flagp> is appropriately set
9941      *
9942      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9943      * folded (the latter only when the rules indicate it can match 'ss') */
9944
9945     bool len_passed_in = cBOOL(len != 0);
9946     U8 character[UTF8_MAXBYTES_CASE+1];
9947
9948     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9949
9950     if (! len_passed_in) {
9951         if (UTF) {
9952             if (FOLD) {
9953                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9954             }
9955             else {
9956                 uvchr_to_utf8( character, code_point);
9957                 len = UTF8SKIP(character);
9958             }
9959         }
9960         else if (! FOLD
9961                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9962                  || ASCII_FOLD_RESTRICTED
9963                  || ! AT_LEAST_UNI_SEMANTICS)
9964         {
9965             *character = (U8) code_point;
9966             len = 1;
9967         }
9968         else {
9969             *character = 's';
9970             *(character + 1) = 's';
9971             len = 2;
9972         }
9973     }
9974
9975     if (SIZE_ONLY) {
9976         RExC_size += STR_SZ(len);
9977     }
9978     else {
9979         RExC_emit += STR_SZ(len);
9980         STR_LEN(node) = len;
9981         if (! len_passed_in) {
9982             Copy((char *) character, STRING(node), len, char);
9983         }
9984     }
9985
9986     *flagp |= HASWIDTH;
9987
9988     /* A single character node is SIMPLE, except for the special-cased SHARP S
9989      * under /di. */
9990     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9991         && (code_point != LATIN_SMALL_LETTER_SHARP_S
9992             || ! FOLD || ! DEPENDS_SEMANTICS))
9993     {
9994         *flagp |= SIMPLE;
9995     }
9996 }
9997
9998 /*
9999  - regatom - the lowest level
10000
10001    Try to identify anything special at the start of the pattern. If there
10002    is, then handle it as required. This may involve generating a single regop,
10003    such as for an assertion; or it may involve recursing, such as to
10004    handle a () structure.
10005
10006    If the string doesn't start with something special then we gobble up
10007    as much literal text as we can.
10008
10009    Once we have been able to handle whatever type of thing started the
10010    sequence, we return.
10011
10012    Note: we have to be careful with escapes, as they can be both literal
10013    and special, and in the case of \10 and friends, context determines which.
10014
10015    A summary of the code structure is:
10016
10017    switch (first_byte) {
10018         cases for each special:
10019             handle this special;
10020             break;
10021         case '\\':
10022             switch (2nd byte) {
10023                 cases for each unambiguous special:
10024                     handle this special;
10025                     break;
10026                 cases for each ambigous special/literal:
10027                     disambiguate;
10028                     if (special)  handle here
10029                     else goto defchar;
10030                 default: // unambiguously literal:
10031                     goto defchar;
10032             }
10033         default:  // is a literal char
10034             // FALL THROUGH
10035         defchar:
10036             create EXACTish node for literal;
10037             while (more input and node isn't full) {
10038                 switch (input_byte) {
10039                    cases for each special;
10040                        make sure parse pointer is set so that the next call to
10041                            regatom will see this special first
10042                        goto loopdone; // EXACTish node terminated by prev. char
10043                    default:
10044                        append char to EXACTISH node;
10045                 }
10046                 get next input byte;
10047             }
10048         loopdone:
10049    }
10050    return the generated node;
10051
10052    Specifically there are two separate switches for handling
10053    escape sequences, with the one for handling literal escapes requiring
10054    a dummy entry for all of the special escapes that are actually handled
10055    by the other.
10056 */
10057
10058 STATIC regnode *
10059 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10060 {
10061     dVAR;
10062     regnode *ret = NULL;
10063     I32 flags;
10064     char *parse_start = RExC_parse;
10065     U8 op;
10066     GET_RE_DEBUG_FLAGS_DECL;
10067     DEBUG_PARSE("atom");
10068     *flagp = WORST;             /* Tentatively. */
10069
10070     PERL_ARGS_ASSERT_REGATOM;
10071
10072 tryagain:
10073     switch ((U8)*RExC_parse) {
10074     case '^':
10075         RExC_seen_zerolen++;
10076         nextchar(pRExC_state);
10077         if (RExC_flags & RXf_PMf_MULTILINE)
10078             ret = reg_node(pRExC_state, MBOL);
10079         else if (RExC_flags & RXf_PMf_SINGLELINE)
10080             ret = reg_node(pRExC_state, SBOL);
10081         else
10082             ret = reg_node(pRExC_state, BOL);
10083         Set_Node_Length(ret, 1); /* MJD */
10084         break;
10085     case '$':
10086         nextchar(pRExC_state);
10087         if (*RExC_parse)
10088             RExC_seen_zerolen++;
10089         if (RExC_flags & RXf_PMf_MULTILINE)
10090             ret = reg_node(pRExC_state, MEOL);
10091         else if (RExC_flags & RXf_PMf_SINGLELINE)
10092             ret = reg_node(pRExC_state, SEOL);
10093         else
10094             ret = reg_node(pRExC_state, EOL);
10095         Set_Node_Length(ret, 1); /* MJD */
10096         break;
10097     case '.':
10098         nextchar(pRExC_state);
10099         if (RExC_flags & RXf_PMf_SINGLELINE)
10100             ret = reg_node(pRExC_state, SANY);
10101         else
10102             ret = reg_node(pRExC_state, REG_ANY);
10103         *flagp |= HASWIDTH|SIMPLE;
10104         RExC_naughty++;
10105         Set_Node_Length(ret, 1); /* MJD */
10106         break;
10107     case '[':
10108     {
10109         char * const oregcomp_parse = ++RExC_parse;
10110         ret = regclass(pRExC_state, flagp,depth+1);
10111         if (*RExC_parse != ']') {
10112             RExC_parse = oregcomp_parse;
10113             vFAIL("Unmatched [");
10114         }
10115         nextchar(pRExC_state);
10116         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10117         break;
10118     }
10119     case '(':
10120         nextchar(pRExC_state);
10121         ret = reg(pRExC_state, 1, &flags,depth+1);
10122         if (ret == NULL) {
10123                 if (flags & TRYAGAIN) {
10124                     if (RExC_parse == RExC_end) {
10125                          /* Make parent create an empty node if needed. */
10126                         *flagp |= TRYAGAIN;
10127                         return(NULL);
10128                     }
10129                     goto tryagain;
10130                 }
10131                 return(NULL);
10132         }
10133         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10134         break;
10135     case '|':
10136     case ')':
10137         if (flags & TRYAGAIN) {
10138             *flagp |= TRYAGAIN;
10139             return NULL;
10140         }
10141         vFAIL("Internal urp");
10142                                 /* Supposed to be caught earlier. */
10143         break;
10144     case '?':
10145     case '+':
10146     case '*':
10147         RExC_parse++;
10148         vFAIL("Quantifier follows nothing");
10149         break;
10150     case '\\':
10151         /* Special Escapes
10152
10153            This switch handles escape sequences that resolve to some kind
10154            of special regop and not to literal text. Escape sequnces that
10155            resolve to literal text are handled below in the switch marked
10156            "Literal Escapes".
10157
10158            Every entry in this switch *must* have a corresponding entry
10159            in the literal escape switch. However, the opposite is not
10160            required, as the default for this switch is to jump to the
10161            literal text handling code.
10162         */
10163         switch ((U8)*++RExC_parse) {
10164         /* Special Escapes */
10165         case 'A':
10166             RExC_seen_zerolen++;
10167             ret = reg_node(pRExC_state, SBOL);
10168             *flagp |= SIMPLE;
10169             goto finish_meta_pat;
10170         case 'G':
10171             ret = reg_node(pRExC_state, GPOS);
10172             RExC_seen |= REG_SEEN_GPOS;
10173             *flagp |= SIMPLE;
10174             goto finish_meta_pat;
10175         case 'K':
10176             RExC_seen_zerolen++;
10177             ret = reg_node(pRExC_state, KEEPS);
10178             *flagp |= SIMPLE;
10179             /* XXX:dmq : disabling in-place substitution seems to
10180              * be necessary here to avoid cases of memory corruption, as
10181              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10182              */
10183             RExC_seen |= REG_SEEN_LOOKBEHIND;
10184             goto finish_meta_pat;
10185         case 'Z':
10186             ret = reg_node(pRExC_state, SEOL);
10187             *flagp |= SIMPLE;
10188             RExC_seen_zerolen++;                /* Do not optimize RE away */
10189             goto finish_meta_pat;
10190         case 'z':
10191             ret = reg_node(pRExC_state, EOS);
10192             *flagp |= SIMPLE;
10193             RExC_seen_zerolen++;                /* Do not optimize RE away */
10194             goto finish_meta_pat;
10195         case 'C':
10196             ret = reg_node(pRExC_state, CANY);
10197             RExC_seen |= REG_SEEN_CANY;
10198             *flagp |= HASWIDTH|SIMPLE;
10199             goto finish_meta_pat;
10200         case 'X':
10201             ret = reg_node(pRExC_state, CLUMP);
10202             *flagp |= HASWIDTH;
10203             goto finish_meta_pat;
10204         case 'w':
10205             op = ALNUM + get_regex_charset(RExC_flags);
10206             if (op > ALNUMA) {  /* /aa is same as /a */
10207                 op = ALNUMA;
10208             }
10209             ret = reg_node(pRExC_state, op);
10210             *flagp |= HASWIDTH|SIMPLE;
10211             goto finish_meta_pat;
10212         case 'W':
10213             op = NALNUM + get_regex_charset(RExC_flags);
10214             if (op > NALNUMA) { /* /aa is same as /a */
10215                 op = NALNUMA;
10216             }
10217             ret = reg_node(pRExC_state, op);
10218             *flagp |= HASWIDTH|SIMPLE;
10219             goto finish_meta_pat;
10220         case 'b':
10221             RExC_seen_zerolen++;
10222             RExC_seen |= REG_SEEN_LOOKBEHIND;
10223             op = BOUND + get_regex_charset(RExC_flags);
10224             if (op > BOUNDA) {  /* /aa is same as /a */
10225                 op = BOUNDA;
10226             }
10227             ret = reg_node(pRExC_state, op);
10228             FLAGS(ret) = get_regex_charset(RExC_flags);
10229             *flagp |= SIMPLE;
10230             goto finish_meta_pat;
10231         case 'B':
10232             RExC_seen_zerolen++;
10233             RExC_seen |= REG_SEEN_LOOKBEHIND;
10234             op = NBOUND + get_regex_charset(RExC_flags);
10235             if (op > NBOUNDA) { /* /aa is same as /a */
10236                 op = NBOUNDA;
10237             }
10238             ret = reg_node(pRExC_state, op);
10239             FLAGS(ret) = get_regex_charset(RExC_flags);
10240             *flagp |= SIMPLE;
10241             goto finish_meta_pat;
10242         case 's':
10243             op = SPACE + get_regex_charset(RExC_flags);
10244             if (op > SPACEA) {  /* /aa is same as /a */
10245                 op = SPACEA;
10246             }
10247             ret = reg_node(pRExC_state, op);
10248             *flagp |= HASWIDTH|SIMPLE;
10249             goto finish_meta_pat;
10250         case 'S':
10251             op = NSPACE + get_regex_charset(RExC_flags);
10252             if (op > NSPACEA) { /* /aa is same as /a */
10253                 op = NSPACEA;
10254             }
10255             ret = reg_node(pRExC_state, op);
10256             *flagp |= HASWIDTH|SIMPLE;
10257             goto finish_meta_pat;
10258         case 'D':
10259             op = NDIGIT;
10260             goto join_D_and_d;
10261         case 'd':
10262             op = DIGIT;
10263         join_D_and_d:
10264             {
10265                 U8 offset = get_regex_charset(RExC_flags);
10266                 if (offset == REGEX_UNICODE_CHARSET) {
10267                     offset = REGEX_DEPENDS_CHARSET;
10268                 }
10269                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10270                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10271                 }
10272                 op += offset;
10273             }
10274             ret = reg_node(pRExC_state, op);
10275             *flagp |= HASWIDTH|SIMPLE;
10276             goto finish_meta_pat;
10277         case 'R':
10278             ret = reg_node(pRExC_state, LNBREAK);
10279             *flagp |= HASWIDTH|SIMPLE;
10280             goto finish_meta_pat;
10281         case 'h':
10282             ret = reg_node(pRExC_state, HORIZWS);
10283             *flagp |= HASWIDTH|SIMPLE;
10284             goto finish_meta_pat;
10285         case 'H':
10286             ret = reg_node(pRExC_state, NHORIZWS);
10287             *flagp |= HASWIDTH|SIMPLE;
10288             goto finish_meta_pat;
10289         case 'v':
10290             ret = reg_node(pRExC_state, VERTWS);
10291             *flagp |= HASWIDTH|SIMPLE;
10292             goto finish_meta_pat;
10293         case 'V':
10294             ret = reg_node(pRExC_state, NVERTWS);
10295             *flagp |= HASWIDTH|SIMPLE;
10296          finish_meta_pat:
10297             nextchar(pRExC_state);
10298             Set_Node_Length(ret, 2); /* MJD */
10299             break;
10300         case 'p':
10301         case 'P':
10302             {
10303                 char* const oldregxend = RExC_end;
10304 #ifdef DEBUGGING
10305                 char* parse_start = RExC_parse - 2;
10306 #endif
10307
10308                 if (RExC_parse[1] == '{') {
10309                   /* a lovely hack--pretend we saw [\pX] instead */
10310                     RExC_end = strchr(RExC_parse, '}');
10311                     if (!RExC_end) {
10312                         const U8 c = (U8)*RExC_parse;
10313                         RExC_parse += 2;
10314                         RExC_end = oldregxend;
10315                         vFAIL2("Missing right brace on \\%c{}", c);
10316                     }
10317                     RExC_end++;
10318                 }
10319                 else {
10320                     RExC_end = RExC_parse + 2;
10321                     if (RExC_end > oldregxend)
10322                         RExC_end = oldregxend;
10323                 }
10324                 RExC_parse--;
10325
10326                 ret = regclass(pRExC_state, flagp,depth+1);
10327
10328                 RExC_end = oldregxend;
10329                 RExC_parse--;
10330
10331                 Set_Node_Offset(ret, parse_start + 2);
10332                 Set_Node_Cur_Length(ret);
10333                 nextchar(pRExC_state);
10334             }
10335             break;
10336         case 'N':
10337             /* Handle \N and \N{NAME} with multiple code points here and not
10338              * below because it can be multicharacter. join_exact() will join
10339              * them up later on.  Also this makes sure that things like
10340              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10341              * The options to the grok function call causes it to fail if the
10342              * sequence is just a single code point.  We then go treat it as
10343              * just another character in the current EXACT node, and hence it
10344              * gets uniform treatment with all the other characters.  The
10345              * special treatment for quantifiers is not needed for such single
10346              * character sequences */
10347             ++RExC_parse;
10348             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10349                 RExC_parse--;
10350                 goto defchar;
10351             }
10352             break;
10353         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10354         parse_named_seq:
10355         {
10356             char ch= RExC_parse[1];
10357             if (ch != '<' && ch != '\'' && ch != '{') {
10358                 RExC_parse++;
10359                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10360             } else {
10361                 /* this pretty much dupes the code for (?P=...) in reg(), if
10362                    you change this make sure you change that */
10363                 char* name_start = (RExC_parse += 2);
10364                 U32 num = 0;
10365                 SV *sv_dat = reg_scan_name(pRExC_state,
10366                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10367                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10368                 if (RExC_parse == name_start || *RExC_parse != ch)
10369                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10370
10371                 if (!SIZE_ONLY) {
10372                     num = add_data( pRExC_state, 1, "S" );
10373                     RExC_rxi->data->data[num]=(void*)sv_dat;
10374                     SvREFCNT_inc_simple_void(sv_dat);
10375                 }
10376
10377                 RExC_sawback = 1;
10378                 ret = reganode(pRExC_state,
10379                                ((! FOLD)
10380                                  ? NREF
10381                                  : (ASCII_FOLD_RESTRICTED)
10382                                    ? NREFFA
10383                                    : (AT_LEAST_UNI_SEMANTICS)
10384                                      ? NREFFU
10385                                      : (LOC)
10386                                        ? NREFFL
10387                                        : NREFF),
10388                                 num);
10389                 *flagp |= HASWIDTH;
10390
10391                 /* override incorrect value set in reganode MJD */
10392                 Set_Node_Offset(ret, parse_start+1);
10393                 Set_Node_Cur_Length(ret); /* MJD */
10394                 nextchar(pRExC_state);
10395
10396             }
10397             break;
10398         }
10399         case 'g':
10400         case '1': case '2': case '3': case '4':
10401         case '5': case '6': case '7': case '8': case '9':
10402             {
10403                 I32 num;
10404                 bool isg = *RExC_parse == 'g';
10405                 bool isrel = 0;
10406                 bool hasbrace = 0;
10407                 if (isg) {
10408                     RExC_parse++;
10409                     if (*RExC_parse == '{') {
10410                         RExC_parse++;
10411                         hasbrace = 1;
10412                     }
10413                     if (*RExC_parse == '-') {
10414                         RExC_parse++;
10415                         isrel = 1;
10416                     }
10417                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10418                         if (isrel) RExC_parse--;
10419                         RExC_parse -= 2;
10420                         goto parse_named_seq;
10421                 }   }
10422                 num = atoi(RExC_parse);
10423                 if (isg && num == 0)
10424                     vFAIL("Reference to invalid group 0");
10425                 if (isrel) {
10426                     num = RExC_npar - num;
10427                     if (num < 1)
10428                         vFAIL("Reference to nonexistent or unclosed group");
10429                 }
10430                 if (!isg && num > 9 && num >= RExC_npar)
10431                     /* Probably a character specified in octal, e.g. \35 */
10432                     goto defchar;
10433                 else {
10434                     char * const parse_start = RExC_parse - 1; /* MJD */
10435                     while (isDIGIT(*RExC_parse))
10436                         RExC_parse++;
10437                     if (parse_start == RExC_parse - 1)
10438                         vFAIL("Unterminated \\g... pattern");
10439                     if (hasbrace) {
10440                         if (*RExC_parse != '}')
10441                             vFAIL("Unterminated \\g{...} pattern");
10442                         RExC_parse++;
10443                     }
10444                     if (!SIZE_ONLY) {
10445                         if (num > (I32)RExC_rx->nparens)
10446                             vFAIL("Reference to nonexistent group");
10447                     }
10448                     RExC_sawback = 1;
10449                     ret = reganode(pRExC_state,
10450                                    ((! FOLD)
10451                                      ? REF
10452                                      : (ASCII_FOLD_RESTRICTED)
10453                                        ? REFFA
10454                                        : (AT_LEAST_UNI_SEMANTICS)
10455                                          ? REFFU
10456                                          : (LOC)
10457                                            ? REFFL
10458                                            : REFF),
10459                                     num);
10460                     *flagp |= HASWIDTH;
10461
10462                     /* override incorrect value set in reganode MJD */
10463                     Set_Node_Offset(ret, parse_start+1);
10464                     Set_Node_Cur_Length(ret); /* MJD */
10465                     RExC_parse--;
10466                     nextchar(pRExC_state);
10467                 }
10468             }
10469             break;
10470         case '\0':
10471             if (RExC_parse >= RExC_end)
10472                 FAIL("Trailing \\");
10473             /* FALL THROUGH */
10474         default:
10475             /* Do not generate "unrecognized" warnings here, we fall
10476                back into the quick-grab loop below */
10477             parse_start--;
10478             goto defchar;
10479         }
10480         break;
10481
10482     case '#':
10483         if (RExC_flags & RXf_PMf_EXTENDED) {
10484             if ( reg_skipcomment( pRExC_state ) )
10485                 goto tryagain;
10486         }
10487         /* FALL THROUGH */
10488
10489     default:
10490
10491             parse_start = RExC_parse - 1;
10492
10493             RExC_parse++;
10494
10495         defchar: {
10496             STRLEN len = 0;
10497             UV ender;
10498             char *p;
10499             char *s;
10500 #define MAX_NODE_STRING_SIZE 127
10501             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10502             char *s0;
10503             U8 upper_parse = MAX_NODE_STRING_SIZE;
10504             STRLEN foldlen;
10505             U8 node_type;
10506             bool next_is_quantifier;
10507             char * oldp = NULL;
10508
10509             /* If a folding node contains only code points that don't
10510              * participate in folds, it can be changed into an EXACT node,
10511              * which allows the optimizer more things to look for */
10512             bool maybe_exact;
10513
10514             ender = 0;
10515             node_type = compute_EXACTish(pRExC_state);
10516             ret = reg_node(pRExC_state, node_type);
10517
10518             /* In pass1, folded, we use a temporary buffer instead of the
10519              * actual node, as the node doesn't exist yet */
10520             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10521
10522             s0 = s;
10523
10524         reparse:
10525
10526             /* We do the EXACTFish to EXACT node only if folding, and not if in
10527              * locale, as whether a character folds or not isn't known until
10528              * runtime */
10529             maybe_exact = FOLD && ! LOC;
10530
10531             /* XXX The node can hold up to 255 bytes, yet this only goes to
10532              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10533              * 255 allows us to not have to worry about overflow due to
10534              * converting to utf8 and fold expansion, but that value is
10535              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10536              * split up by this limit into a single one using the real max of
10537              * 255.  Even at 127, this breaks under rare circumstances.  If
10538              * folding, we do not want to split a node at a character that is a
10539              * non-final in a multi-char fold, as an input string could just
10540              * happen to want to match across the node boundary.  The join
10541              * would solve that problem if the join actually happens.  But a
10542              * series of more than two nodes in a row each of 127 would cause
10543              * the first join to succeed to get to 254, but then there wouldn't
10544              * be room for the next one, which could at be one of those split
10545              * multi-char folds.  I don't know of any fool-proof solution.  One
10546              * could back off to end with only a code point that isn't such a
10547              * non-final, but it is possible for there not to be any in the
10548              * entire node. */
10549             for (p = RExC_parse - 1;
10550                  len < upper_parse && p < RExC_end;
10551                  len++)
10552             {
10553                 oldp = p;
10554
10555                 if (RExC_flags & RXf_PMf_EXTENDED)
10556                     p = regwhite( pRExC_state, p );
10557                 switch ((U8)*p) {
10558                 case '^':
10559                 case '$':
10560                 case '.':
10561                 case '[':
10562                 case '(':
10563                 case ')':
10564                 case '|':
10565                     goto loopdone;
10566                 case '\\':
10567                     /* Literal Escapes Switch
10568
10569                        This switch is meant to handle escape sequences that
10570                        resolve to a literal character.
10571
10572                        Every escape sequence that represents something
10573                        else, like an assertion or a char class, is handled
10574                        in the switch marked 'Special Escapes' above in this
10575                        routine, but also has an entry here as anything that
10576                        isn't explicitly mentioned here will be treated as
10577                        an unescaped equivalent literal.
10578                     */
10579
10580                     switch ((U8)*++p) {
10581                     /* These are all the special escapes. */
10582                     case 'A':             /* Start assertion */
10583                     case 'b': case 'B':   /* Word-boundary assertion*/
10584                     case 'C':             /* Single char !DANGEROUS! */
10585                     case 'd': case 'D':   /* digit class */
10586                     case 'g': case 'G':   /* generic-backref, pos assertion */
10587                     case 'h': case 'H':   /* HORIZWS */
10588                     case 'k': case 'K':   /* named backref, keep marker */
10589                     case 'p': case 'P':   /* Unicode property */
10590                               case 'R':   /* LNBREAK */
10591                     case 's': case 'S':   /* space class */
10592                     case 'v': case 'V':   /* VERTWS */
10593                     case 'w': case 'W':   /* word class */
10594                     case 'X':             /* eXtended Unicode "combining character sequence" */
10595                     case 'z': case 'Z':   /* End of line/string assertion */
10596                         --p;
10597                         goto loopdone;
10598
10599                     /* Anything after here is an escape that resolves to a
10600                        literal. (Except digits, which may or may not)
10601                      */
10602                     case 'n':
10603                         ender = '\n';
10604                         p++;
10605                         break;
10606                     case 'N': /* Handle a single-code point named character. */
10607                         /* The options cause it to fail if a multiple code
10608                          * point sequence.  Handle those in the switch() above
10609                          * */
10610                         RExC_parse = p + 1;
10611                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10612                                             flagp, depth, FALSE))
10613                         {
10614                             RExC_parse = p = oldp;
10615                             goto loopdone;
10616                         }
10617                         p = RExC_parse;
10618                         if (ender > 0xff) {
10619                             REQUIRE_UTF8;
10620                         }
10621                         break;
10622                     case 'r':
10623                         ender = '\r';
10624                         p++;
10625                         break;
10626                     case 't':
10627                         ender = '\t';
10628                         p++;
10629                         break;
10630                     case 'f':
10631                         ender = '\f';
10632                         p++;
10633                         break;
10634                     case 'e':
10635                           ender = ASCII_TO_NATIVE('\033');
10636                         p++;
10637                         break;
10638                     case 'a':
10639                           ender = ASCII_TO_NATIVE('\007');
10640                         p++;
10641                         break;
10642                     case 'o':
10643                         {
10644                             STRLEN brace_len = len;
10645                             UV result;
10646                             const char* error_msg;
10647
10648                             bool valid = grok_bslash_o(p,
10649                                                        &result,
10650                                                        &brace_len,
10651                                                        &error_msg,
10652                                                        1);
10653                             p += brace_len;
10654                             if (! valid) {
10655                                 RExC_parse = p; /* going to die anyway; point
10656                                                    to exact spot of failure */
10657                                 vFAIL(error_msg);
10658                             }
10659                             else
10660                             {
10661                                 ender = result;
10662                             }
10663                             if (PL_encoding && ender < 0x100) {
10664                                 goto recode_encoding;
10665                             }
10666                             if (ender > 0xff) {
10667                                 REQUIRE_UTF8;
10668                             }
10669                             break;
10670                         }
10671                     case 'x':
10672                         {
10673                             STRLEN brace_len = len;
10674                             UV result;
10675                             const char* error_msg;
10676
10677                             bool valid = grok_bslash_x(p,
10678                                                        &result,
10679                                                        &brace_len,
10680                                                        &error_msg,
10681                                                        1);
10682                             p += brace_len;
10683                             if (! valid) {
10684                                 RExC_parse = p; /* going to die anyway; point
10685                                                    to exact spot of failure */
10686                                 vFAIL(error_msg);
10687                             }
10688                             else {
10689                                 ender = result;
10690                             }
10691                             if (PL_encoding && ender < 0x100) {
10692                                 goto recode_encoding;
10693                             }
10694                             if (ender > 0xff) {
10695                                 REQUIRE_UTF8;
10696                             }
10697                             break;
10698                         }
10699                     case 'c':
10700                         p++;
10701                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10702                         break;
10703                     case '0': case '1': case '2': case '3':case '4':
10704                     case '5': case '6': case '7':
10705                         if (*p == '0' ||
10706                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10707                         {
10708                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10709                             STRLEN numlen = 3;
10710                             ender = grok_oct(p, &numlen, &flags, NULL);
10711                             if (ender > 0xff) {
10712                                 REQUIRE_UTF8;
10713                             }
10714                             p += numlen;
10715                         }
10716                         else {
10717                             --p;
10718                             goto loopdone;
10719                         }
10720                         if (PL_encoding && ender < 0x100)
10721                             goto recode_encoding;
10722                         break;
10723                     recode_encoding:
10724                         if (! RExC_override_recoding) {
10725                             SV* enc = PL_encoding;
10726                             ender = reg_recode((const char)(U8)ender, &enc);
10727                             if (!enc && SIZE_ONLY)
10728                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10729                             REQUIRE_UTF8;
10730                         }
10731                         break;
10732                     case '\0':
10733                         if (p >= RExC_end)
10734                             FAIL("Trailing \\");
10735                         /* FALL THROUGH */
10736                     default:
10737                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10738                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10739                         }
10740                         goto normal_default;
10741                     }
10742                     break;
10743                 case '{':
10744                     /* Currently we don't warn when the lbrace is at the start
10745                      * of a construct.  This catches it in the middle of a
10746                      * literal string, or when its the first thing after
10747                      * something like "\b" */
10748                     if (! SIZE_ONLY
10749                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10750                     {
10751                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10752                     }
10753                     /*FALLTHROUGH*/
10754                 default:
10755                   normal_default:
10756                     if (UTF8_IS_START(*p) && UTF) {
10757                         STRLEN numlen;
10758                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10759                                                &numlen, UTF8_ALLOW_DEFAULT);
10760                         p += numlen;
10761                     }
10762                     else
10763                         ender = (U8) *p++;
10764                     break;
10765                 } /* End of switch on the literal */
10766
10767                 /* Here, have looked at the literal character and <ender>
10768                  * contains its ordinal, <p> points to the character after it
10769                  */
10770
10771                 if ( RExC_flags & RXf_PMf_EXTENDED)
10772                     p = regwhite( pRExC_state, p );
10773
10774                 /* If the next thing is a quantifier, it applies to this
10775                  * character only, which means that this character has to be in
10776                  * its own node and can't just be appended to the string in an
10777                  * existing node, so if there are already other characters in
10778                  * the node, close the node with just them, and set up to do
10779                  * this character again next time through, when it will be the
10780                  * only thing in its new node */
10781                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10782                 {
10783                     p = oldp;
10784                     goto loopdone;
10785                 }
10786
10787                 if (FOLD) {
10788                     if (UTF
10789                             /* See comments for join_exact() as to why we fold
10790                              * this non-UTF at compile time */
10791                         || (node_type == EXACTFU
10792                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10793                     {
10794
10795
10796                         /* Prime the casefolded buffer.  Locale rules, which
10797                          * apply only to code points < 256, aren't known until
10798                          * execution, so for them, just output the original
10799                          * character using utf8.  If we start to fold non-UTF
10800                          * patterns, be sure to update join_exact() */
10801                         if (LOC && ender < 256) {
10802                             if (UNI_IS_INVARIANT(ender)) {
10803                                 *s = (U8) ender;
10804                                 foldlen = 1;
10805                             } else {
10806                                 *s = UTF8_TWO_BYTE_HI(ender);
10807                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10808                                 foldlen = 2;
10809                             }
10810                         }
10811                         else {
10812                             UV folded = _to_uni_fold_flags(
10813                                            ender,
10814                                            (U8 *) s,
10815                                            &foldlen,
10816                                            FOLD_FLAGS_FULL
10817                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10818                                                     : (ASCII_FOLD_RESTRICTED)
10819                                                       ? FOLD_FLAGS_NOMIX_ASCII
10820                                                       : 0)
10821                                             );
10822
10823                             /* If this node only contains non-folding code
10824                              * points so far, see if this new one is also
10825                              * non-folding */
10826                             if (maybe_exact) {
10827                                 if (folded != ender) {
10828                                     maybe_exact = FALSE;
10829                                 }
10830                                 else {
10831                                     /* Here the fold is the original; we have
10832                                      * to check further to see if anything
10833                                      * folds to it */
10834                                     if (! PL_utf8_foldable) {
10835                                         SV* swash = swash_init("utf8",
10836                                                            "_Perl_Any_Folds",
10837                                                            &PL_sv_undef, 1, 0);
10838                                         PL_utf8_foldable =
10839                                                     _get_swash_invlist(swash);
10840                                         SvREFCNT_dec(swash);
10841                                     }
10842                                     if (_invlist_contains_cp(PL_utf8_foldable,
10843                                                              ender))
10844                                     {
10845                                         maybe_exact = FALSE;
10846                                     }
10847                                 }
10848                             }
10849                             ender = folded;
10850                         }
10851                         s += foldlen;
10852
10853                         /* The loop increments <len> each time, as all but this
10854                          * path (and the one just below for UTF) through it add
10855                          * a single byte to the EXACTish node.  But this one
10856                          * has changed len to be the correct final value, so
10857                          * subtract one to cancel out the increment that
10858                          * follows */
10859                         len += foldlen - 1;
10860                     }
10861                     else {
10862                         *(s++) = ender;
10863                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10864                     }
10865                 }
10866                 else if (UTF) {
10867                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10868                     if (unilen > 0) {
10869                        s   += unilen;
10870                        len += unilen;
10871                     }
10872
10873                     /* See comment just above for - 1 */
10874                     len--;
10875                 }
10876                 else {
10877                     REGC((char)ender, s++);
10878                 }
10879
10880                 if (next_is_quantifier) {
10881
10882                     /* Here, the next input is a quantifier, and to get here,
10883                      * the current character is the only one in the node.
10884                      * Also, here <len> doesn't include the final byte for this
10885                      * character */
10886                     len++;
10887                     goto loopdone;
10888                 }
10889
10890             } /* End of loop through literal characters */
10891
10892             /* Here we have either exhausted the input or ran out of room in
10893              * the node.  (If we encountered a character that can't be in the
10894              * node, transfer is made directly to <loopdone>, and so we
10895              * wouldn't have fallen off the end of the loop.)  In the latter
10896              * case, we artificially have to split the node into two, because
10897              * we just don't have enough space to hold everything.  This
10898              * creates a problem if the final character participates in a
10899              * multi-character fold in the non-final position, as a match that
10900              * should have occurred won't, due to the way nodes are matched,
10901              * and our artificial boundary.  So back off until we find a non-
10902              * problematic character -- one that isn't at the beginning or
10903              * middle of such a fold.  (Either it doesn't participate in any
10904              * folds, or appears only in the final position of all the folds it
10905              * does participate in.)  A better solution with far fewer false
10906              * positives, and that would fill the nodes more completely, would
10907              * be to actually have available all the multi-character folds to
10908              * test against, and to back-off only far enough to be sure that
10909              * this node isn't ending with a partial one.  <upper_parse> is set
10910              * further below (if we need to reparse the node) to include just
10911              * up through that final non-problematic character that this code
10912              * identifies, so when it is set to less than the full node, we can
10913              * skip the rest of this */
10914             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10915
10916                 const STRLEN full_len = len;
10917
10918                 assert(len >= MAX_NODE_STRING_SIZE);
10919
10920                 /* Here, <s> points to the final byte of the final character.
10921                  * Look backwards through the string until find a non-
10922                  * problematic character */
10923
10924                 if (! UTF) {
10925
10926                     /* These two have no multi-char folds to non-UTF characters
10927                      */
10928                     if (ASCII_FOLD_RESTRICTED || LOC) {
10929                         goto loopdone;
10930                     }
10931
10932                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10933                     len = s - s0 + 1;
10934                 }
10935                 else {
10936                     if (!  PL_NonL1NonFinalFold) {
10937                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10938                                         NonL1_Perl_Non_Final_Folds_invlist);
10939                     }
10940
10941                     /* Point to the first byte of the final character */
10942                     s = (char *) utf8_hop((U8 *) s, -1);
10943
10944                     while (s >= s0) {   /* Search backwards until find
10945                                            non-problematic char */
10946                         if (UTF8_IS_INVARIANT(*s)) {
10947
10948                             /* There are no ascii characters that participate
10949                              * in multi-char folds under /aa.  In EBCDIC, the
10950                              * non-ascii invariants are all control characters,
10951                              * so don't ever participate in any folds. */
10952                             if (ASCII_FOLD_RESTRICTED
10953                                 || ! IS_NON_FINAL_FOLD(*s))
10954                             {
10955                                 break;
10956                             }
10957                         }
10958                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10959
10960                             /* No Latin1 characters participate in multi-char
10961                              * folds under /l */
10962                             if (LOC
10963                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10964                                                                 *s, *(s+1))))
10965                             {
10966                                 break;
10967                             }
10968                         }
10969                         else if (! _invlist_contains_cp(
10970                                         PL_NonL1NonFinalFold,
10971                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10972                         {
10973                             break;
10974                         }
10975
10976                         /* Here, the current character is problematic in that
10977                          * it does occur in the non-final position of some
10978                          * fold, so try the character before it, but have to
10979                          * special case the very first byte in the string, so
10980                          * we don't read outside the string */
10981                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10982                     } /* End of loop backwards through the string */
10983
10984                     /* If there were only problematic characters in the string,
10985                      * <s> will point to before s0, in which case the length
10986                      * should be 0, otherwise include the length of the
10987                      * non-problematic character just found */
10988                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10989                 }
10990
10991                 /* Here, have found the final character, if any, that is
10992                  * non-problematic as far as ending the node without splitting
10993                  * it across a potential multi-char fold.  <len> contains the
10994                  * number of bytes in the node up-to and including that
10995                  * character, or is 0 if there is no such character, meaning
10996                  * the whole node contains only problematic characters.  In
10997                  * this case, give up and just take the node as-is.  We can't
10998                  * do any better */
10999                 if (len == 0) {
11000                     len = full_len;
11001                 } else {
11002
11003                     /* Here, the node does contain some characters that aren't
11004                      * problematic.  If one such is the final character in the
11005                      * node, we are done */
11006                     if (len == full_len) {
11007                         goto loopdone;
11008                     }
11009                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11010
11011                         /* If the final character is problematic, but the
11012                          * penultimate is not, back-off that last character to
11013                          * later start a new node with it */
11014                         p = oldp;
11015                         goto loopdone;
11016                     }
11017
11018                     /* Here, the final non-problematic character is earlier
11019                      * in the input than the penultimate character.  What we do
11020                      * is reparse from the beginning, going up only as far as
11021                      * this final ok one, thus guaranteeing that the node ends
11022                      * in an acceptable character.  The reason we reparse is
11023                      * that we know how far in the character is, but we don't
11024                      * know how to correlate its position with the input parse.
11025                      * An alternate implementation would be to build that
11026                      * correlation as we go along during the original parse,
11027                      * but that would entail extra work for every node, whereas
11028                      * this code gets executed only when the string is too
11029                      * large for the node, and the final two characters are
11030                      * problematic, an infrequent occurrence.  Yet another
11031                      * possible strategy would be to save the tail of the
11032                      * string, and the next time regatom is called, initialize
11033                      * with that.  The problem with this is that unless you
11034                      * back off one more character, you won't be guaranteed
11035                      * regatom will get called again, unless regbranch,
11036                      * regpiece ... are also changed.  If you do back off that
11037                      * extra character, so that there is input guaranteed to
11038                      * force calling regatom, you can't handle the case where
11039                      * just the first character in the node is acceptable.  I
11040                      * (khw) decided to try this method which doesn't have that
11041                      * pitfall; if performance issues are found, we can do a
11042                      * combination of the current approach plus that one */
11043                     upper_parse = len;
11044                     len = 0;
11045                     s = s0;
11046                     goto reparse;
11047                 }
11048             }   /* End of verifying node ends with an appropriate char */
11049
11050         loopdone:   /* Jumped to when encounters something that shouldn't be in
11051                        the node */
11052
11053             /* If 'maybe_exact' is still set here, means there are no
11054              * code points in the node that participate in folds */
11055             if (FOLD && maybe_exact) {
11056                 OP(ret) = EXACT;
11057             }
11058
11059             /* I (khw) don't know if you can get here with zero length, but the
11060              * old code handled this situation by creating a zero-length EXACT
11061              * node.  Might as well be NOTHING instead */
11062             if (len == 0) {
11063                 OP(ret) = NOTHING;
11064             }
11065             else{
11066                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11067             }
11068
11069             RExC_parse = p - 1;
11070             Set_Node_Cur_Length(ret); /* MJD */
11071             nextchar(pRExC_state);
11072             {
11073                 /* len is STRLEN which is unsigned, need to copy to signed */
11074                 IV iv = len;
11075                 if (iv < 0)
11076                     vFAIL("Internal disaster");
11077             }
11078
11079         } /* End of label 'defchar:' */
11080         break;
11081     } /* End of giant switch on input character */
11082
11083     return(ret);
11084 }
11085
11086 STATIC char *
11087 S_regwhite( RExC_state_t *pRExC_state, char *p )
11088 {
11089     const char *e = RExC_end;
11090
11091     PERL_ARGS_ASSERT_REGWHITE;
11092
11093     while (p < e) {
11094         if (isSPACE(*p))
11095             ++p;
11096         else if (*p == '#') {
11097             bool ended = 0;
11098             do {
11099                 if (*p++ == '\n') {
11100                     ended = 1;
11101                     break;
11102                 }
11103             } while (p < e);
11104             if (!ended)
11105                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11106         }
11107         else
11108             break;
11109     }
11110     return p;
11111 }
11112
11113 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11114    Character classes ([:foo:]) can also be negated ([:^foo:]).
11115    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11116    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11117    but trigger failures because they are currently unimplemented. */
11118
11119 #define POSIXCC_DONE(c)   ((c) == ':')
11120 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11121 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11122
11123 STATIC I32
11124 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11125 {
11126     dVAR;
11127     I32 namedclass = OOB_NAMEDCLASS;
11128
11129     PERL_ARGS_ASSERT_REGPPOSIXCC;
11130
11131     if (value == '[' && RExC_parse + 1 < RExC_end &&
11132         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11133         POSIXCC(UCHARAT(RExC_parse))) {
11134         const char c = UCHARAT(RExC_parse);
11135         char* const s = RExC_parse++;
11136
11137         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11138             RExC_parse++;
11139         if (RExC_parse == RExC_end)
11140             /* Grandfather lone [:, [=, [. */
11141             RExC_parse = s;
11142         else {
11143             const char* const t = RExC_parse++; /* skip over the c */
11144             assert(*t == c);
11145
11146             if (UCHARAT(RExC_parse) == ']') {
11147                 const char *posixcc = s + 1;
11148                 RExC_parse++; /* skip over the ending ] */
11149
11150                 if (*s == ':') {
11151                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11152                     const I32 skip = t - posixcc;
11153
11154                     /* Initially switch on the length of the name.  */
11155                     switch (skip) {
11156                     case 4:
11157                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11158                             namedclass = ANYOF_WORDCHAR;
11159                         break;
11160                     case 5:
11161                         /* Names all of length 5.  */
11162                         /* alnum alpha ascii blank cntrl digit graph lower
11163                            print punct space upper  */
11164                         /* Offset 4 gives the best switch position.  */
11165                         switch (posixcc[4]) {
11166                         case 'a':
11167                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11168                                 namedclass = ANYOF_ALPHA;
11169                             break;
11170                         case 'e':
11171                             if (memEQ(posixcc, "spac", 4)) /* space */
11172                                 namedclass = ANYOF_PSXSPC;
11173                             break;
11174                         case 'h':
11175                             if (memEQ(posixcc, "grap", 4)) /* graph */
11176                                 namedclass = ANYOF_GRAPH;
11177                             break;
11178                         case 'i':
11179                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11180                                 namedclass = ANYOF_ASCII;
11181                             break;
11182                         case 'k':
11183                             if (memEQ(posixcc, "blan", 4)) /* blank */
11184                                 namedclass = ANYOF_BLANK;
11185                             break;
11186                         case 'l':
11187                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11188                                 namedclass = ANYOF_CNTRL;
11189                             break;
11190                         case 'm':
11191                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11192                                 namedclass = ANYOF_ALNUMC;
11193                             break;
11194                         case 'r':
11195                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11196                                 namedclass = ANYOF_LOWER;
11197                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11198                                 namedclass = ANYOF_UPPER;
11199                             break;
11200                         case 't':
11201                             if (memEQ(posixcc, "digi", 4)) /* digit */
11202                                 namedclass = ANYOF_DIGIT;
11203                             else if (memEQ(posixcc, "prin", 4)) /* print */
11204                                 namedclass = ANYOF_PRINT;
11205                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11206                                 namedclass = ANYOF_PUNCT;
11207                             break;
11208                         }
11209                         break;
11210                     case 6:
11211                         if (memEQ(posixcc, "xdigit", 6))
11212                             namedclass = ANYOF_XDIGIT;
11213                         break;
11214                     }
11215
11216                     if (namedclass == OOB_NAMEDCLASS)
11217                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11218                                       t - s - 1, s + 1);
11219
11220                     /* The #defines are structured so each complement is +1 to
11221                      * the normal one */
11222                     if (complement) {
11223                         namedclass++;
11224                     }
11225                     assert (posixcc[skip] == ':');
11226                     assert (posixcc[skip+1] == ']');
11227                 } else if (!SIZE_ONLY) {
11228                     /* [[=foo=]] and [[.foo.]] are still future. */
11229
11230                     /* adjust RExC_parse so the warning shows after
11231                        the class closes */
11232                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11233                         RExC_parse++;
11234                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11235                 }
11236             } else {
11237                 /* Maternal grandfather:
11238                  * "[:" ending in ":" but not in ":]" */
11239                 RExC_parse = s;
11240             }
11241         }
11242     }
11243
11244     return namedclass;
11245 }
11246
11247 STATIC void
11248 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11249 {
11250     dVAR;
11251
11252     PERL_ARGS_ASSERT_CHECKPOSIXCC;
11253
11254     if (POSIXCC(UCHARAT(RExC_parse))) {
11255         const char *s = RExC_parse;
11256         const char  c = *s++;
11257
11258         while (isALNUM(*s))
11259             s++;
11260         if (*s && c == *s && s[1] == ']') {
11261             ckWARN3reg(s+2,
11262                        "POSIX syntax [%c %c] belongs inside character classes",
11263                        c, c);
11264
11265             /* [[=foo=]] and [[.foo.]] are still future. */
11266             if (POSIXCC_NOTYET(c)) {
11267                 /* adjust RExC_parse so the error shows after
11268                    the class closes */
11269                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11270                     NOOP;
11271                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11272             }
11273         }
11274     }
11275 }
11276
11277 /* Generate the code to add a full posix character <class> to the bracketed
11278  * character class given by <node>.  (<node> is needed only under locale rules)
11279  * destlist     is the inversion list for non-locale rules that this class is
11280  *              to be added to
11281  * sourcelist   is the ASCII-range inversion list to add under /a rules
11282  * Xsourcelist  is the full Unicode range list to use otherwise. */
11283 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11284     if (LOC) {                                                             \
11285         SV* scratch_list = NULL;                                           \
11286                                                                            \
11287         /* Set this class in the node for runtime matching */              \
11288         ANYOF_CLASS_SET(node, class);                                      \
11289                                                                            \
11290         /* For above Latin1 code points, we use the full Unicode range */  \
11291         _invlist_intersection(PL_AboveLatin1,                              \
11292                               Xsourcelist,                                 \
11293                               &scratch_list);                              \
11294         /* And set the output to it, adding instead if there already is an \
11295          * output.  Checking if <destlist> is NULL first saves an extra    \
11296          * clone.  Its reference count will be decremented at the next     \
11297          * union, etc, or if this is the only instance, at the end of the  \
11298          * routine */                                                      \
11299         if (! destlist) {                                                  \
11300             destlist = scratch_list;                                       \
11301         }                                                                  \
11302         else {                                                             \
11303             _invlist_union(destlist, scratch_list, &destlist);             \
11304             SvREFCNT_dec(scratch_list);                                    \
11305         }                                                                  \
11306     }                                                                      \
11307     else {                                                                 \
11308         /* For non-locale, just add it to any existing list */             \
11309         _invlist_union(destlist,                                           \
11310                        (AT_LEAST_ASCII_RESTRICTED)                         \
11311                            ? sourcelist                                    \
11312                            : Xsourcelist,                                  \
11313                        &destlist);                                         \
11314     }
11315
11316 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11317  */
11318 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11319     if (LOC) {                                                             \
11320         SV* scratch_list = NULL;                                           \
11321         ANYOF_CLASS_SET(node, class);                                      \
11322         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11323         if (! destlist) {                                                  \
11324             destlist = scratch_list;                                       \
11325         }                                                                  \
11326         else {                                                             \
11327             _invlist_union(destlist, scratch_list, &destlist);             \
11328             SvREFCNT_dec(scratch_list);                                    \
11329         }                                                                  \
11330     }                                                                      \
11331     else {                                                                 \
11332         _invlist_union_complement_2nd(destlist,                            \
11333                                     (AT_LEAST_ASCII_RESTRICTED)            \
11334                                         ? sourcelist                       \
11335                                         : Xsourcelist,                     \
11336                                     &destlist);                            \
11337         /* Under /d, everything in the upper half of the Latin1 range      \
11338          * matches this complement */                                      \
11339         if (DEPENDS_SEMANTICS) {                                           \
11340             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11341         }                                                                  \
11342     }
11343
11344 /* Generate the code to add a posix character <class> to the bracketed
11345  * character class given by <node>.  (<node> is needed only under locale rules)
11346  * destlist       is the inversion list for non-locale rules that this class is
11347  *                to be added to
11348  * sourcelist     is the ASCII-range inversion list to add under /a rules
11349  * l1_sourcelist  is the Latin1 range list to use otherwise.
11350  * Xpropertyname  is the name to add to <run_time_list> of the property to
11351  *                specify the code points above Latin1 that will have to be
11352  *                determined at run-time
11353  * run_time_list  is a SV* that contains text names of properties that are to
11354  *                be computed at run time.  This concatenates <Xpropertyname>
11355  *                to it, appropriately
11356  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11357  * time */
11358 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11359                               l1_sourcelist, Xpropertyname, run_time_list) \
11360         /* First, resolve whether to use the ASCII-only list or the L1     \
11361          * list */                                                         \
11362         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11363                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11364                 Xpropertyname, run_time_list)
11365
11366 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11367                 Xpropertyname, run_time_list)                              \
11368     /* If not /a matching, there are going to be code points we will have  \
11369      * to defer to runtime to look-up */                                   \
11370     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11371         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11372     }                                                                      \
11373     if (LOC) {                                                             \
11374         ANYOF_CLASS_SET(node, class);                                      \
11375     }                                                                      \
11376     else {                                                                 \
11377         _invlist_union(destlist, sourcelist, &destlist);                   \
11378     }
11379
11380 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11381  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11382  * otherwise */
11383 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11384        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11385     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11386         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11387     }                                                                      \
11388     else {                                                                 \
11389         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11390         matches_above_unicode = TRUE;                                      \
11391         if (LOC) {                                                         \
11392             ANYOF_CLASS_SET(node, namedclass);                             \
11393         }                                                                  \
11394         else {                                                             \
11395             SV* scratch_list = NULL;                                       \
11396             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11397             if (! destlist) {                                              \
11398                 destlist = scratch_list;                                   \
11399             }                                                              \
11400             else {                                                         \
11401                 _invlist_union(destlist, scratch_list, &destlist);         \
11402                 SvREFCNT_dec(scratch_list);                                \
11403             }                                                              \
11404             if (DEPENDS_SEMANTICS) {                                       \
11405                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11406             }                                                              \
11407         }                                                                  \
11408     }
11409
11410 /* The names of properties whose definitions are not known at compile time are
11411  * stored in this SV, after a constant heading.  So if the length has been
11412  * changed since initialization, then there is a run-time definition. */
11413 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11414
11415 /* This converts the named class defined in regcomp.h to its equivalent class
11416  * number defined in handy.h. */
11417 #define namedclass_to_classnum(class)  ((class) / 2)
11418
11419 STATIC regnode *
11420 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11421 {
11422     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
11423      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11424      * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
11425      * multi-character folds: it will be rewritten following the paradigm of
11426      * this example, where the <multi-fold>s are characters which fold to
11427      * multiple character sequences:
11428      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11429      * gets effectively rewritten as:
11430      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11431      * reg() gets called (recursively) on the rewritten version, and this
11432      * function will return what it constructs.  (Actually the <multi-fold>s
11433      * aren't physically removed from the [abcdefghi], it's just that they are
11434      * ignored in the recursion by means of a a flag:
11435      * <RExC_in_multi_char_class>.)
11436      *
11437      * ANYOF nodes contain a bit map for the first 256 characters, with the
11438      * corresponding bit set if that character is in the list.  For characters
11439      * above 255, a range list or swash is used.  There are extra bits for \w,
11440      * etc. in locale ANYOFs, as what these match is not determinable at
11441      * compile time */
11442
11443     dVAR;
11444     UV nextvalue;
11445     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11446     IV range = 0;
11447     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11448     regnode *ret;
11449     STRLEN numlen;
11450     IV namedclass = OOB_NAMEDCLASS;
11451     char *rangebegin = NULL;
11452     bool need_class = 0;
11453     SV *listsv = NULL;
11454     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11455                                       than just initialized.  */
11456     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11457     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11458                                extended beyond the Latin1 range */
11459     UV element_count = 0;   /* Number of distinct elements in the class.
11460                                Optimizations may be possible if this is tiny */
11461     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11462                                        character; used under /i */
11463     UV n;
11464
11465     /* Unicode properties are stored in a swash; this holds the current one
11466      * being parsed.  If this swash is the only above-latin1 component of the
11467      * character class, an optimization is to pass it directly on to the
11468      * execution engine.  Otherwise, it is set to NULL to indicate that there
11469      * are other things in the class that have to be dealt with at execution
11470      * time */
11471     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11472
11473     /* Set if a component of this character class is user-defined; just passed
11474      * on to the engine */
11475     bool has_user_defined_property = FALSE;
11476
11477     /* inversion list of code points this node matches only when the target
11478      * string is in UTF-8.  (Because is under /d) */
11479     SV* depends_list = NULL;
11480
11481     /* inversion list of code points this node matches.  For much of the
11482      * function, it includes only those that match regardless of the utf8ness
11483      * of the target string */
11484     SV* cp_list = NULL;
11485
11486 #ifdef EBCDIC
11487     /* In a range, counts how many 0-2 of the ends of it came from literals,
11488      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11489     UV literal_endpoint = 0;
11490 #endif
11491     bool invert = FALSE;    /* Is this class to be complemented */
11492
11493     /* Is there any thing like \W or [:^digit:] that matches above the legal
11494      * Unicode range? */
11495     bool runtime_posix_matches_above_Unicode = FALSE;
11496
11497     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11498         case we need to change the emitted regop to an EXACT. */
11499     const char * orig_parse = RExC_parse;
11500     const I32 orig_size = RExC_size;
11501     GET_RE_DEBUG_FLAGS_DECL;
11502
11503     PERL_ARGS_ASSERT_REGCLASS;
11504 #ifndef DEBUGGING
11505     PERL_UNUSED_ARG(depth);
11506 #endif
11507
11508     DEBUG_PARSE("clas");
11509
11510     /* Assume we are going to generate an ANYOF node. */
11511     ret = reganode(pRExC_state, ANYOF, 0);
11512
11513     if (!SIZE_ONLY) {
11514         ANYOF_FLAGS(ret) = 0;
11515     }
11516
11517     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11518         RExC_parse++;
11519         invert = TRUE;
11520         RExC_naughty++;
11521     }
11522
11523     if (SIZE_ONLY) {
11524         RExC_size += ANYOF_SKIP;
11525         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11526     }
11527     else {
11528         RExC_emit += ANYOF_SKIP;
11529         if (LOC) {
11530             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11531         }
11532         listsv = newSVpvs("# comment\n");
11533         initial_listsv_len = SvCUR(listsv);
11534     }
11535
11536     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11537
11538     if (!SIZE_ONLY && POSIXCC(nextvalue))
11539         checkposixcc(pRExC_state);
11540
11541     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11542     if (UCHARAT(RExC_parse) == ']')
11543         goto charclassloop;
11544
11545 parseit:
11546     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11547
11548     charclassloop:
11549
11550         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11551         save_value = value;
11552         save_prevvalue = prevvalue;
11553
11554         if (!range) {
11555             rangebegin = RExC_parse;
11556             element_count++;
11557         }
11558         if (UTF) {
11559             value = utf8n_to_uvchr((U8*)RExC_parse,
11560                                    RExC_end - RExC_parse,
11561                                    &numlen, UTF8_ALLOW_DEFAULT);
11562             RExC_parse += numlen;
11563         }
11564         else
11565             value = UCHARAT(RExC_parse++);
11566
11567         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11568         if (value == '[' && POSIXCC(nextvalue))
11569             namedclass = regpposixcc(pRExC_state, value);
11570         else if (value == '\\') {
11571             if (UTF) {
11572                 value = utf8n_to_uvchr((U8*)RExC_parse,
11573                                    RExC_end - RExC_parse,
11574                                    &numlen, UTF8_ALLOW_DEFAULT);
11575                 RExC_parse += numlen;
11576             }
11577             else
11578                 value = UCHARAT(RExC_parse++);
11579             /* Some compilers cannot handle switching on 64-bit integer
11580              * values, therefore value cannot be an UV.  Yes, this will
11581              * be a problem later if we want switch on Unicode.
11582              * A similar issue a little bit later when switching on
11583              * namedclass. --jhi */
11584             switch ((I32)value) {
11585             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11586             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11587             case 's':   namedclass = ANYOF_SPACE;       break;
11588             case 'S':   namedclass = ANYOF_NSPACE;      break;
11589             case 'd':   namedclass = ANYOF_DIGIT;       break;
11590             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11591             case 'v':   namedclass = ANYOF_VERTWS;      break;
11592             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11593             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11594             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11595             case 'N':  /* Handle \N{NAME} in class */
11596                 {
11597                     /* We only pay attention to the first char of
11598                     multichar strings being returned. I kinda wonder
11599                     if this makes sense as it does change the behaviour
11600                     from earlier versions, OTOH that behaviour was broken
11601                     as well. */
11602                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11603                                       TRUE /* => charclass */))
11604                     {
11605                         goto parseit;
11606                     }
11607                 }
11608                 break;
11609             case 'p':
11610             case 'P':
11611                 {
11612                 char *e;
11613
11614                 /* This routine will handle any undefined properties */
11615                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11616
11617                 if (RExC_parse >= RExC_end)
11618                     vFAIL2("Empty \\%c{}", (U8)value);
11619                 if (*RExC_parse == '{') {
11620                     const U8 c = (U8)value;
11621                     e = strchr(RExC_parse++, '}');
11622                     if (!e)
11623                         vFAIL2("Missing right brace on \\%c{}", c);
11624                     while (isSPACE(UCHARAT(RExC_parse)))
11625                         RExC_parse++;
11626                     if (e == RExC_parse)
11627                         vFAIL2("Empty \\%c{}", c);
11628                     n = e - RExC_parse;
11629                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11630                         n--;
11631                 }
11632                 else {
11633                     e = RExC_parse;
11634                     n = 1;
11635                 }
11636                 if (!SIZE_ONLY) {
11637                     SV* invlist;
11638                     char* name;
11639
11640                     if (UCHARAT(RExC_parse) == '^') {
11641                          RExC_parse++;
11642                          n--;
11643                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11644                          while (isSPACE(UCHARAT(RExC_parse))) {
11645                               RExC_parse++;
11646                               n--;
11647                          }
11648                     }
11649                     /* Try to get the definition of the property into
11650                      * <invlist>.  If /i is in effect, the effective property
11651                      * will have its name be <__NAME_i>.  The design is
11652                      * discussed in commit
11653                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11654                     Newx(name, n + sizeof("_i__\n"), char);
11655
11656                     sprintf(name, "%s%.*s%s\n",
11657                                     (FOLD) ? "__" : "",
11658                                     (int)n,
11659                                     RExC_parse,
11660                                     (FOLD) ? "_i" : ""
11661                     );
11662
11663                     /* Look up the property name, and get its swash and
11664                      * inversion list, if the property is found  */
11665                     if (swash) {
11666                         SvREFCNT_dec(swash);
11667                     }
11668                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11669                                              1, /* binary */
11670                                              0, /* not tr/// */
11671                                              NULL, /* No inversion list */
11672                                              &swash_init_flags
11673                                             );
11674                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11675                         if (swash) {
11676                             SvREFCNT_dec(swash);
11677                             swash = NULL;
11678                         }
11679
11680                         /* Here didn't find it.  It could be a user-defined
11681                          * property that will be available at run-time.  Add it
11682                          * to the list to look up then */
11683                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11684                                         (value == 'p' ? '+' : '!'),
11685                                         name);
11686                         has_user_defined_property = TRUE;
11687
11688                         /* We don't know yet, so have to assume that the
11689                          * property could match something in the Latin1 range,
11690                          * hence something that isn't utf8.  Note that this
11691                          * would cause things in <depends_list> to match
11692                          * inappropriately, except that any \p{}, including
11693                          * this one forces Unicode semantics, which means there
11694                          * is <no depends_list> */
11695                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11696                     }
11697                     else {
11698
11699                         /* Here, did get the swash and its inversion list.  If
11700                          * the swash is from a user-defined property, then this
11701                          * whole character class should be regarded as such */
11702                         has_user_defined_property =
11703                                     (swash_init_flags
11704                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11705
11706                         /* Invert if asking for the complement */
11707                         if (value == 'P') {
11708                             _invlist_union_complement_2nd(properties,
11709                                                           invlist,
11710                                                           &properties);
11711
11712                             /* The swash can't be used as-is, because we've
11713                              * inverted things; delay removing it to here after
11714                              * have copied its invlist above */
11715                             SvREFCNT_dec(swash);
11716                             swash = NULL;
11717                         }
11718                         else {
11719                             _invlist_union(properties, invlist, &properties);
11720                         }
11721                     }
11722                     Safefree(name);
11723                 }
11724                 RExC_parse = e + 1;
11725                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11726
11727                 /* \p means they want Unicode semantics */
11728                 RExC_uni_semantics = 1;
11729                 }
11730                 break;
11731             case 'n':   value = '\n';                   break;
11732             case 'r':   value = '\r';                   break;
11733             case 't':   value = '\t';                   break;
11734             case 'f':   value = '\f';                   break;
11735             case 'b':   value = '\b';                   break;
11736             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11737             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11738             case 'o':
11739                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11740                 {
11741                     const char* error_msg;
11742                     bool valid = grok_bslash_o(RExC_parse,
11743                                                &value,
11744                                                &numlen,
11745                                                &error_msg,
11746                                                SIZE_ONLY);
11747                     RExC_parse += numlen;
11748                     if (! valid) {
11749                         vFAIL(error_msg);
11750                     }
11751                 }
11752                 if (PL_encoding && value < 0x100) {
11753                     goto recode_encoding;
11754                 }
11755                 break;
11756             case 'x':
11757                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11758                 {
11759                     const char* error_msg;
11760                     bool valid = grok_bslash_x(RExC_parse,
11761                                                &value,
11762                                                &numlen,
11763                                                &error_msg,
11764                                                1);
11765                     RExC_parse += numlen;
11766                     if (! valid) {
11767                         vFAIL(error_msg);
11768                     }
11769                 }
11770                 if (PL_encoding && value < 0x100)
11771                     goto recode_encoding;
11772                 break;
11773             case 'c':
11774                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11775                 break;
11776             case '0': case '1': case '2': case '3': case '4':
11777             case '5': case '6': case '7':
11778                 {
11779                     /* Take 1-3 octal digits */
11780                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11781                     numlen = 3;
11782                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11783                     RExC_parse += numlen;
11784                     if (PL_encoding && value < 0x100)
11785                         goto recode_encoding;
11786                     break;
11787                 }
11788             recode_encoding:
11789                 if (! RExC_override_recoding) {
11790                     SV* enc = PL_encoding;
11791                     value = reg_recode((const char)(U8)value, &enc);
11792                     if (!enc && SIZE_ONLY)
11793                         ckWARNreg(RExC_parse,
11794                                   "Invalid escape in the specified encoding");
11795                     break;
11796                 }
11797             default:
11798                 /* Allow \_ to not give an error */
11799                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11800                     ckWARN2reg(RExC_parse,
11801                                "Unrecognized escape \\%c in character class passed through",
11802                                (int)value);
11803                 }
11804                 break;
11805             }
11806         } /* end of \blah */
11807 #ifdef EBCDIC
11808         else
11809             literal_endpoint++;
11810 #endif
11811
11812             /* What matches in a locale is not known until runtime.  This
11813              * includes what the Posix classes (like \w, [:space:]) match.
11814              * Room must be reserved (one time per class) to store such
11815              * classes, either if Perl is compiled so that locale nodes always
11816              * should have this space, or if there is such class info to be
11817              * stored.  The space will contain a bit for each named class that
11818              * is to be matched against.  This isn't needed for \p{} and
11819              * pseudo-classes, as they are not affected by locale, and hence
11820              * are dealt with separately */
11821             if (LOC
11822                 && ! need_class
11823                 && (ANYOF_LOCALE == ANYOF_CLASS
11824                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11825             {
11826                 need_class = 1;
11827                 if (SIZE_ONLY) {
11828                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11829                 }
11830                 else {
11831                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11832                     ANYOF_CLASS_ZERO(ret);
11833                 }
11834                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11835             }
11836
11837         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11838
11839             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11840              * literal, as is the character that began the false range, i.e.
11841              * the 'a' in the examples */
11842             if (range) {
11843                 if (!SIZE_ONLY) {
11844                     const int w =
11845                         RExC_parse >= rangebegin ?
11846                         RExC_parse - rangebegin : 0;
11847                     ckWARN4reg(RExC_parse,
11848                                "False [] range \"%*.*s\"",
11849                                w, w, rangebegin);
11850                     cp_list = add_cp_to_invlist(cp_list, '-');
11851                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11852                 }
11853
11854                 range = 0; /* this was not a true range */
11855                 element_count += 2; /* So counts for three values */
11856             }
11857
11858             if (! SIZE_ONLY) {
11859                 switch ((I32)namedclass) {
11860
11861                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11862                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11863                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11864                     break;
11865                 case ANYOF_NALNUMC:
11866                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11867                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11868                         runtime_posix_matches_above_Unicode);
11869                     break;
11870                 case ANYOF_ALPHA:
11871                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11872                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11873                     break;
11874                 case ANYOF_NALPHA:
11875                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11876                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11877                         runtime_posix_matches_above_Unicode);
11878                     break;
11879                 case ANYOF_ASCII:
11880 #ifdef HAS_ISASCII
11881                     if (LOC) {
11882                         ANYOF_CLASS_SET(ret, namedclass);
11883                     }
11884                     else
11885 #endif  /* Not isascii(); just use the hard-coded definition for it */
11886                         _invlist_union(posixes, PL_ASCII, &posixes);
11887                     break;
11888                 case ANYOF_NASCII:
11889 #ifdef HAS_ISASCII
11890                     if (LOC) {
11891                         ANYOF_CLASS_SET(ret, namedclass);
11892                     }
11893                     else {
11894 #endif
11895                         _invlist_union_complement_2nd(posixes,
11896                                                     PL_ASCII, &posixes);
11897                         if (DEPENDS_SEMANTICS) {
11898                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11899                         }
11900 #ifdef HAS_ISASCII
11901                     }
11902 #endif
11903                     break;
11904                 case ANYOF_BLANK:
11905                     if (hasISBLANK || ! LOC) {
11906                         DO_POSIX(ret, namedclass, posixes,
11907                                             PL_PosixBlank, PL_XPosixBlank);
11908                     }
11909                     else { /* There is no isblank() and we are in locale:  We
11910                               use the ASCII range and the above-Latin1 range
11911                               code points */
11912                         SV* scratch_list = NULL;
11913
11914                         /* Include all above-Latin1 blanks */
11915                         _invlist_intersection(PL_AboveLatin1,
11916                                               PL_XPosixBlank,
11917                                               &scratch_list);
11918                         /* Add it to the running total of posix classes */
11919                         if (! posixes) {
11920                             posixes = scratch_list;
11921                         }
11922                         else {
11923                             _invlist_union(posixes, scratch_list, &posixes);
11924                             SvREFCNT_dec(scratch_list);
11925                         }
11926                         /* Add the ASCII-range blanks to the running total. */
11927                         _invlist_union(posixes, PL_PosixBlank, &posixes);
11928                     }
11929                     break;
11930                 case ANYOF_NBLANK:
11931                     if (hasISBLANK || ! LOC) {
11932                         DO_N_POSIX(ret, namedclass, posixes,
11933                                                 PL_PosixBlank, PL_XPosixBlank);
11934                     }
11935                     else { /* There is no isblank() and we are in locale */
11936                         SV* scratch_list = NULL;
11937
11938                         /* Include all above-Latin1 non-blanks */
11939                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11940                                           &scratch_list);
11941
11942                         /* Add them to the running total of posix classes */
11943                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11944                                           &scratch_list);
11945                         if (! posixes) {
11946                             posixes = scratch_list;
11947                         }
11948                         else {
11949                             _invlist_union(posixes, scratch_list, &posixes);
11950                             SvREFCNT_dec(scratch_list);
11951                         }
11952
11953                         /* Get the list of all non-ASCII-blanks in Latin 1, and
11954                          * add them to the running total */
11955                         _invlist_subtract(PL_Latin1, PL_PosixBlank,
11956                                           &scratch_list);
11957                         _invlist_union(posixes, scratch_list, &posixes);
11958                         SvREFCNT_dec(scratch_list);
11959                     }
11960                     break;
11961                 case ANYOF_CNTRL:
11962                     DO_POSIX(ret, namedclass, posixes,
11963                                             PL_PosixCntrl, PL_XPosixCntrl);
11964                     break;
11965                 case ANYOF_NCNTRL:
11966                     DO_N_POSIX(ret, namedclass, posixes,
11967                                             PL_PosixCntrl, PL_XPosixCntrl);
11968                     break;
11969                 case ANYOF_DIGIT:
11970                     /* There are no digits in the Latin1 range outside of
11971                      * ASCII, so call the macro that doesn't have to resolve
11972                      * them */
11973                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11974                         PL_PosixDigit, "XPosixDigit", listsv);
11975                     break;
11976                 case ANYOF_NDIGIT:
11977                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11978                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11979                         runtime_posix_matches_above_Unicode);
11980                     break;
11981                 case ANYOF_GRAPH:
11982                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11983                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11984                     break;
11985                 case ANYOF_NGRAPH:
11986                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11987                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11988                         runtime_posix_matches_above_Unicode);
11989                     break;
11990                 case ANYOF_HORIZWS:
11991                     /* For these, we use the cp_list, as /d doesn't make a
11992                      * difference in what these match.  There would be problems
11993                      * if these characters had folds other than themselves, as
11994                      * cp_list is subject to folding.  It turns out that \h
11995                      * is just a synonym for XPosixBlank */
11996                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11997                     break;
11998                 case ANYOF_NHORIZWS:
11999                     _invlist_union_complement_2nd(cp_list,
12000                                                  PL_XPosixBlank, &cp_list);
12001                     break;
12002                 case ANYOF_LOWER:
12003                 case ANYOF_NLOWER:
12004                 {   /* These require special handling, as they differ under
12005                        folding, matching Cased there (which in the ASCII range
12006                        is the same as Alpha */
12007
12008                     SV* ascii_source;
12009                     SV* l1_source;
12010                     const char *Xname;
12011
12012                     if (FOLD && ! LOC) {
12013                         ascii_source = PL_PosixAlpha;
12014                         l1_source = PL_L1Cased;
12015                         Xname = "Cased";
12016                     }
12017                     else {
12018                         ascii_source = PL_PosixLower;
12019                         l1_source = PL_L1PosixLower;
12020                         Xname = "XPosixLower";
12021                     }
12022                     if (namedclass == ANYOF_LOWER) {
12023                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12024                                     ascii_source, l1_source, Xname, listsv);
12025                     }
12026                     else {
12027                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12028                             posixes, ascii_source, l1_source, Xname, listsv,
12029                             runtime_posix_matches_above_Unicode);
12030                     }
12031                     break;
12032                 }
12033                 case ANYOF_PRINT:
12034                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12035                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12036                     break;
12037                 case ANYOF_NPRINT:
12038                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12039                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12040                         runtime_posix_matches_above_Unicode);
12041                     break;
12042                 case ANYOF_PUNCT:
12043                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12044                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12045                     break;
12046                 case ANYOF_NPUNCT:
12047                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12048                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12049                         runtime_posix_matches_above_Unicode);
12050                     break;
12051                 case ANYOF_PSXSPC:
12052                     DO_POSIX(ret, namedclass, posixes,
12053                                             PL_PosixSpace, PL_XPosixSpace);
12054                     break;
12055                 case ANYOF_NPSXSPC:
12056                     DO_N_POSIX(ret, namedclass, posixes,
12057                                             PL_PosixSpace, PL_XPosixSpace);
12058                     break;
12059                 case ANYOF_SPACE:
12060                     DO_POSIX(ret, namedclass, posixes,
12061                                             PL_PerlSpace, PL_XPerlSpace);
12062                     break;
12063                 case ANYOF_NSPACE:
12064                     DO_N_POSIX(ret, namedclass, posixes,
12065                                             PL_PerlSpace, PL_XPerlSpace);
12066                     break;
12067                 case ANYOF_UPPER:   /* Same as LOWER, above */
12068                 case ANYOF_NUPPER:
12069                 {
12070                     SV* ascii_source;
12071                     SV* l1_source;
12072                     const char *Xname;
12073
12074                     if (FOLD && ! LOC) {
12075                         ascii_source = PL_PosixAlpha;
12076                         l1_source = PL_L1Cased;
12077                         Xname = "Cased";
12078                     }
12079                     else {
12080                         ascii_source = PL_PosixUpper;
12081                         l1_source = PL_L1PosixUpper;
12082                         Xname = "XPosixUpper";
12083                     }
12084                     if (namedclass == ANYOF_UPPER) {
12085                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12086                                     ascii_source, l1_source, Xname, listsv);
12087                     }
12088                     else {
12089                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12090                         posixes, ascii_source, l1_source, Xname, listsv,
12091                         runtime_posix_matches_above_Unicode);
12092                     }
12093                     break;
12094                 }
12095                 case ANYOF_WORDCHAR:
12096                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12097                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12098                     break;
12099                 case ANYOF_NWORDCHAR:
12100                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12101                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12102                             runtime_posix_matches_above_Unicode);
12103                     break;
12104                 case ANYOF_VERTWS:
12105                     /* For these, we use the cp_list, as /d doesn't make a
12106                      * difference in what these match.  There would be problems
12107                      * if these characters had folds other than themselves, as
12108                      * cp_list is subject to folding */
12109                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
12110                     break;
12111                 case ANYOF_NVERTWS:
12112                     _invlist_union_complement_2nd(cp_list,
12113                                                     PL_VertSpace, &cp_list);
12114                     break;
12115                 case ANYOF_XDIGIT:
12116                     DO_POSIX(ret, namedclass, posixes,
12117                                             PL_PosixXDigit, PL_XPosixXDigit);
12118                     break;
12119                 case ANYOF_NXDIGIT:
12120                     DO_N_POSIX(ret, namedclass, posixes,
12121                                             PL_PosixXDigit, PL_XPosixXDigit);
12122                     break;
12123                 case ANYOF_MAX:
12124                     /* this is to handle \p and \P */
12125                     break;
12126                 default:
12127                     vFAIL("Invalid [::] class");
12128                     break;
12129                 }
12130
12131                 continue;   /* Go get next character */
12132             }
12133         } /* end of namedclass \blah */
12134
12135         if (range) {
12136             if (prevvalue > value) /* b-a */ {
12137                 const int w = RExC_parse - rangebegin;
12138                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12139                 range = 0; /* not a valid range */
12140             }
12141         }
12142         else {
12143             prevvalue = value; /* save the beginning of the potential range */
12144             if (RExC_parse+1 < RExC_end
12145                 && *RExC_parse == '-'
12146                 && RExC_parse[1] != ']')
12147             {
12148                 RExC_parse++;
12149
12150                 /* a bad range like \w-, [:word:]- ? */
12151                 if (namedclass > OOB_NAMEDCLASS) {
12152                     if (ckWARN(WARN_REGEXP)) {
12153                         const int w =
12154                             RExC_parse >= rangebegin ?
12155                             RExC_parse - rangebegin : 0;
12156                         vWARN4(RExC_parse,
12157                                "False [] range \"%*.*s\"",
12158                                w, w, rangebegin);
12159                     }
12160                     if (!SIZE_ONLY) {
12161                         cp_list = add_cp_to_invlist(cp_list, '-');
12162                     }
12163                     element_count++;
12164                 } else
12165                     range = 1;  /* yeah, it's a range! */
12166                 continue;       /* but do it the next time */
12167             }
12168         }
12169
12170         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12171          * if not */
12172
12173         /* non-Latin1 code point implies unicode semantics.  Must be set in
12174          * pass1 so is there for the whole of pass 2 */
12175         if (value > 255) {
12176             RExC_uni_semantics = 1;
12177         }
12178
12179         /* Ready to process either the single value, or the completed range.
12180          * For single-valued non-inverted ranges, we consider the possibility
12181          * of multi-char folds.  (We made a conscious decision to not do this
12182          * for the other cases because it can often lead to non-intuitive
12183          * results.  For example, you have the peculiar case that:
12184          *  "s s" =~ /^[^\xDF]+$/i => Y
12185          *  "ss"  =~ /^[^\xDF]+$/i => N
12186          *
12187          * See [perl #89750] */
12188         if (FOLD && ! invert && value == prevvalue) {
12189             if (value == LATIN_SMALL_LETTER_SHARP_S
12190                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12191                                                         value)))
12192             {
12193                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12194
12195                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12196                 STRLEN foldlen;
12197
12198                 UV folded = _to_uni_fold_flags(
12199                                 value,
12200                                 foldbuf,
12201                                 &foldlen,
12202                                 FOLD_FLAGS_FULL
12203                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12204                                             : (ASCII_FOLD_RESTRICTED)
12205                                               ? FOLD_FLAGS_NOMIX_ASCII
12206                                               : 0)
12207                                 );
12208
12209                 /* Here, <folded> should be the first character of the
12210                  * multi-char fold of <value>, with <foldbuf> containing the
12211                  * whole thing.  But, if this fold is not allowed (because of
12212                  * the flags), <fold> will be the same as <value>, and should
12213                  * be processed like any other character, so skip the special
12214                  * handling */
12215                 if (folded != value) {
12216
12217                     /* Skip if we are recursed, currently parsing the class
12218                      * again.  Otherwise add this character to the list of
12219                      * multi-char folds. */
12220                     if (! RExC_in_multi_char_class) {
12221                         AV** this_array_ptr;
12222                         AV* this_array;
12223                         STRLEN cp_count = utf8_length(foldbuf,
12224                                                       foldbuf + foldlen);
12225                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12226
12227                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12228
12229
12230                         if (! multi_char_matches) {
12231                             multi_char_matches = newAV();
12232                         }
12233
12234                         /* <multi_char_matches> is actually an array of arrays.
12235                          * There will be one or two top-level elements: [2],
12236                          * and/or [3].  The [2] element is an array, each
12237                          * element thereof is a character which folds to two
12238                          * characters; likewise for [3].  (Unicode guarantees a
12239                          * maximum of 3 characters in any fold.)  When we
12240                          * rewrite the character class below, we will do so
12241                          * such that the longest folds are written first, so
12242                          * that it prefers the longest matching strings first.
12243                          * This is done even if it turns out that any
12244                          * quantifier is non-greedy, out of programmer
12245                          * laziness.  Tom Christiansen has agreed that this is
12246                          * ok.  This makes the test for the ligature 'ffi' come
12247                          * before the test for 'ff' */
12248                         if (av_exists(multi_char_matches, cp_count)) {
12249                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12250                                                              cp_count, FALSE);
12251                             this_array = *this_array_ptr;
12252                         }
12253                         else {
12254                             this_array = newAV();
12255                             av_store(multi_char_matches, cp_count,
12256                                      (SV*) this_array);
12257                         }
12258                         av_push(this_array, multi_fold);
12259                     }
12260
12261                     /* This element should not be processed further in this
12262                      * class */
12263                     element_count--;
12264                     value = save_value;
12265                     prevvalue = save_prevvalue;
12266                     continue;
12267                 }
12268             }
12269         }
12270
12271         /* Deal with this element of the class */
12272         if (! SIZE_ONLY) {
12273 #ifndef EBCDIC
12274             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12275 #else
12276             UV* this_range = _new_invlist(1);
12277             _append_range_to_invlist(this_range, prevvalue, value);
12278
12279             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12280              * If this range was specified using something like 'i-j', we want
12281              * to include only the 'i' and the 'j', and not anything in
12282              * between, so exclude non-ASCII, non-alphabetics from it.
12283              * However, if the range was specified with something like
12284              * [\x89-\x91] or [\x89-j], all code points within it should be
12285              * included.  literal_endpoint==2 means both ends of the range used
12286              * a literal character, not \x{foo} */
12287             if (literal_endpoint == 2
12288                 && (prevvalue >= 'a' && value <= 'z')
12289                     || (prevvalue >= 'A' && value <= 'Z'))
12290             {
12291                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12292                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12293             }
12294             _invlist_union(cp_list, this_range, &cp_list);
12295             literal_endpoint = 0;
12296 #endif
12297         }
12298
12299         range = 0; /* this range (if it was one) is done now */
12300     } /* End of loop through all the text within the brackets */
12301
12302     /* If anything in the class expands to more than one character, we have to
12303      * deal with them by building up a substitute parse string, and recursively
12304      * calling reg() on it, instead of proceeding */
12305     if (multi_char_matches) {
12306         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12307         I32 cp_count;
12308         STRLEN len;
12309         char *save_end = RExC_end;
12310         char *save_parse = RExC_parse;
12311         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12312                                        a "|" */
12313         I32 reg_flags;
12314
12315         assert(! invert);
12316 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12317            because too confusing */
12318         if (invert) {
12319             sv_catpv(substitute_parse, "(?:");
12320         }
12321 #endif
12322
12323         /* Look at the longest folds first */
12324         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12325
12326             if (av_exists(multi_char_matches, cp_count)) {
12327                 AV** this_array_ptr;
12328                 SV* this_sequence;
12329
12330                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12331                                                  cp_count, FALSE);
12332                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12333                                                                 &PL_sv_undef)
12334                 {
12335                     if (! first_time) {
12336                         sv_catpv(substitute_parse, "|");
12337                     }
12338                     first_time = FALSE;
12339
12340                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12341                 }
12342             }
12343         }
12344
12345         /* If the character class contains anything else besides these
12346          * multi-character folds, have to include it in recursive parsing */
12347         if (element_count) {
12348             sv_catpv(substitute_parse, "|[");
12349             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12350             sv_catpv(substitute_parse, "]");
12351         }
12352
12353         sv_catpv(substitute_parse, ")");
12354 #if 0
12355         if (invert) {
12356             /* This is a way to get the parse to skip forward a whole named
12357              * sequence instead of matching the 2nd character when it fails the
12358              * first */
12359             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12360         }
12361 #endif
12362
12363         RExC_parse = SvPV(substitute_parse, len);
12364         RExC_end = RExC_parse + len;
12365         RExC_in_multi_char_class = 1;
12366         RExC_emit = (regnode *)orig_emit;
12367
12368         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12369
12370         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12371
12372         RExC_parse = save_parse;
12373         RExC_end = save_end;
12374         RExC_in_multi_char_class = 0;
12375         SvREFCNT_dec(multi_char_matches);
12376         return ret;
12377     }
12378
12379     /* If the character class contains only a single element, it may be
12380      * optimizable into another node type which is smaller and runs faster.
12381      * Check if this is the case for this class */
12382     if (element_count == 1) {
12383         U8 op = END;
12384         U8 arg = 0;
12385
12386         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12387                                               [:digit:] or \p{foo} */
12388
12389             /* Certain named classes have equivalents that can appear outside a
12390              * character class, e.g. \w, \H.  We use these instead of a
12391              * character class. */
12392             switch ((I32)namedclass) {
12393                 U8 offset;
12394
12395                 /* The first group is for node types that depend on the charset
12396                  * modifier to the regex.  We first calculate the base node
12397                  * type, and if it should be inverted */
12398
12399                 case ANYOF_NWORDCHAR:
12400                     invert = ! invert;
12401                     /* FALLTHROUGH */
12402                 case ANYOF_WORDCHAR:
12403                     op = ALNUM;
12404                     goto join_charset_classes;
12405
12406                 case ANYOF_NSPACE:
12407                     invert = ! invert;
12408                     /* FALLTHROUGH */
12409                 case ANYOF_SPACE:
12410                     op = SPACE;
12411                     goto join_charset_classes;
12412
12413                 case ANYOF_NDIGIT:
12414                     invert = ! invert;
12415                     /* FALLTHROUGH */
12416                 case ANYOF_DIGIT:
12417                     op = DIGIT;
12418
12419                   join_charset_classes:
12420
12421                     /* Now that we have the base node type, we take advantage
12422                      * of the enum ordering of the charset modifiers to get the
12423                      * exact node type,  For example the base SPACE also has
12424                      * SPACEL, SPACEU, and SPACEA */
12425
12426                     offset = get_regex_charset(RExC_flags);
12427
12428                     /* /aa is the same as /a for these */
12429                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12430                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12431                     }
12432                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12433                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12434                     }
12435
12436                     op += offset;
12437
12438                     /* The number of varieties of each of these is the same,
12439                      * hence, so is the delta between the normal and
12440                      * complemented nodes */
12441                     if (invert) {
12442                         op += NALNUM - ALNUM;
12443                     }
12444                     *flagp |= HASWIDTH|SIMPLE;
12445                     break;
12446
12447                 /* The second group doesn't depend of the charset modifiers.
12448                  * We just have normal and complemented */
12449                 case ANYOF_NHORIZWS:
12450                     invert = ! invert;
12451                     /* FALLTHROUGH */
12452                 case ANYOF_HORIZWS:
12453                   is_horizws:
12454                     op = (invert) ? NHORIZWS : HORIZWS;
12455                     *flagp |= HASWIDTH|SIMPLE;
12456                     break;
12457
12458                 case ANYOF_NVERTWS:
12459                     invert = ! invert;
12460                     /* FALLTHROUGH */
12461                 case ANYOF_VERTWS:
12462                     op = (invert) ? NVERTWS : VERTWS;
12463                     *flagp |= HASWIDTH|SIMPLE;
12464                     break;
12465
12466                 case ANYOF_MAX:
12467                     break;
12468
12469                 case ANYOF_NBLANK:
12470                     invert = ! invert;
12471                     /* FALLTHROUGH */
12472                 case ANYOF_BLANK:
12473                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12474                         goto is_horizws;
12475                     }
12476                     /* FALLTHROUGH */
12477                 default:
12478                     /* A generic posix class.  All the /a ones can be handled
12479                      * by the POSIXA opcode.  And all are closed under folding
12480                      * in the ASCII range, so FOLD doesn't matter */
12481                     if (AT_LEAST_ASCII_RESTRICTED
12482                         || (! LOC && namedclass == ANYOF_ASCII))
12483                     {
12484                         /* The odd numbered ones are the complements of the
12485                          * next-lower even number one */
12486                         if (namedclass % 2 == 1) {
12487                             invert = ! invert;
12488                             namedclass--;
12489                         }
12490                         arg = namedclass_to_classnum(namedclass);
12491                         op = (invert) ? NPOSIXA : POSIXA;
12492                     }
12493                     break;
12494             }
12495         }
12496         else if (value == prevvalue) {
12497
12498             /* Here, the class consists of just a single code point */
12499
12500             if (invert) {
12501                 if (! LOC && value == '\n') {
12502                     op = REG_ANY; /* Optimize [^\n] */
12503                     *flagp |= HASWIDTH|SIMPLE;
12504                     RExC_naughty++;
12505                 }
12506             }
12507             else if (value < 256 || UTF) {
12508
12509                 /* Optimize a single value into an EXACTish node, but not if it
12510                  * would require converting the pattern to UTF-8. */
12511                 op = compute_EXACTish(pRExC_state);
12512             }
12513         } /* Otherwise is a range */
12514         else if (! LOC) {   /* locale could vary these */
12515             if (prevvalue == '0') {
12516                 if (value == '9') {
12517                     op = (invert) ? NDIGITA : DIGITA;
12518                     *flagp |= HASWIDTH|SIMPLE;
12519                 }
12520             }
12521         }
12522
12523         /* Here, we have changed <op> away from its initial value iff we found
12524          * an optimization */
12525         if (op != END) {
12526
12527             /* Throw away this ANYOF regnode, and emit the calculated one,
12528              * which should correspond to the beginning, not current, state of
12529              * the parse */
12530             const char * cur_parse = RExC_parse;
12531             RExC_parse = (char *)orig_parse;
12532             if ( SIZE_ONLY) {
12533                 if (! LOC) {
12534
12535                     /* To get locale nodes to not use the full ANYOF size would
12536                      * require moving the code above that writes the portions
12537                      * of it that aren't in other nodes to after this point.
12538                      * e.g.  ANYOF_CLASS_SET */
12539                     RExC_size = orig_size;
12540                 }
12541             }
12542             else {
12543                 RExC_emit = (regnode *)orig_emit;
12544             }
12545
12546             ret = reg_node(pRExC_state, op);
12547
12548             if (PL_regkind[op] == POSIXD) {
12549                 if (! SIZE_ONLY) {
12550                     FLAGS(ret) = arg;
12551                 }
12552                 *flagp |= HASWIDTH|SIMPLE;
12553             }
12554             else if (PL_regkind[op] == EXACT) {
12555                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12556             }
12557
12558             RExC_parse = (char *) cur_parse;
12559
12560             SvREFCNT_dec(listsv);
12561             return ret;
12562         }
12563     }
12564
12565     if (SIZE_ONLY)
12566         return ret;
12567     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12568
12569     /* If folding, we calculate all characters that could fold to or from the
12570      * ones already on the list */
12571     if (FOLD && cp_list) {
12572         UV start, end;  /* End points of code point ranges */
12573
12574         SV* fold_intersection = NULL;
12575
12576         /* If the highest code point is within Latin1, we can use the
12577          * compiled-in Alphas list, and not have to go out to disk.  This
12578          * yields two false positives, the masculine and feminine oridinal
12579          * indicators, which are weeded out below using the
12580          * IS_IN_SOME_FOLD_L1() macro */
12581         if (invlist_highest(cp_list) < 256) {
12582             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12583         }
12584         else {
12585
12586             /* Here, there are non-Latin1 code points, so we will have to go
12587              * fetch the list of all the characters that participate in folds
12588              */
12589             if (! PL_utf8_foldable) {
12590                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12591                                        &PL_sv_undef, 1, 0);
12592                 PL_utf8_foldable = _get_swash_invlist(swash);
12593                 SvREFCNT_dec(swash);
12594             }
12595
12596             /* This is a hash that for a particular fold gives all characters
12597              * that are involved in it */
12598             if (! PL_utf8_foldclosures) {
12599
12600                 /* If we were unable to find any folds, then we likely won't be
12601                  * able to find the closures.  So just create an empty list.
12602                  * Folding will effectively be restricted to the non-Unicode
12603                  * rules hard-coded into Perl.  (This case happens legitimately
12604                  * during compilation of Perl itself before the Unicode tables
12605                  * are generated) */
12606                 if (_invlist_len(PL_utf8_foldable) == 0) {
12607                     PL_utf8_foldclosures = newHV();
12608                 }
12609                 else {
12610                     /* If the folds haven't been read in, call a fold function
12611                      * to force that */
12612                     if (! PL_utf8_tofold) {
12613                         U8 dummy[UTF8_MAXBYTES+1];
12614
12615                         /* This string is just a short named one above \xff */
12616                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12617                         assert(PL_utf8_tofold); /* Verify that worked */
12618                     }
12619                     PL_utf8_foldclosures =
12620                                         _swash_inversion_hash(PL_utf8_tofold);
12621                 }
12622             }
12623
12624             /* Only the characters in this class that participate in folds need
12625              * be checked.  Get the intersection of this class and all the
12626              * possible characters that are foldable.  This can quickly narrow
12627              * down a large class */
12628             _invlist_intersection(PL_utf8_foldable, cp_list,
12629                                   &fold_intersection);
12630         }
12631
12632         /* Now look at the foldable characters in this class individually */
12633         invlist_iterinit(fold_intersection);
12634         while (invlist_iternext(fold_intersection, &start, &end)) {
12635             UV j;
12636
12637             /* Locale folding for Latin1 characters is deferred until runtime */
12638             if (LOC && start < 256) {
12639                 start = 256;
12640             }
12641
12642             /* Look at every character in the range */
12643             for (j = start; j <= end; j++) {
12644
12645                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12646                 STRLEN foldlen;
12647                 SV** listp;
12648
12649                 if (j < 256) {
12650
12651                     /* We have the latin1 folding rules hard-coded here so that
12652                      * an innocent-looking character class, like /[ks]/i won't
12653                      * have to go out to disk to find the possible matches.
12654                      * XXX It would be better to generate these via regen, in
12655                      * case a new version of the Unicode standard adds new
12656                      * mappings, though that is not really likely, and may be
12657                      * caught by the default: case of the switch below. */
12658
12659                     if (IS_IN_SOME_FOLD_L1(j)) {
12660
12661                         /* ASCII is always matched; non-ASCII is matched only
12662                          * under Unicode rules */
12663                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12664                             cp_list =
12665                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12666                         }
12667                         else {
12668                             depends_list =
12669                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12670                         }
12671                     }
12672
12673                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12674                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12675                     {
12676                         /* Certain Latin1 characters have matches outside
12677                          * Latin1.  To get here, <j> is one of those
12678                          * characters.   None of these matches is valid for
12679                          * ASCII characters under /aa, which is why the 'if'
12680                          * just above excludes those.  These matches only
12681                          * happen when the target string is utf8.  The code
12682                          * below adds the single fold closures for <j> to the
12683                          * inversion list. */
12684                         switch (j) {
12685                             case 'k':
12686                             case 'K':
12687                                 cp_list =
12688                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12689                                 break;
12690                             case 's':
12691                             case 'S':
12692                                 cp_list = add_cp_to_invlist(cp_list,
12693                                                     LATIN_SMALL_LETTER_LONG_S);
12694                                 break;
12695                             case MICRO_SIGN:
12696                                 cp_list = add_cp_to_invlist(cp_list,
12697                                                     GREEK_CAPITAL_LETTER_MU);
12698                                 cp_list = add_cp_to_invlist(cp_list,
12699                                                     GREEK_SMALL_LETTER_MU);
12700                                 break;
12701                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12702                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12703                                 cp_list =
12704                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12705                                 break;
12706                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12707                                 cp_list = add_cp_to_invlist(cp_list,
12708                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12709                                 break;
12710                             case LATIN_SMALL_LETTER_SHARP_S:
12711                                 cp_list = add_cp_to_invlist(cp_list,
12712                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12713                                 break;
12714                             case 'F': case 'f':
12715                             case 'I': case 'i':
12716                             case 'L': case 'l':
12717                             case 'T': case 't':
12718                             case 'A': case 'a':
12719                             case 'H': case 'h':
12720                             case 'J': case 'j':
12721                             case 'N': case 'n':
12722                             case 'W': case 'w':
12723                             case 'Y': case 'y':
12724                                 /* These all are targets of multi-character
12725                                  * folds from code points that require UTF8 to
12726                                  * express, so they can't match unless the
12727                                  * target string is in UTF-8, so no action here
12728                                  * is necessary, as regexec.c properly handles
12729                                  * the general case for UTF-8 matching and
12730                                  * multi-char folds */
12731                                 break;
12732                             default:
12733                                 /* Use deprecated warning to increase the
12734                                  * chances of this being output */
12735                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12736                                 break;
12737                         }
12738                     }
12739                     continue;
12740                 }
12741
12742                 /* Here is an above Latin1 character.  We don't have the rules
12743                  * hard-coded for it.  First, get its fold.  This is the simple
12744                  * fold, as the multi-character folds have been handled earlier
12745                  * and separated out */
12746                 _to_uni_fold_flags(j, foldbuf, &foldlen,
12747                                                ((LOC)
12748                                                ? FOLD_FLAGS_LOCALE
12749                                                : (ASCII_FOLD_RESTRICTED)
12750                                                   ? FOLD_FLAGS_NOMIX_ASCII
12751                                                   : 0));
12752
12753                 /* Single character fold of above Latin1.  Add everything in
12754                  * its fold closure to the list that this node should match.
12755                  * The fold closures data structure is a hash with the keys
12756                  * being the UTF-8 of every character that is folded to, like
12757                  * 'k', and the values each an array of all code points that
12758                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
12759                  * Multi-character folds are not included */
12760                 if ((listp = hv_fetch(PL_utf8_foldclosures,
12761                                       (char *) foldbuf, foldlen, FALSE)))
12762                 {
12763                     AV* list = (AV*) *listp;
12764                     IV k;
12765                     for (k = 0; k <= av_len(list); k++) {
12766                         SV** c_p = av_fetch(list, k, FALSE);
12767                         UV c;
12768                         if (c_p == NULL) {
12769                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12770                         }
12771                         c = SvUV(*c_p);
12772
12773                         /* /aa doesn't allow folds between ASCII and non-; /l
12774                          * doesn't allow them between above and below 256 */
12775                         if ((ASCII_FOLD_RESTRICTED
12776                                   && (isASCII(c) != isASCII(j)))
12777                             || (LOC && ((c < 256) != (j < 256))))
12778                         {
12779                             continue;
12780                         }
12781
12782                         /* Folds involving non-ascii Latin1 characters
12783                          * under /d are added to a separate list */
12784                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12785                         {
12786                             cp_list = add_cp_to_invlist(cp_list, c);
12787                         }
12788                         else {
12789                           depends_list = add_cp_to_invlist(depends_list, c);
12790                         }
12791                     }
12792                 }
12793             }
12794         }
12795         SvREFCNT_dec(fold_intersection);
12796     }
12797
12798     /* And combine the result (if any) with any inversion list from posix
12799      * classes.  The lists are kept separate up to now because we don't want to
12800      * fold the classes (folding of those is automatically handled by the swash
12801      * fetching code) */
12802     if (posixes) {
12803         if (! DEPENDS_SEMANTICS) {
12804             if (cp_list) {
12805                 _invlist_union(cp_list, posixes, &cp_list);
12806                 SvREFCNT_dec(posixes);
12807             }
12808             else {
12809                 cp_list = posixes;
12810             }
12811         }
12812         else {
12813             /* Under /d, we put into a separate list the Latin1 things that
12814              * match only when the target string is utf8 */
12815             SV* nonascii_but_latin1_properties = NULL;
12816             _invlist_intersection(posixes, PL_Latin1,
12817                                   &nonascii_but_latin1_properties);
12818             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12819                               &nonascii_but_latin1_properties);
12820             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12821                               &posixes);
12822             if (cp_list) {
12823                 _invlist_union(cp_list, posixes, &cp_list);
12824                 SvREFCNT_dec(posixes);
12825             }
12826             else {
12827                 cp_list = posixes;
12828             }
12829
12830             if (depends_list) {
12831                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12832                                &depends_list);
12833                 SvREFCNT_dec(nonascii_but_latin1_properties);
12834             }
12835             else {
12836                 depends_list = nonascii_but_latin1_properties;
12837             }
12838         }
12839     }
12840
12841     /* And combine the result (if any) with any inversion list from properties.
12842      * The lists are kept separate up to now so that we can distinguish the two
12843      * in regards to matching above-Unicode.  A run-time warning is generated
12844      * if a Unicode property is matched against a non-Unicode code point. But,
12845      * we allow user-defined properties to match anything, without any warning,
12846      * and we also suppress the warning if there is a portion of the character
12847      * class that isn't a Unicode property, and which matches above Unicode, \W
12848      * or [\x{110000}] for example.
12849      * (Note that in this case, unlike the Posix one above, there is no
12850      * <depends_list>, because having a Unicode property forces Unicode
12851      * semantics */
12852     if (properties) {
12853         bool warn_super = ! has_user_defined_property;
12854         if (cp_list) {
12855
12856             /* If it matters to the final outcome, see if a non-property
12857              * component of the class matches above Unicode.  If so, the
12858              * warning gets suppressed.  This is true even if just a single
12859              * such code point is specified, as though not strictly correct if
12860              * another such code point is matched against, the fact that they
12861              * are using above-Unicode code points indicates they should know
12862              * the issues involved */
12863             if (warn_super) {
12864                 bool non_prop_matches_above_Unicode =
12865                             runtime_posix_matches_above_Unicode
12866                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12867                 if (invert) {
12868                     non_prop_matches_above_Unicode =
12869                                             !  non_prop_matches_above_Unicode;
12870                 }
12871                 warn_super = ! non_prop_matches_above_Unicode;
12872             }
12873
12874             _invlist_union(properties, cp_list, &cp_list);
12875             SvREFCNT_dec(properties);
12876         }
12877         else {
12878             cp_list = properties;
12879         }
12880
12881         if (warn_super) {
12882             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12883         }
12884     }
12885
12886     /* Here, we have calculated what code points should be in the character
12887      * class.
12888      *
12889      * Now we can see about various optimizations.  Fold calculation (which we
12890      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12891      * would invert to include K, which under /i would match k, which it
12892      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12893      * folded until runtime */
12894
12895     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12896      * at compile time.  Besides not inverting folded locale now, we can't
12897      * invert if there are things such as \w, which aren't known until runtime
12898      * */
12899     if (invert
12900         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12901         && ! depends_list
12902         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12903     {
12904         _invlist_invert(cp_list);
12905
12906         /* Any swash can't be used as-is, because we've inverted things */
12907         if (swash) {
12908             SvREFCNT_dec(swash);
12909             swash = NULL;
12910         }
12911
12912         /* Clear the invert flag since have just done it here */
12913         invert = FALSE;
12914     }
12915
12916     /* If we didn't do folding, it's because some information isn't available
12917      * until runtime; set the run-time fold flag for these.  (We don't have to
12918      * worry about properties folding, as that is taken care of by the swash
12919      * fetching) */
12920     if (FOLD && LOC)
12921     {
12922        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12923     }
12924
12925     /* Some character classes are equivalent to other nodes.  Such nodes take
12926      * up less room and generally fewer operations to execute than ANYOF nodes.
12927      * Above, we checked for and optimized into some such equivalents for
12928      * certain common classes that are easy to test.  Getting to this point in
12929      * the code means that the class didn't get optimized there.  Since this
12930      * code is only executed in Pass 2, it is too late to save space--it has
12931      * been allocated in Pass 1, and currently isn't given back.  But turning
12932      * things into an EXACTish node can allow the optimizer to join it to any
12933      * adjacent such nodes.  And if the class is equivalent to things like /./,
12934      * expensive run-time swashes can be avoided.  Now that we have more
12935      * complete information, we can find things necessarily missed by the
12936      * earlier code.  I (khw) am not sure how much to look for here.  It would
12937      * be easy, but perhaps too slow, to check any candidates against all the
12938      * node types they could possibly match using _invlistEQ(). */
12939
12940     if (cp_list
12941         && ! invert
12942         && ! depends_list
12943         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12944         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12945     {
12946        UV start, end;
12947        U8 op = END;  /* The optimzation node-type */
12948         const char * cur_parse= RExC_parse;
12949
12950        invlist_iterinit(cp_list);
12951        if (! invlist_iternext(cp_list, &start, &end)) {
12952
12953             /* Here, the list is empty.  This happens, for example, when a
12954              * Unicode property is the only thing in the character class, and
12955              * it doesn't match anything.  (perluniprops.pod notes such
12956              * properties) */
12957             op = OPFAIL;
12958             *flagp |= HASWIDTH|SIMPLE;
12959         }
12960         else if (start == end) {    /* The range is a single code point */
12961             if (! invlist_iternext(cp_list, &start, &end)
12962
12963                     /* Don't do this optimization if it would require changing
12964                      * the pattern to UTF-8 */
12965                 && (start < 256 || UTF))
12966             {
12967                 /* Here, the list contains a single code point.  Can optimize
12968                  * into an EXACT node */
12969
12970                 value = start;
12971
12972                 if (! FOLD) {
12973                     op = EXACT;
12974                 }
12975                 else if (LOC) {
12976
12977                     /* A locale node under folding with one code point can be
12978                      * an EXACTFL, as its fold won't be calculated until
12979                      * runtime */
12980                     op = EXACTFL;
12981                 }
12982                 else {
12983
12984                     /* Here, we are generally folding, but there is only one
12985                      * code point to match.  If we have to, we use an EXACT
12986                      * node, but it would be better for joining with adjacent
12987                      * nodes in the optimization pass if we used the same
12988                      * EXACTFish node that any such are likely to be.  We can
12989                      * do this iff the code point doesn't participate in any
12990                      * folds.  For example, an EXACTF of a colon is the same as
12991                      * an EXACT one, since nothing folds to or from a colon. */
12992                     if (value < 256) {
12993                         if (IS_IN_SOME_FOLD_L1(value)) {
12994                             op = EXACT;
12995                         }
12996                     }
12997                     else {
12998                         if (! PL_utf8_foldable) {
12999                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13000                                                 &PL_sv_undef, 1, 0);
13001                             PL_utf8_foldable = _get_swash_invlist(swash);
13002                             SvREFCNT_dec(swash);
13003                         }
13004                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13005                             op = EXACT;
13006                         }
13007                     }
13008
13009                     /* If we haven't found the node type, above, it means we
13010                      * can use the prevailing one */
13011                     if (op == END) {
13012                         op = compute_EXACTish(pRExC_state);
13013                     }
13014                 }
13015             }
13016         }
13017         else if (start == 0) {
13018             if (end == UV_MAX) {
13019                 op = SANY;
13020                 *flagp |= HASWIDTH|SIMPLE;
13021                 RExC_naughty++;
13022             }
13023             else if (end == '\n' - 1
13024                     && invlist_iternext(cp_list, &start, &end)
13025                     && start == '\n' + 1 && end == UV_MAX)
13026             {
13027                 op = REG_ANY;
13028                 *flagp |= HASWIDTH|SIMPLE;
13029                 RExC_naughty++;
13030             }
13031         }
13032
13033         if (op != END) {
13034             RExC_parse = (char *)orig_parse;
13035             RExC_emit = (regnode *)orig_emit;
13036
13037             ret = reg_node(pRExC_state, op);
13038
13039             RExC_parse = (char *)cur_parse;
13040
13041             if (PL_regkind[op] == EXACT) {
13042                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13043             }
13044
13045             SvREFCNT_dec(listsv);
13046             return ret;
13047         }
13048     }
13049
13050     /* Here, <cp_list> contains all the code points we can determine at
13051      * compile time that match under all conditions.  Go through it, and
13052      * for things that belong in the bitmap, put them there, and delete from
13053      * <cp_list>.  While we are at it, see if everything above 255 is in the
13054      * list, and if so, set a flag to speed up execution */
13055     ANYOF_BITMAP_ZERO(ret);
13056     if (cp_list) {
13057
13058         /* This gets set if we actually need to modify things */
13059         bool change_invlist = FALSE;
13060
13061         UV start, end;
13062
13063         /* Start looking through <cp_list> */
13064         invlist_iterinit(cp_list);
13065         while (invlist_iternext(cp_list, &start, &end)) {
13066             UV high;
13067             int i;
13068
13069             if (end == UV_MAX && start <= 256) {
13070                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13071             }
13072
13073             /* Quit if are above what we should change */
13074             if (start > 255) {
13075                 break;
13076             }
13077
13078             change_invlist = TRUE;
13079
13080             /* Set all the bits in the range, up to the max that we are doing */
13081             high = (end < 255) ? end : 255;
13082             for (i = start; i <= (int) high; i++) {
13083                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13084                     ANYOF_BITMAP_SET(ret, i);
13085                     prevvalue = value;
13086                     value = i;
13087                 }
13088             }
13089         }
13090
13091         /* Done with loop; remove any code points that are in the bitmap from
13092          * <cp_list> */
13093         if (change_invlist) {
13094             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13095         }
13096
13097         /* If have completely emptied it, remove it completely */
13098         if (_invlist_len(cp_list) == 0) {
13099             SvREFCNT_dec(cp_list);
13100             cp_list = NULL;
13101         }
13102     }
13103
13104     if (invert) {
13105         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13106     }
13107
13108     /* Here, the bitmap has been populated with all the Latin1 code points that
13109      * always match.  Can now add to the overall list those that match only
13110      * when the target string is UTF-8 (<depends_list>). */
13111     if (depends_list) {
13112         if (cp_list) {
13113             _invlist_union(cp_list, depends_list, &cp_list);
13114             SvREFCNT_dec(depends_list);
13115         }
13116         else {
13117             cp_list = depends_list;
13118         }
13119     }
13120
13121     /* If there is a swash and more than one element, we can't use the swash in
13122      * the optimization below. */
13123     if (swash && element_count > 1) {
13124         SvREFCNT_dec(swash);
13125         swash = NULL;
13126     }
13127
13128     if (! cp_list
13129         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13130     {
13131         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13132         SvREFCNT_dec(listsv);
13133     }
13134     else {
13135         /* av[0] stores the character class description in its textual form:
13136          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13137          *       appropriate swash, and is also useful for dumping the regnode.
13138          * av[1] if NULL, is a placeholder to later contain the swash computed
13139          *       from av[0].  But if no further computation need be done, the
13140          *       swash is stored there now.
13141          * av[2] stores the cp_list inversion list for use in addition or
13142          *       instead of av[0]; used only if av[1] is NULL
13143          * av[3] is set if any component of the class is from a user-defined
13144          *       property; used only if av[1] is NULL */
13145         AV * const av = newAV();
13146         SV *rv;
13147
13148         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13149                         ? listsv
13150                         : &PL_sv_undef);
13151         if (swash) {
13152             av_store(av, 1, swash);
13153             SvREFCNT_dec(cp_list);
13154         }
13155         else {
13156             av_store(av, 1, NULL);
13157             if (cp_list) {
13158                 av_store(av, 2, cp_list);
13159                 av_store(av, 3, newSVuv(has_user_defined_property));
13160             }
13161         }
13162
13163         rv = newRV_noinc(MUTABLE_SV(av));
13164         n = add_data(pRExC_state, 1, "s");
13165         RExC_rxi->data->data[n] = (void*)rv;
13166         ARG_SET(ret, n);
13167     }
13168
13169     *flagp |= HASWIDTH|SIMPLE;
13170     return ret;
13171 }
13172 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13173
13174
13175 /* reg_skipcomment()
13176
13177    Absorbs an /x style # comments from the input stream.
13178    Returns true if there is more text remaining in the stream.
13179    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13180    terminates the pattern without including a newline.
13181
13182    Note its the callers responsibility to ensure that we are
13183    actually in /x mode
13184
13185 */
13186
13187 STATIC bool
13188 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13189 {
13190     bool ended = 0;
13191
13192     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13193
13194     while (RExC_parse < RExC_end)
13195         if (*RExC_parse++ == '\n') {
13196             ended = 1;
13197             break;
13198         }
13199     if (!ended) {
13200         /* we ran off the end of the pattern without ending
13201            the comment, so we have to add an \n when wrapping */
13202         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13203         return 0;
13204     } else
13205         return 1;
13206 }
13207
13208 /* nextchar()
13209
13210    Advances the parse position, and optionally absorbs
13211    "whitespace" from the inputstream.
13212
13213    Without /x "whitespace" means (?#...) style comments only,
13214    with /x this means (?#...) and # comments and whitespace proper.
13215
13216    Returns the RExC_parse point from BEFORE the scan occurs.
13217
13218    This is the /x friendly way of saying RExC_parse++.
13219 */
13220
13221 STATIC char*
13222 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13223 {
13224     char* const retval = RExC_parse++;
13225
13226     PERL_ARGS_ASSERT_NEXTCHAR;
13227
13228     for (;;) {
13229         if (RExC_end - RExC_parse >= 3
13230             && *RExC_parse == '('
13231             && RExC_parse[1] == '?'
13232             && RExC_parse[2] == '#')
13233         {
13234             while (*RExC_parse != ')') {
13235                 if (RExC_parse == RExC_end)
13236                     FAIL("Sequence (?#... not terminated");
13237                 RExC_parse++;
13238             }
13239             RExC_parse++;
13240             continue;
13241         }
13242         if (RExC_flags & RXf_PMf_EXTENDED) {
13243             if (isSPACE(*RExC_parse)) {
13244                 RExC_parse++;
13245                 continue;
13246             }
13247             else if (*RExC_parse == '#') {
13248                 if ( reg_skipcomment( pRExC_state ) )
13249                     continue;
13250             }
13251         }
13252         return retval;
13253     }
13254 }
13255
13256 /*
13257 - reg_node - emit a node
13258 */
13259 STATIC regnode *                        /* Location. */
13260 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13261 {
13262     dVAR;
13263     regnode *ptr;
13264     regnode * const ret = RExC_emit;
13265     GET_RE_DEBUG_FLAGS_DECL;
13266
13267     PERL_ARGS_ASSERT_REG_NODE;
13268
13269     if (SIZE_ONLY) {
13270         SIZE_ALIGN(RExC_size);
13271         RExC_size += 1;
13272         return(ret);
13273     }
13274     if (RExC_emit >= RExC_emit_bound)
13275         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13276                    op, RExC_emit, RExC_emit_bound);
13277
13278     NODE_ALIGN_FILL(ret);
13279     ptr = ret;
13280     FILL_ADVANCE_NODE(ptr, op);
13281 #ifdef RE_TRACK_PATTERN_OFFSETS
13282     if (RExC_offsets) {         /* MJD */
13283         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13284               "reg_node", __LINE__,
13285               PL_reg_name[op],
13286               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13287                 ? "Overwriting end of array!\n" : "OK",
13288               (UV)(RExC_emit - RExC_emit_start),
13289               (UV)(RExC_parse - RExC_start),
13290               (UV)RExC_offsets[0]));
13291         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13292     }
13293 #endif
13294     RExC_emit = ptr;
13295     return(ret);
13296 }
13297
13298 /*
13299 - reganode - emit a node with an argument
13300 */
13301 STATIC regnode *                        /* Location. */
13302 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13303 {
13304     dVAR;
13305     regnode *ptr;
13306     regnode * const ret = RExC_emit;
13307     GET_RE_DEBUG_FLAGS_DECL;
13308
13309     PERL_ARGS_ASSERT_REGANODE;
13310
13311     if (SIZE_ONLY) {
13312         SIZE_ALIGN(RExC_size);
13313         RExC_size += 2;
13314         /*
13315            We can't do this:
13316
13317            assert(2==regarglen[op]+1);
13318
13319            Anything larger than this has to allocate the extra amount.
13320            If we changed this to be:
13321
13322            RExC_size += (1 + regarglen[op]);
13323
13324            then it wouldn't matter. Its not clear what side effect
13325            might come from that so its not done so far.
13326            -- dmq
13327         */
13328         return(ret);
13329     }
13330     if (RExC_emit >= RExC_emit_bound)
13331         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13332                    op, RExC_emit, RExC_emit_bound);
13333
13334     NODE_ALIGN_FILL(ret);
13335     ptr = ret;
13336     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13337 #ifdef RE_TRACK_PATTERN_OFFSETS
13338     if (RExC_offsets) {         /* MJD */
13339         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13340               "reganode",
13341               __LINE__,
13342               PL_reg_name[op],
13343               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13344               "Overwriting end of array!\n" : "OK",
13345               (UV)(RExC_emit - RExC_emit_start),
13346               (UV)(RExC_parse - RExC_start),
13347               (UV)RExC_offsets[0]));
13348         Set_Cur_Node_Offset;
13349     }
13350 #endif
13351     RExC_emit = ptr;
13352     return(ret);
13353 }
13354
13355 /*
13356 - reguni - emit (if appropriate) a Unicode character
13357 */
13358 STATIC STRLEN
13359 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13360 {
13361     dVAR;
13362
13363     PERL_ARGS_ASSERT_REGUNI;
13364
13365     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13366 }
13367
13368 /*
13369 - reginsert - insert an operator in front of already-emitted operand
13370 *
13371 * Means relocating the operand.
13372 */
13373 STATIC void
13374 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13375 {
13376     dVAR;
13377     regnode *src;
13378     regnode *dst;
13379     regnode *place;
13380     const int offset = regarglen[(U8)op];
13381     const int size = NODE_STEP_REGNODE + offset;
13382     GET_RE_DEBUG_FLAGS_DECL;
13383
13384     PERL_ARGS_ASSERT_REGINSERT;
13385     PERL_UNUSED_ARG(depth);
13386 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13387     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13388     if (SIZE_ONLY) {
13389         RExC_size += size;
13390         return;
13391     }
13392
13393     src = RExC_emit;
13394     RExC_emit += size;
13395     dst = RExC_emit;
13396     if (RExC_open_parens) {
13397         int paren;
13398         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13399         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13400             if ( RExC_open_parens[paren] >= opnd ) {
13401                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13402                 RExC_open_parens[paren] += size;
13403             } else {
13404                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13405             }
13406             if ( RExC_close_parens[paren] >= opnd ) {
13407                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13408                 RExC_close_parens[paren] += size;
13409             } else {
13410                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13411             }
13412         }
13413     }
13414
13415     while (src > opnd) {
13416         StructCopy(--src, --dst, regnode);
13417 #ifdef RE_TRACK_PATTERN_OFFSETS
13418         if (RExC_offsets) {     /* MJD 20010112 */
13419             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13420                   "reg_insert",
13421                   __LINE__,
13422                   PL_reg_name[op],
13423                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13424                     ? "Overwriting end of array!\n" : "OK",
13425                   (UV)(src - RExC_emit_start),
13426                   (UV)(dst - RExC_emit_start),
13427                   (UV)RExC_offsets[0]));
13428             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13429             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13430         }
13431 #endif
13432     }
13433
13434
13435     place = opnd;               /* Op node, where operand used to be. */
13436 #ifdef RE_TRACK_PATTERN_OFFSETS
13437     if (RExC_offsets) {         /* MJD */
13438         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13439               "reginsert",
13440               __LINE__,
13441               PL_reg_name[op],
13442               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13443               ? "Overwriting end of array!\n" : "OK",
13444               (UV)(place - RExC_emit_start),
13445               (UV)(RExC_parse - RExC_start),
13446               (UV)RExC_offsets[0]));
13447         Set_Node_Offset(place, RExC_parse);
13448         Set_Node_Length(place, 1);
13449     }
13450 #endif
13451     src = NEXTOPER(place);
13452     FILL_ADVANCE_NODE(place, op);
13453     Zero(src, offset, regnode);
13454 }
13455
13456 /*
13457 - regtail - set the next-pointer at the end of a node chain of p to val.
13458 - SEE ALSO: regtail_study
13459 */
13460 /* TODO: All three parms should be const */
13461 STATIC void
13462 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13463 {
13464     dVAR;
13465     regnode *scan;
13466     GET_RE_DEBUG_FLAGS_DECL;
13467
13468     PERL_ARGS_ASSERT_REGTAIL;
13469 #ifndef DEBUGGING
13470     PERL_UNUSED_ARG(depth);
13471 #endif
13472
13473     if (SIZE_ONLY)
13474         return;
13475
13476     /* Find last node. */
13477     scan = p;
13478     for (;;) {
13479         regnode * const temp = regnext(scan);
13480         DEBUG_PARSE_r({
13481             SV * const mysv=sv_newmortal();
13482             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13483             regprop(RExC_rx, mysv, scan);
13484             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13485                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13486                     (temp == NULL ? "->" : ""),
13487                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13488             );
13489         });
13490         if (temp == NULL)
13491             break;
13492         scan = temp;
13493     }
13494
13495     if (reg_off_by_arg[OP(scan)]) {
13496         ARG_SET(scan, val - scan);
13497     }
13498     else {
13499         NEXT_OFF(scan) = val - scan;
13500     }
13501 }
13502
13503 #ifdef DEBUGGING
13504 /*
13505 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13506 - Look for optimizable sequences at the same time.
13507 - currently only looks for EXACT chains.
13508
13509 This is experimental code. The idea is to use this routine to perform
13510 in place optimizations on branches and groups as they are constructed,
13511 with the long term intention of removing optimization from study_chunk so
13512 that it is purely analytical.
13513
13514 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13515 to control which is which.
13516
13517 */
13518 /* TODO: All four parms should be const */
13519
13520 STATIC U8
13521 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13522 {
13523     dVAR;
13524     regnode *scan;
13525     U8 exact = PSEUDO;
13526 #ifdef EXPERIMENTAL_INPLACESCAN
13527     I32 min = 0;
13528 #endif
13529     GET_RE_DEBUG_FLAGS_DECL;
13530
13531     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13532
13533
13534     if (SIZE_ONLY)
13535         return exact;
13536
13537     /* Find last node. */
13538
13539     scan = p;
13540     for (;;) {
13541         regnode * const temp = regnext(scan);
13542 #ifdef EXPERIMENTAL_INPLACESCAN
13543         if (PL_regkind[OP(scan)] == EXACT) {
13544             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13545             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13546                 return EXACT;
13547         }
13548 #endif
13549         if ( exact ) {
13550             switch (OP(scan)) {
13551                 case EXACT:
13552                 case EXACTF:
13553                 case EXACTFA:
13554                 case EXACTFU:
13555                 case EXACTFU_SS:
13556                 case EXACTFU_TRICKYFOLD:
13557                 case EXACTFL:
13558                         if( exact == PSEUDO )
13559                             exact= OP(scan);
13560                         else if ( exact != OP(scan) )
13561                             exact= 0;
13562                 case NOTHING:
13563                     break;
13564                 default:
13565                     exact= 0;
13566             }
13567         }
13568         DEBUG_PARSE_r({
13569             SV * const mysv=sv_newmortal();
13570             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13571             regprop(RExC_rx, mysv, scan);
13572             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13573                 SvPV_nolen_const(mysv),
13574                 REG_NODE_NUM(scan),
13575                 PL_reg_name[exact]);
13576         });
13577         if (temp == NULL)
13578             break;
13579         scan = temp;
13580     }
13581     DEBUG_PARSE_r({
13582         SV * const mysv_val=sv_newmortal();
13583         DEBUG_PARSE_MSG("");
13584         regprop(RExC_rx, mysv_val, val);
13585         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13586                       SvPV_nolen_const(mysv_val),
13587                       (IV)REG_NODE_NUM(val),
13588                       (IV)(val - scan)
13589         );
13590     });
13591     if (reg_off_by_arg[OP(scan)]) {
13592         ARG_SET(scan, val - scan);
13593     }
13594     else {
13595         NEXT_OFF(scan) = val - scan;
13596     }
13597
13598     return exact;
13599 }
13600 #endif
13601
13602 /*
13603  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13604  */
13605 #ifdef DEBUGGING
13606 static void
13607 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13608 {
13609     int bit;
13610     int set=0;
13611     regex_charset cs;
13612
13613     for (bit=0; bit<32; bit++) {
13614         if (flags & (1<<bit)) {
13615             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13616                 continue;
13617             }
13618             if (!set++ && lead)
13619                 PerlIO_printf(Perl_debug_log, "%s",lead);
13620             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13621         }
13622     }
13623     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13624             if (!set++ && lead) {
13625                 PerlIO_printf(Perl_debug_log, "%s",lead);
13626             }
13627             switch (cs) {
13628                 case REGEX_UNICODE_CHARSET:
13629                     PerlIO_printf(Perl_debug_log, "UNICODE");
13630                     break;
13631                 case REGEX_LOCALE_CHARSET:
13632                     PerlIO_printf(Perl_debug_log, "LOCALE");
13633                     break;
13634                 case REGEX_ASCII_RESTRICTED_CHARSET:
13635                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13636                     break;
13637                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13638                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13639                     break;
13640                 default:
13641                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13642                     break;
13643             }
13644     }
13645     if (lead)  {
13646         if (set)
13647             PerlIO_printf(Perl_debug_log, "\n");
13648         else
13649             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13650     }
13651 }
13652 #endif
13653
13654 void
13655 Perl_regdump(pTHX_ const regexp *r)
13656 {
13657 #ifdef DEBUGGING
13658     dVAR;
13659     SV * const sv = sv_newmortal();
13660     SV *dsv= sv_newmortal();
13661     RXi_GET_DECL(r,ri);
13662     GET_RE_DEBUG_FLAGS_DECL;
13663
13664     PERL_ARGS_ASSERT_REGDUMP;
13665
13666     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13667
13668     /* Header fields of interest. */
13669     if (r->anchored_substr) {
13670         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13671             RE_SV_DUMPLEN(r->anchored_substr), 30);
13672         PerlIO_printf(Perl_debug_log,
13673                       "anchored %s%s at %"IVdf" ",
13674                       s, RE_SV_TAIL(r->anchored_substr),
13675                       (IV)r->anchored_offset);
13676     } else if (r->anchored_utf8) {
13677         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13678             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13679         PerlIO_printf(Perl_debug_log,
13680                       "anchored utf8 %s%s at %"IVdf" ",
13681                       s, RE_SV_TAIL(r->anchored_utf8),
13682                       (IV)r->anchored_offset);
13683     }
13684     if (r->float_substr) {
13685         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13686             RE_SV_DUMPLEN(r->float_substr), 30);
13687         PerlIO_printf(Perl_debug_log,
13688                       "floating %s%s at %"IVdf"..%"UVuf" ",
13689                       s, RE_SV_TAIL(r->float_substr),
13690                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13691     } else if (r->float_utf8) {
13692         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13693             RE_SV_DUMPLEN(r->float_utf8), 30);
13694         PerlIO_printf(Perl_debug_log,
13695                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13696                       s, RE_SV_TAIL(r->float_utf8),
13697                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13698     }
13699     if (r->check_substr || r->check_utf8)
13700         PerlIO_printf(Perl_debug_log,
13701                       (const char *)
13702                       (r->check_substr == r->float_substr
13703                        && r->check_utf8 == r->float_utf8
13704                        ? "(checking floating" : "(checking anchored"));
13705     if (r->extflags & RXf_NOSCAN)
13706         PerlIO_printf(Perl_debug_log, " noscan");
13707     if (r->extflags & RXf_CHECK_ALL)
13708         PerlIO_printf(Perl_debug_log, " isall");
13709     if (r->check_substr || r->check_utf8)
13710         PerlIO_printf(Perl_debug_log, ") ");
13711
13712     if (ri->regstclass) {
13713         regprop(r, sv, ri->regstclass);
13714         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13715     }
13716     if (r->extflags & RXf_ANCH) {
13717         PerlIO_printf(Perl_debug_log, "anchored");
13718         if (r->extflags & RXf_ANCH_BOL)
13719             PerlIO_printf(Perl_debug_log, "(BOL)");
13720         if (r->extflags & RXf_ANCH_MBOL)
13721             PerlIO_printf(Perl_debug_log, "(MBOL)");
13722         if (r->extflags & RXf_ANCH_SBOL)
13723             PerlIO_printf(Perl_debug_log, "(SBOL)");
13724         if (r->extflags & RXf_ANCH_GPOS)
13725             PerlIO_printf(Perl_debug_log, "(GPOS)");
13726         PerlIO_putc(Perl_debug_log, ' ');
13727     }
13728     if (r->extflags & RXf_GPOS_SEEN)
13729         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13730     if (r->intflags & PREGf_SKIP)
13731         PerlIO_printf(Perl_debug_log, "plus ");
13732     if (r->intflags & PREGf_IMPLICIT)
13733         PerlIO_printf(Perl_debug_log, "implicit ");
13734     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13735     if (r->extflags & RXf_EVAL_SEEN)
13736         PerlIO_printf(Perl_debug_log, "with eval ");
13737     PerlIO_printf(Perl_debug_log, "\n");
13738     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13739 #else
13740     PERL_ARGS_ASSERT_REGDUMP;
13741     PERL_UNUSED_CONTEXT;
13742     PERL_UNUSED_ARG(r);
13743 #endif  /* DEBUGGING */
13744 }
13745
13746 /*
13747 - regprop - printable representation of opcode
13748 */
13749 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13750 STMT_START { \
13751         if (do_sep) {                           \
13752             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13753             if (flags & ANYOF_INVERT)           \
13754                 /*make sure the invert info is in each */ \
13755                 sv_catpvs(sv, "^");             \
13756             do_sep = 0;                         \
13757         }                                       \
13758 } STMT_END
13759
13760 void
13761 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13762 {
13763 #ifdef DEBUGGING
13764     dVAR;
13765     int k;
13766
13767     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13768     static const char * const anyofs[] = {
13769         "\\w",
13770         "\\W",
13771         "\\s",
13772         "\\S",
13773         "\\d",
13774         "\\D",
13775         "[:alnum:]",
13776         "[:^alnum:]",
13777         "[:alpha:]",
13778         "[:^alpha:]",
13779         "[:ascii:]",
13780         "[:^ascii:]",
13781         "[:cntrl:]",
13782         "[:^cntrl:]",
13783         "[:graph:]",
13784         "[:^graph:]",
13785         "[:lower:]",
13786         "[:^lower:]",
13787         "[:print:]",
13788         "[:^print:]",
13789         "[:punct:]",
13790         "[:^punct:]",
13791         "[:upper:]",
13792         "[:^upper:]",
13793         "[:xdigit:]",
13794         "[:^xdigit:]",
13795         "[:space:]",
13796         "[:^space:]",
13797         "[:blank:]",
13798         "[:^blank:]"
13799     };
13800     RXi_GET_DECL(prog,progi);
13801     GET_RE_DEBUG_FLAGS_DECL;
13802
13803     PERL_ARGS_ASSERT_REGPROP;
13804
13805     sv_setpvs(sv, "");
13806
13807     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13808         /* It would be nice to FAIL() here, but this may be called from
13809            regexec.c, and it would be hard to supply pRExC_state. */
13810         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13811     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13812
13813     k = PL_regkind[OP(o)];
13814
13815     if (k == EXACT) {
13816         sv_catpvs(sv, " ");
13817         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13818          * is a crude hack but it may be the best for now since
13819          * we have no flag "this EXACTish node was UTF-8"
13820          * --jhi */
13821         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13822                   PERL_PV_ESCAPE_UNI_DETECT |
13823                   PERL_PV_ESCAPE_NONASCII   |
13824                   PERL_PV_PRETTY_ELLIPSES   |
13825                   PERL_PV_PRETTY_LTGT       |
13826                   PERL_PV_PRETTY_NOCLEAR
13827                   );
13828     } else if (k == TRIE) {
13829         /* print the details of the trie in dumpuntil instead, as
13830          * progi->data isn't available here */
13831         const char op = OP(o);
13832         const U32 n = ARG(o);
13833         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13834                (reg_ac_data *)progi->data->data[n] :
13835                NULL;
13836         const reg_trie_data * const trie
13837             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13838
13839         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13840         DEBUG_TRIE_COMPILE_r(
13841             Perl_sv_catpvf(aTHX_ sv,
13842                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13843                 (UV)trie->startstate,
13844                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13845                 (UV)trie->wordcount,
13846                 (UV)trie->minlen,
13847                 (UV)trie->maxlen,
13848                 (UV)TRIE_CHARCOUNT(trie),
13849                 (UV)trie->uniquecharcount
13850             )
13851         );
13852         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13853             int i;
13854             int rangestart = -1;
13855             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13856             sv_catpvs(sv, "[");
13857             for (i = 0; i <= 256; i++) {
13858                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13859                     if (rangestart == -1)
13860                         rangestart = i;
13861                 } else if (rangestart != -1) {
13862                     if (i <= rangestart + 3)
13863                         for (; rangestart < i; rangestart++)
13864                             put_byte(sv, rangestart);
13865                     else {
13866                         put_byte(sv, rangestart);
13867                         sv_catpvs(sv, "-");
13868                         put_byte(sv, i - 1);
13869                     }
13870                     rangestart = -1;
13871                 }
13872             }
13873             sv_catpvs(sv, "]");
13874         }
13875
13876     } else if (k == CURLY) {
13877         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13878             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13879         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13880     }
13881     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13882         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13883     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13884         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13885         if ( RXp_PAREN_NAMES(prog) ) {
13886             if ( k != REF || (OP(o) < NREF)) {
13887                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13888                 SV **name= av_fetch(list, ARG(o), 0 );
13889                 if (name)
13890                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13891             }
13892             else {
13893                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13894                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13895                 I32 *nums=(I32*)SvPVX(sv_dat);
13896                 SV **name= av_fetch(list, nums[0], 0 );
13897                 I32 n;
13898                 if (name) {
13899                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13900                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13901                                     (n ? "," : ""), (IV)nums[n]);
13902                     }
13903                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13904                 }
13905             }
13906         }
13907     } else if (k == GOSUB)
13908         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13909     else if (k == VERB) {
13910         if (!o->flags)
13911             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13912                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13913     } else if (k == LOGICAL)
13914         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13915     else if (k == ANYOF) {
13916         int i, rangestart = -1;
13917         const U8 flags = ANYOF_FLAGS(o);
13918         int do_sep = 0;
13919
13920
13921         if (flags & ANYOF_LOCALE)
13922             sv_catpvs(sv, "{loc}");
13923         if (flags & ANYOF_LOC_FOLD)
13924             sv_catpvs(sv, "{i}");
13925         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13926         if (flags & ANYOF_INVERT)
13927             sv_catpvs(sv, "^");
13928
13929         /* output what the standard cp 0-255 bitmap matches */
13930         for (i = 0; i <= 256; i++) {
13931             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13932                 if (rangestart == -1)
13933                     rangestart = i;
13934             } else if (rangestart != -1) {
13935                 if (i <= rangestart + 3)
13936                     for (; rangestart < i; rangestart++)
13937                         put_byte(sv, rangestart);
13938                 else {
13939                     put_byte(sv, rangestart);
13940                     sv_catpvs(sv, "-");
13941                     put_byte(sv, i - 1);
13942                 }
13943                 do_sep = 1;
13944                 rangestart = -1;
13945             }
13946         }
13947
13948         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13949         /* output any special charclass tests (used entirely under use locale) */
13950         if (ANYOF_CLASS_TEST_ANY_SET(o))
13951             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13952                 if (ANYOF_CLASS_TEST(o,i)) {
13953                     sv_catpv(sv, anyofs[i]);
13954                     do_sep = 1;
13955                 }
13956
13957         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13958
13959         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13960             sv_catpvs(sv, "{non-utf8-latin1-all}");
13961         }
13962
13963         /* output information about the unicode matching */
13964         if (flags & ANYOF_UNICODE_ALL)
13965             sv_catpvs(sv, "{unicode_all}");
13966         else if (ANYOF_NONBITMAP(o))
13967             sv_catpvs(sv, "{unicode}");
13968         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13969             sv_catpvs(sv, "{outside bitmap}");
13970
13971         if (ANYOF_NONBITMAP(o)) {
13972             SV *lv; /* Set if there is something outside the bit map */
13973             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13974             bool byte_output = FALSE;   /* If something in the bitmap has been
13975                                            output */
13976
13977             if (lv && lv != &PL_sv_undef) {
13978                 if (sw) {
13979                     U8 s[UTF8_MAXBYTES_CASE+1];
13980
13981                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13982                         uvchr_to_utf8(s, i);
13983
13984                         if (i < 256
13985                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13986                                                                things already
13987                                                                output as part
13988                                                                of the bitmap */
13989                             && swash_fetch(sw, s, TRUE))
13990                         {
13991                             if (rangestart == -1)
13992                                 rangestart = i;
13993                         } else if (rangestart != -1) {
13994                             byte_output = TRUE;
13995                             if (i <= rangestart + 3)
13996                                 for (; rangestart < i; rangestart++) {
13997                                     put_byte(sv, rangestart);
13998                                 }
13999                             else {
14000                                 put_byte(sv, rangestart);
14001                                 sv_catpvs(sv, "-");
14002                                 put_byte(sv, i-1);
14003                             }
14004                             rangestart = -1;
14005                         }
14006                     }
14007                 }
14008
14009                 {
14010                     char *s = savesvpv(lv);
14011                     char * const origs = s;
14012
14013                     while (*s && *s != '\n')
14014                         s++;
14015
14016                     if (*s == '\n') {
14017                         const char * const t = ++s;
14018
14019                         if (byte_output) {
14020                             sv_catpvs(sv, " ");
14021                         }
14022
14023                         while (*s) {
14024                             if (*s == '\n') {
14025
14026                                 /* Truncate very long output */
14027                                 if (s - origs > 256) {
14028                                     Perl_sv_catpvf(aTHX_ sv,
14029                                                    "%.*s...",
14030                                                    (int) (s - origs - 1),
14031                                                    t);
14032                                     goto out_dump;
14033                                 }
14034                                 *s = ' ';
14035                             }
14036                             else if (*s == '\t') {
14037                                 *s = '-';
14038                             }
14039                             s++;
14040                         }
14041                         if (s[-1] == ' ')
14042                             s[-1] = 0;
14043
14044                         sv_catpv(sv, t);
14045                     }
14046
14047                 out_dump:
14048
14049                     Safefree(origs);
14050                 }
14051                 SvREFCNT_dec(lv);
14052             }
14053         }
14054
14055         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14056     }
14057     else if (k == POSIXD) {
14058         U8 index = FLAGS(o) * 2;
14059         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14060             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14061         }
14062         else {
14063             sv_catpv(sv, anyofs[index]);
14064         }
14065     }
14066     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14067         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14068 #else
14069     PERL_UNUSED_CONTEXT;
14070     PERL_UNUSED_ARG(sv);
14071     PERL_UNUSED_ARG(o);
14072     PERL_UNUSED_ARG(prog);
14073 #endif  /* DEBUGGING */
14074 }
14075
14076 SV *
14077 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14078 {                               /* Assume that RE_INTUIT is set */
14079     dVAR;
14080     struct regexp *const prog = (struct regexp *)SvANY(r);
14081     GET_RE_DEBUG_FLAGS_DECL;
14082
14083     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14084     PERL_UNUSED_CONTEXT;
14085
14086     DEBUG_COMPILE_r(
14087         {
14088             const char * const s = SvPV_nolen_const(prog->check_substr
14089                       ? prog->check_substr : prog->check_utf8);
14090
14091             if (!PL_colorset) reginitcolors();
14092             PerlIO_printf(Perl_debug_log,
14093                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14094                       PL_colors[4],
14095                       prog->check_substr ? "" : "utf8 ",
14096                       PL_colors[5],PL_colors[0],
14097                       s,
14098                       PL_colors[1],
14099                       (strlen(s) > 60 ? "..." : ""));
14100         } );
14101
14102     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14103 }
14104
14105 /*
14106    pregfree()
14107
14108    handles refcounting and freeing the perl core regexp structure. When
14109    it is necessary to actually free the structure the first thing it
14110    does is call the 'free' method of the regexp_engine associated to
14111    the regexp, allowing the handling of the void *pprivate; member
14112    first. (This routine is not overridable by extensions, which is why
14113    the extensions free is called first.)
14114
14115    See regdupe and regdupe_internal if you change anything here.
14116 */
14117 #ifndef PERL_IN_XSUB_RE
14118 void
14119 Perl_pregfree(pTHX_ REGEXP *r)
14120 {
14121     SvREFCNT_dec(r);
14122 }
14123
14124 void
14125 Perl_pregfree2(pTHX_ REGEXP *rx)
14126 {
14127     dVAR;
14128     struct regexp *const r = (struct regexp *)SvANY(rx);
14129     GET_RE_DEBUG_FLAGS_DECL;
14130
14131     PERL_ARGS_ASSERT_PREGFREE2;
14132
14133     if (r->mother_re) {
14134         ReREFCNT_dec(r->mother_re);
14135     } else {
14136         CALLREGFREE_PVT(rx); /* free the private data */
14137         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14138     }
14139     if (r->substrs) {
14140         SvREFCNT_dec(r->anchored_substr);
14141         SvREFCNT_dec(r->anchored_utf8);
14142         SvREFCNT_dec(r->float_substr);
14143         SvREFCNT_dec(r->float_utf8);
14144         Safefree(r->substrs);
14145     }
14146     RX_MATCH_COPY_FREE(rx);
14147 #ifdef PERL_OLD_COPY_ON_WRITE
14148     SvREFCNT_dec(r->saved_copy);
14149 #endif
14150     Safefree(r->offs);
14151     SvREFCNT_dec(r->qr_anoncv);
14152 }
14153
14154 /*  reg_temp_copy()
14155
14156     This is a hacky workaround to the structural issue of match results
14157     being stored in the regexp structure which is in turn stored in
14158     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14159     could be PL_curpm in multiple contexts, and could require multiple
14160     result sets being associated with the pattern simultaneously, such
14161     as when doing a recursive match with (??{$qr})
14162
14163     The solution is to make a lightweight copy of the regexp structure
14164     when a qr// is returned from the code executed by (??{$qr}) this
14165     lightweight copy doesn't actually own any of its data except for
14166     the starp/end and the actual regexp structure itself.
14167
14168 */
14169
14170
14171 REGEXP *
14172 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14173 {
14174     struct regexp *ret;
14175     struct regexp *const r = (struct regexp *)SvANY(rx);
14176
14177     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14178
14179     if (!ret_x)
14180         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14181     ret = (struct regexp *)SvANY(ret_x);
14182
14183     (void)ReREFCNT_inc(rx);
14184     /* We can take advantage of the existing "copied buffer" mechanism in SVs
14185        by pointing directly at the buffer, but flagging that the allocated
14186        space in the copy is zero. As we've just done a struct copy, it's now
14187        a case of zero-ing that, rather than copying the current length.  */
14188     SvPV_set(ret_x, RX_WRAPPED(rx));
14189     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14190     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14191            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14192     SvLEN_set(ret_x, 0);
14193     SvSTASH_set(ret_x, NULL);
14194     SvMAGIC_set(ret_x, NULL);
14195     if (r->offs) {
14196         const I32 npar = r->nparens+1;
14197         Newx(ret->offs, npar, regexp_paren_pair);
14198         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14199     }
14200     if (r->substrs) {
14201         Newx(ret->substrs, 1, struct reg_substr_data);
14202         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14203
14204         SvREFCNT_inc_void(ret->anchored_substr);
14205         SvREFCNT_inc_void(ret->anchored_utf8);
14206         SvREFCNT_inc_void(ret->float_substr);
14207         SvREFCNT_inc_void(ret->float_utf8);
14208
14209         /* check_substr and check_utf8, if non-NULL, point to either their
14210            anchored or float namesakes, and don't hold a second reference.  */
14211     }
14212     RX_MATCH_COPIED_off(ret_x);
14213 #ifdef PERL_OLD_COPY_ON_WRITE
14214     ret->saved_copy = NULL;
14215 #endif
14216     ret->mother_re = rx;
14217     SvREFCNT_inc_void(ret->qr_anoncv);
14218
14219     return ret_x;
14220 }
14221 #endif
14222
14223 /* regfree_internal()
14224
14225    Free the private data in a regexp. This is overloadable by
14226    extensions. Perl takes care of the regexp structure in pregfree(),
14227    this covers the *pprivate pointer which technically perl doesn't
14228    know about, however of course we have to handle the
14229    regexp_internal structure when no extension is in use.
14230
14231    Note this is called before freeing anything in the regexp
14232    structure.
14233  */
14234
14235 void
14236 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14237 {
14238     dVAR;
14239     struct regexp *const r = (struct regexp *)SvANY(rx);
14240     RXi_GET_DECL(r,ri);
14241     GET_RE_DEBUG_FLAGS_DECL;
14242
14243     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14244
14245     DEBUG_COMPILE_r({
14246         if (!PL_colorset)
14247             reginitcolors();
14248         {
14249             SV *dsv= sv_newmortal();
14250             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14251                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14252             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14253                 PL_colors[4],PL_colors[5],s);
14254         }
14255     });
14256 #ifdef RE_TRACK_PATTERN_OFFSETS
14257     if (ri->u.offsets)
14258         Safefree(ri->u.offsets);             /* 20010421 MJD */
14259 #endif
14260     if (ri->code_blocks) {
14261         int n;
14262         for (n = 0; n < ri->num_code_blocks; n++)
14263             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14264         Safefree(ri->code_blocks);
14265     }
14266
14267     if (ri->data) {
14268         int n = ri->data->count;
14269
14270         while (--n >= 0) {
14271           /* If you add a ->what type here, update the comment in regcomp.h */
14272             switch (ri->data->what[n]) {
14273             case 'a':
14274             case 'r':
14275             case 's':
14276             case 'S':
14277             case 'u':
14278                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14279                 break;
14280             case 'f':
14281                 Safefree(ri->data->data[n]);
14282                 break;
14283             case 'l':
14284             case 'L':
14285                 break;
14286             case 'T':
14287                 { /* Aho Corasick add-on structure for a trie node.
14288                      Used in stclass optimization only */
14289                     U32 refcount;
14290                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14291                     OP_REFCNT_LOCK;
14292                     refcount = --aho->refcount;
14293                     OP_REFCNT_UNLOCK;
14294                     if ( !refcount ) {
14295                         PerlMemShared_free(aho->states);
14296                         PerlMemShared_free(aho->fail);
14297                          /* do this last!!!! */
14298                         PerlMemShared_free(ri->data->data[n]);
14299                         PerlMemShared_free(ri->regstclass);
14300                     }
14301                 }
14302                 break;
14303             case 't':
14304                 {
14305                     /* trie structure. */
14306                     U32 refcount;
14307                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14308                     OP_REFCNT_LOCK;
14309                     refcount = --trie->refcount;
14310                     OP_REFCNT_UNLOCK;
14311                     if ( !refcount ) {
14312                         PerlMemShared_free(trie->charmap);
14313                         PerlMemShared_free(trie->states);
14314                         PerlMemShared_free(trie->trans);
14315                         if (trie->bitmap)
14316                             PerlMemShared_free(trie->bitmap);
14317                         if (trie->jump)
14318                             PerlMemShared_free(trie->jump);
14319                         PerlMemShared_free(trie->wordinfo);
14320                         /* do this last!!!! */
14321                         PerlMemShared_free(ri->data->data[n]);
14322                     }
14323                 }
14324                 break;
14325             default:
14326                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14327             }
14328         }
14329         Safefree(ri->data->what);
14330         Safefree(ri->data);
14331     }
14332
14333     Safefree(ri);
14334 }
14335
14336 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14337 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14338 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14339
14340 /*
14341    re_dup - duplicate a regexp.
14342
14343    This routine is expected to clone a given regexp structure. It is only
14344    compiled under USE_ITHREADS.
14345
14346    After all of the core data stored in struct regexp is duplicated
14347    the regexp_engine.dupe method is used to copy any private data
14348    stored in the *pprivate pointer. This allows extensions to handle
14349    any duplication it needs to do.
14350
14351    See pregfree() and regfree_internal() if you change anything here.
14352 */
14353 #if defined(USE_ITHREADS)
14354 #ifndef PERL_IN_XSUB_RE
14355 void
14356 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14357 {
14358     dVAR;
14359     I32 npar;
14360     const struct regexp *r = (const struct regexp *)SvANY(sstr);
14361     struct regexp *ret = (struct regexp *)SvANY(dstr);
14362
14363     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14364
14365     npar = r->nparens+1;
14366     Newx(ret->offs, npar, regexp_paren_pair);
14367     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14368     if(ret->swap) {
14369         /* no need to copy these */
14370         Newx(ret->swap, npar, regexp_paren_pair);
14371     }
14372
14373     if (ret->substrs) {
14374         /* Do it this way to avoid reading from *r after the StructCopy().
14375            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14376            cache, it doesn't matter.  */
14377         const bool anchored = r->check_substr
14378             ? r->check_substr == r->anchored_substr
14379             : r->check_utf8 == r->anchored_utf8;
14380         Newx(ret->substrs, 1, struct reg_substr_data);
14381         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14382
14383         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14384         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14385         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14386         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14387
14388         /* check_substr and check_utf8, if non-NULL, point to either their
14389            anchored or float namesakes, and don't hold a second reference.  */
14390
14391         if (ret->check_substr) {
14392             if (anchored) {
14393                 assert(r->check_utf8 == r->anchored_utf8);
14394                 ret->check_substr = ret->anchored_substr;
14395                 ret->check_utf8 = ret->anchored_utf8;
14396             } else {
14397                 assert(r->check_substr == r->float_substr);
14398                 assert(r->check_utf8 == r->float_utf8);
14399                 ret->check_substr = ret->float_substr;
14400                 ret->check_utf8 = ret->float_utf8;
14401             }
14402         } else if (ret->check_utf8) {
14403             if (anchored) {
14404                 ret->check_utf8 = ret->anchored_utf8;
14405             } else {
14406                 ret->check_utf8 = ret->float_utf8;
14407             }
14408         }
14409     }
14410
14411     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14412     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14413
14414     if (ret->pprivate)
14415         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14416
14417     if (RX_MATCH_COPIED(dstr))
14418         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14419     else
14420         ret->subbeg = NULL;
14421 #ifdef PERL_OLD_COPY_ON_WRITE
14422     ret->saved_copy = NULL;
14423 #endif
14424
14425     if (ret->mother_re) {
14426         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14427             /* Our storage points directly to our mother regexp, but that's
14428                1: a buffer in a different thread
14429                2: something we no longer hold a reference on
14430                so we need to copy it locally.  */
14431             /* Note we need to use SvCUR(), rather than
14432                SvLEN(), on our mother_re, because it, in
14433                turn, may well be pointing to its own mother_re.  */
14434             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14435                                    SvCUR(ret->mother_re)+1));
14436             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14437         }
14438         ret->mother_re      = NULL;
14439     }
14440     ret->gofs = 0;
14441 }
14442 #endif /* PERL_IN_XSUB_RE */
14443
14444 /*
14445    regdupe_internal()
14446
14447    This is the internal complement to regdupe() which is used to copy
14448    the structure pointed to by the *pprivate pointer in the regexp.
14449    This is the core version of the extension overridable cloning hook.
14450    The regexp structure being duplicated will be copied by perl prior
14451    to this and will be provided as the regexp *r argument, however
14452    with the /old/ structures pprivate pointer value. Thus this routine
14453    may override any copying normally done by perl.
14454
14455    It returns a pointer to the new regexp_internal structure.
14456 */
14457
14458 void *
14459 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14460 {
14461     dVAR;
14462     struct regexp *const r = (struct regexp *)SvANY(rx);
14463     regexp_internal *reti;
14464     int len;
14465     RXi_GET_DECL(r,ri);
14466
14467     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14468
14469     len = ProgLen(ri);
14470
14471     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14472     Copy(ri->program, reti->program, len+1, regnode);
14473
14474     reti->num_code_blocks = ri->num_code_blocks;
14475     if (ri->code_blocks) {
14476         int n;
14477         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14478                 struct reg_code_block);
14479         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14480                 struct reg_code_block);
14481         for (n = 0; n < ri->num_code_blocks; n++)
14482              reti->code_blocks[n].src_regex = (REGEXP*)
14483                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14484     }
14485     else
14486         reti->code_blocks = NULL;
14487
14488     reti->regstclass = NULL;
14489
14490     if (ri->data) {
14491         struct reg_data *d;
14492         const int count = ri->data->count;
14493         int i;
14494
14495         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14496                 char, struct reg_data);
14497         Newx(d->what, count, U8);
14498
14499         d->count = count;
14500         for (i = 0; i < count; i++) {
14501             d->what[i] = ri->data->what[i];
14502             switch (d->what[i]) {
14503                 /* see also regcomp.h and regfree_internal() */
14504             case 'a': /* actually an AV, but the dup function is identical.  */
14505             case 'r':
14506             case 's':
14507             case 'S':
14508             case 'u': /* actually an HV, but the dup function is identical.  */
14509                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14510                 break;
14511             case 'f':
14512                 /* This is cheating. */
14513                 Newx(d->data[i], 1, struct regnode_charclass_class);
14514                 StructCopy(ri->data->data[i], d->data[i],
14515                             struct regnode_charclass_class);
14516                 reti->regstclass = (regnode*)d->data[i];
14517                 break;
14518             case 'T':
14519                 /* Trie stclasses are readonly and can thus be shared
14520                  * without duplication. We free the stclass in pregfree
14521                  * when the corresponding reg_ac_data struct is freed.
14522                  */
14523                 reti->regstclass= ri->regstclass;
14524                 /* Fall through */
14525             case 't':
14526                 OP_REFCNT_LOCK;
14527                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14528                 OP_REFCNT_UNLOCK;
14529                 /* Fall through */
14530             case 'l':
14531             case 'L':
14532                 d->data[i] = ri->data->data[i];
14533                 break;
14534             default:
14535                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14536             }
14537         }
14538
14539         reti->data = d;
14540     }
14541     else
14542         reti->data = NULL;
14543
14544     reti->name_list_idx = ri->name_list_idx;
14545
14546 #ifdef RE_TRACK_PATTERN_OFFSETS
14547     if (ri->u.offsets) {
14548         Newx(reti->u.offsets, 2*len+1, U32);
14549         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14550     }
14551 #else
14552     SetProgLen(reti,len);
14553 #endif
14554
14555     return (void*)reti;
14556 }
14557
14558 #endif    /* USE_ITHREADS */
14559
14560 #ifndef PERL_IN_XSUB_RE
14561
14562 /*
14563  - regnext - dig the "next" pointer out of a node
14564  */
14565 regnode *
14566 Perl_regnext(pTHX_ register regnode *p)
14567 {
14568     dVAR;
14569     I32 offset;
14570
14571     if (!p)
14572         return(NULL);
14573
14574     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14575         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14576     }
14577
14578     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14579     if (offset == 0)
14580         return(NULL);
14581
14582     return(p+offset);
14583 }
14584 #endif
14585
14586 STATIC void
14587 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14588 {
14589     va_list args;
14590     STRLEN l1 = strlen(pat1);
14591     STRLEN l2 = strlen(pat2);
14592     char buf[512];
14593     SV *msv;
14594     const char *message;
14595
14596     PERL_ARGS_ASSERT_RE_CROAK2;
14597
14598     if (l1 > 510)
14599         l1 = 510;
14600     if (l1 + l2 > 510)
14601         l2 = 510 - l1;
14602     Copy(pat1, buf, l1 , char);
14603     Copy(pat2, buf + l1, l2 , char);
14604     buf[l1 + l2] = '\n';
14605     buf[l1 + l2 + 1] = '\0';
14606 #ifdef I_STDARG
14607     /* ANSI variant takes additional second argument */
14608     va_start(args, pat2);
14609 #else
14610     va_start(args);
14611 #endif
14612     msv = vmess(buf, &args);
14613     va_end(args);
14614     message = SvPV_const(msv,l1);
14615     if (l1 > 512)
14616         l1 = 512;
14617     Copy(message, buf, l1 , char);
14618     buf[l1-1] = '\0';                   /* Overwrite \n */
14619     Perl_croak(aTHX_ "%s", buf);
14620 }
14621
14622 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14623
14624 #ifndef PERL_IN_XSUB_RE
14625 void
14626 Perl_save_re_context(pTHX)
14627 {
14628     dVAR;
14629
14630     struct re_save_state *state;
14631
14632     SAVEVPTR(PL_curcop);
14633     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14634
14635     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14636     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14637     SSPUSHUV(SAVEt_RE_STATE);
14638
14639     Copy(&PL_reg_state, state, 1, struct re_save_state);
14640
14641     PL_reg_oldsaved = NULL;
14642     PL_reg_oldsavedlen = 0;
14643     PL_reg_oldsavedoffset = 0;
14644     PL_reg_oldsavedcoffset = 0;
14645     PL_reg_maxiter = 0;
14646     PL_reg_leftiter = 0;
14647     PL_reg_poscache = NULL;
14648     PL_reg_poscache_size = 0;
14649 #ifdef PERL_OLD_COPY_ON_WRITE
14650     PL_nrs = NULL;
14651 #endif
14652
14653     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14654     if (PL_curpm) {
14655         const REGEXP * const rx = PM_GETRE(PL_curpm);
14656         if (rx) {
14657             U32 i;
14658             for (i = 1; i <= RX_NPARENS(rx); i++) {
14659                 char digits[TYPE_CHARS(long)];
14660                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14661                 GV *const *const gvp
14662                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14663
14664                 if (gvp) {
14665                     GV * const gv = *gvp;
14666                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14667                         save_scalar(gv);
14668                 }
14669             }
14670         }
14671     }
14672 }
14673 #endif
14674
14675 static void
14676 clear_re(pTHX_ void *r)
14677 {
14678     dVAR;
14679     ReREFCNT_dec((REGEXP *)r);
14680 }
14681
14682 #ifdef DEBUGGING
14683
14684 STATIC void
14685 S_put_byte(pTHX_ SV *sv, int c)
14686 {
14687     PERL_ARGS_ASSERT_PUT_BYTE;
14688
14689     /* Our definition of isPRINT() ignores locales, so only bytes that are
14690        not part of UTF-8 are considered printable. I assume that the same
14691        holds for UTF-EBCDIC.
14692        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14693        which Wikipedia says:
14694
14695        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14696        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14697        identical, to the ASCII delete (DEL) or rubout control character.
14698        ) So the old condition can be simplified to !isPRINT(c)  */
14699     if (!isPRINT(c)) {
14700         if (c < 256) {
14701             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14702         }
14703         else {
14704             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14705         }
14706     }
14707     else {
14708         const char string = c;
14709         if (c == '-' || c == ']' || c == '\\' || c == '^')
14710             sv_catpvs(sv, "\\");
14711         sv_catpvn(sv, &string, 1);
14712     }
14713 }
14714
14715
14716 #define CLEAR_OPTSTART \
14717     if (optstart) STMT_START { \
14718             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14719             optstart=NULL; \
14720     } STMT_END
14721
14722 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14723
14724 STATIC const regnode *
14725 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14726             const regnode *last, const regnode *plast,
14727             SV* sv, I32 indent, U32 depth)
14728 {
14729     dVAR;
14730     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14731     const regnode *next;
14732     const regnode *optstart= NULL;
14733
14734     RXi_GET_DECL(r,ri);
14735     GET_RE_DEBUG_FLAGS_DECL;
14736
14737     PERL_ARGS_ASSERT_DUMPUNTIL;
14738
14739 #ifdef DEBUG_DUMPUNTIL
14740     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14741         last ? last-start : 0,plast ? plast-start : 0);
14742 #endif
14743
14744     if (plast && plast < last)
14745         last= plast;
14746
14747     while (PL_regkind[op] != END && (!last || node < last)) {
14748         /* While that wasn't END last time... */
14749         NODE_ALIGN(node);
14750         op = OP(node);
14751         if (op == CLOSE || op == WHILEM)
14752             indent--;
14753         next = regnext((regnode *)node);
14754
14755         /* Where, what. */
14756         if (OP(node) == OPTIMIZED) {
14757             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14758                 optstart = node;
14759             else
14760                 goto after_print;
14761         } else
14762             CLEAR_OPTSTART;
14763
14764         regprop(r, sv, node);
14765         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14766                       (int)(2*indent + 1), "", SvPVX_const(sv));
14767
14768         if (OP(node) != OPTIMIZED) {
14769             if (next == NULL)           /* Next ptr. */
14770                 PerlIO_printf(Perl_debug_log, " (0)");
14771             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14772                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14773             else
14774                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14775             (void)PerlIO_putc(Perl_debug_log, '\n');
14776         }
14777
14778       after_print:
14779         if (PL_regkind[(U8)op] == BRANCHJ) {
14780             assert(next);
14781             {
14782                 const regnode *nnode = (OP(next) == LONGJMP
14783                                        ? regnext((regnode *)next)
14784                                        : next);
14785                 if (last && nnode > last)
14786                     nnode = last;
14787                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14788             }
14789         }
14790         else if (PL_regkind[(U8)op] == BRANCH) {
14791             assert(next);
14792             DUMPUNTIL(NEXTOPER(node), next);
14793         }
14794         else if ( PL_regkind[(U8)op]  == TRIE ) {
14795             const regnode *this_trie = node;
14796             const char op = OP(node);
14797             const U32 n = ARG(node);
14798             const reg_ac_data * const ac = op>=AHOCORASICK ?
14799                (reg_ac_data *)ri->data->data[n] :
14800                NULL;
14801             const reg_trie_data * const trie =
14802                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14803 #ifdef DEBUGGING
14804             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14805 #endif
14806             const regnode *nextbranch= NULL;
14807             I32 word_idx;
14808             sv_setpvs(sv, "");
14809             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14810                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14811
14812                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14813                    (int)(2*(indent+3)), "",
14814                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14815                             PL_colors[0], PL_colors[1],
14816                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14817                             PERL_PV_PRETTY_ELLIPSES    |
14818                             PERL_PV_PRETTY_LTGT
14819                             )
14820                             : "???"
14821                 );
14822                 if (trie->jump) {
14823                     U16 dist= trie->jump[word_idx+1];
14824                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14825                                   (UV)((dist ? this_trie + dist : next) - start));
14826                     if (dist) {
14827                         if (!nextbranch)
14828                             nextbranch= this_trie + trie->jump[0];
14829                         DUMPUNTIL(this_trie + dist, nextbranch);
14830                     }
14831                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14832                         nextbranch= regnext((regnode *)nextbranch);
14833                 } else {
14834                     PerlIO_printf(Perl_debug_log, "\n");
14835                 }
14836             }
14837             if (last && next > last)
14838                 node= last;
14839             else
14840                 node= next;
14841         }
14842         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14843             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14844                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14845         }
14846         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14847             assert(next);
14848             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14849         }
14850         else if ( op == PLUS || op == STAR) {
14851             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14852         }
14853         else if (PL_regkind[(U8)op] == ANYOF) {
14854             /* arglen 1 + class block */
14855             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14856                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14857             node = NEXTOPER(node);
14858         }
14859         else if (PL_regkind[(U8)op] == EXACT) {
14860             /* Literal string, where present. */
14861             node += NODE_SZ_STR(node) - 1;
14862             node = NEXTOPER(node);
14863         }
14864         else {
14865             node = NEXTOPER(node);
14866             node += regarglen[(U8)op];
14867         }
14868         if (op == CURLYX || op == OPEN)
14869             indent++;
14870     }
14871     CLEAR_OPTSTART;
14872 #ifdef DEBUG_DUMPUNTIL
14873     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14874 #endif
14875     return node;
14876 }
14877
14878 #endif  /* DEBUGGING */
14879
14880 /*
14881  * Local variables:
14882  * c-indentation-style: bsd
14883  * c-basic-offset: 4
14884  * indent-tabs-mode: nil
14885  * End:
14886  *
14887  * ex: set ts=8 sts=4 sw=4 et:
14888  */