regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112
 113 typedef struct RExC_state_t {
 114     U32         flags;                  /* RXf_* are we folding, multilining? */
 115     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 116     char        *precomp;               /* uncompiled string. */
 117     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 118     regexp      *rx;                    /* perl core regexp structure */
 119     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 120     char        *start;                 /* Start of input for compile */
 121     char        *end;                   /* End of input for compile */
 122     char        *parse;                 /* Input-scan pointer. */
 123     I32         whilem_seen;            /* number of WHILEM in this expr */
 124     regnode     *emit_start;            /* Start of emitted-code area */
 125     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 126     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     I32         size;                   /* Code size. */
 131     I32         npar;                   /* Capture buffer count, (OPEN). */
 132     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 133     I32         nestroot;               /* root parens we are in - used by accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     I32         seen_evals;
 137     regnode     **open_parens;          /* pointers to open parens */
 138     regnode     **close_parens;         /* pointers to close parens */
 139     regnode     *opend;                 /* END node in program */
 140     I32         utf8;           /* whether the pattern is utf8 or not */
 141     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 142                                 /* XXX use this for future optimisation of case
 143                                  * where pattern must be upgraded to utf8. */
 144     I32         uni_semantics;  /* If a d charset modifier should use unicode
 145                                    rules, even if the pattern is not in
 146                                    utf8 */
 147     HV          *paren_names;           /* Paren names */
 148
 149     regnode     **recurse;              /* Recurse regops */
 150     I32         recurse_count;          /* Number of recurse regops */
 151     I32         in_lookbehind;
 152     I32         contains_locale;
 153     I32         override_recoding;
 154     struct reg_code_block *code_blocks; /* positions of literal (?{})
 155                                             within pattern */
 156     int         num_code_blocks;        /* size of code_blocks[] */
 157     int         code_index;             /* next code_blocks[] slot */
 158 #if ADD_TO_REGEXEC
 159     char        *starttry;              /* -Dr: where regtry was called. */
 160 #define RExC_starttry   (pRExC_state->starttry)
 161 #endif
 162 #ifdef DEBUGGING
 163     const char  *lastparse;
 164     I32         lastnum;
 165     AV          *paren_name_list;       /* idx -> name */
 166 #define RExC_lastparse  (pRExC_state->lastparse)
 167 #define RExC_lastnum    (pRExC_state->lastnum)
 168 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 169 #endif
 170 } RExC_state_t;
 171
 172 #define RExC_flags      (pRExC_state->flags)
 173 #define RExC_pm_flags   (pRExC_state->pm_flags)
 174 #define RExC_precomp    (pRExC_state->precomp)
 175 #define RExC_rx_sv      (pRExC_state->rx_sv)
 176 #define RExC_rx         (pRExC_state->rx)
 177 #define RExC_rxi        (pRExC_state->rxi)
 178 #define RExC_start      (pRExC_state->start)
 179 #define RExC_end        (pRExC_state->end)
 180 #define RExC_parse      (pRExC_state->parse)
 181 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 182 #ifdef RE_TRACK_PATTERN_OFFSETS
 183 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 184 #endif
 185 #define RExC_emit       (pRExC_state->emit)
 186 #define RExC_emit_start (pRExC_state->emit_start)
 187 #define RExC_emit_bound (pRExC_state->emit_bound)
 188 #define RExC_naughty    (pRExC_state->naughty)
 189 #define RExC_sawback    (pRExC_state->sawback)
 190 #define RExC_seen       (pRExC_state->seen)
 191 #define RExC_size       (pRExC_state->size)
 192 #define RExC_npar       (pRExC_state->npar)
 193 #define RExC_nestroot   (pRExC_state->nestroot)
 194 #define RExC_extralen   (pRExC_state->extralen)
 195 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 196 #define RExC_seen_evals (pRExC_state->seen_evals)
 197 #define RExC_utf8       (pRExC_state->utf8)
 198 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 199 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 200 #define RExC_open_parens        (pRExC_state->open_parens)
 201 #define RExC_close_parens       (pRExC_state->close_parens)
 202 #define RExC_opend      (pRExC_state->opend)
 203 #define RExC_paren_names        (pRExC_state->paren_names)
 204 #define RExC_recurse    (pRExC_state->recurse)
 205 #define RExC_recurse_count      (pRExC_state->recurse_count)
 206 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 207 #define RExC_contains_locale    (pRExC_state->contains_locale)
 208 #define RExC_override_recoding  (pRExC_state->override_recoding)
 209
 210
 211 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 212 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 213         ((*s) == '{' && regcurly(s)))
 214
 215 #ifdef SPSTART
 216 #undef SPSTART          /* dratted cpp namespace... */
 217 #endif
 218 /*
 219  * Flags to be passed up and down.
 220  */
 221 #define WORST           0       /* Worst case. */
 222 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 223
 224 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 225  * character, and if utf8, must be invariant.  Note that this is not the same
 226  * thing as REGNODE_SIMPLE */
 227 #define SIMPLE          0x02
 228 #define SPSTART         0x04    /* Starts with * or +. */
 229 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 230 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 231
 232 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 233
 234 /* whether trie related optimizations are enabled */
 235 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 236 #define TRIE_STUDY_OPT
 237 #define FULL_TRIE_STUDY
 238 #define TRIE_STCLASS
 239 #endif
 240
 241
 242
 243 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 244 #define PBITVAL(paren) (1 << ((paren) & 7))
 245 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 246 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 247 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 248
 249 /* If not already in utf8, do a longjmp back to the beginning */
 250 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 251 #define REQUIRE_UTF8    STMT_START {                                       \
 252                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 253                         } STMT_END
 254
 255 /* About scan_data_t.
 256
 257   During optimisation we recurse through the regexp program performing
 258   various inplace (keyhole style) optimisations. In addition study_chunk
 259   and scan_commit populate this data structure with information about
 260   what strings MUST appear in the pattern. We look for the longest
 261   string that must appear at a fixed location, and we look for the
 262   longest string that may appear at a floating location. So for instance
 263   in the pattern:
 264
 265     /FOO[xX]A.*B[xX]BAR/
 266
 267   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 268   strings (because they follow a .* construct). study_chunk will identify
 269   both FOO and BAR as being the longest fixed and floating strings respectively.
 270
 271   The strings can be composites, for instance
 272
 273      /(f)(o)(o)/
 274
 275   will result in a composite fixed substring 'foo'.
 276
 277   For each string some basic information is maintained:
 278
 279   - offset or min_offset
 280     This is the position the string must appear at, or not before.
 281     It also implicitly (when combined with minlenp) tells us how many
 282     characters must match before the string we are searching for.
 283     Likewise when combined with minlenp and the length of the string it
 284     tells us how many characters must appear after the string we have
 285     found.
 286
 287   - max_offset
 288     Only used for floating strings. This is the rightmost point that
 289     the string can appear at. If set to I32 max it indicates that the
 290     string can occur infinitely far to the right.
 291
 292   - minlenp
 293     A pointer to the minimum length of the pattern that the string
 294     was found inside. This is important as in the case of positive
 295     lookahead or positive lookbehind we can have multiple patterns
 296     involved. Consider
 297
 298     /(?=FOO).*F/
 299
 300     The minimum length of the pattern overall is 3, the minimum length
 301     of the lookahead part is 3, but the minimum length of the part that
 302     will actually match is 1. So 'FOO's minimum length is 3, but the
 303     minimum length for the F is 1. This is important as the minimum length
 304     is used to determine offsets in front of and behind the string being
 305     looked for.  Since strings can be composites this is the length of the
 306     pattern at the time it was committed with a scan_commit. Note that
 307     the length is calculated by study_chunk, so that the minimum lengths
 308     are not known until the full pattern has been compiled, thus the
 309     pointer to the value.
 310
 311   - lookbehind
 312
 313     In the case of lookbehind the string being searched for can be
 314     offset past the start point of the final matching string.
 315     If this value was just blithely removed from the min_offset it would
 316     invalidate some of the calculations for how many chars must match
 317     before or after (as they are derived from min_offset and minlen and
 318     the length of the string being searched for).
 319     When the final pattern is compiled and the data is moved from the
 320     scan_data_t structure into the regexp structure the information
 321     about lookbehind is factored in, with the information that would
 322     have been lost precalculated in the end_shift field for the
 323     associated string.
 324
 325   The fields pos_min and pos_delta are used to store the minimum offset
 326   and the delta to the maximum offset at the current point in the pattern.
 327
 328 */
 329
 330 typedef struct scan_data_t {
 331     /*I32 len_min;      unused */
 332     /*I32 len_delta;    unused */
 333     I32 pos_min;
 334     I32 pos_delta;
 335     SV *last_found;
 336     I32 last_end;           /* min value, <0 unless valid. */
 337     I32 last_start_min;
 338     I32 last_start_max;
 339     SV **longest;           /* Either &l_fixed, or &l_float. */
 340     SV *longest_fixed;      /* longest fixed string found in pattern */
 341     I32 offset_fixed;       /* offset where it starts */
 342     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 343     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 344     SV *longest_float;      /* longest floating string found in pattern */
 345     I32 offset_float_min;   /* earliest point in string it can appear */
 346     I32 offset_float_max;   /* latest point in string it can appear */
 347     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 348     I32 lookbehind_float;   /* is the position of the string modified by LB */
 349     I32 flags;
 350     I32 whilem_c;
 351     I32 *last_closep;
 352     struct regnode_charclass_class *start_class;
 353 } scan_data_t;
 354
 355 /*
 356  * Forward declarations for pregcomp()'s friends.
 357  */
 358
 359 static const scan_data_t zero_scan_data =
 360   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 361
 362 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 363 #define SF_BEFORE_SEOL          0x0001
 364 #define SF_BEFORE_MEOL          0x0002
 365 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 366 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 367
 368 #ifdef NO_UNARY_PLUS
 369 #  define SF_FIX_SHIFT_EOL      (0+2)
 370 #  define SF_FL_SHIFT_EOL               (0+4)
 371 #else
 372 #  define SF_FIX_SHIFT_EOL      (+2)
 373 #  define SF_FL_SHIFT_EOL               (+4)
 374 #endif
 375
 376 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 377 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 378
 379 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 380 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 381 #define SF_IS_INF               0x0040
 382 #define SF_HAS_PAR              0x0080
 383 #define SF_IN_PAR               0x0100
 384 #define SF_HAS_EVAL             0x0200
 385 #define SCF_DO_SUBSTR           0x0400
 386 #define SCF_DO_STCLASS_AND      0x0800
 387 #define SCF_DO_STCLASS_OR       0x1000
 388 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 389 #define SCF_WHILEM_VISITED_POS  0x2000
 390
 391 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 392 #define SCF_SEEN_ACCEPT         0x8000
 393
 394 #define UTF cBOOL(RExC_utf8)
 395
 396 /* The enums for all these are ordered so things work out correctly */
 397 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 398 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 399 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 400 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 401 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 402 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 403 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 404
 405 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 406
 407 #define OOB_UNICODE             12345678
 408 #define OOB_NAMEDCLASS          -1
 409
 410 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 411 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 412
 413
 414 /* length of regex to show in messages that don't mark a position within */
 415 #define RegexLengthToShowInErrorMessages 127
 416
 417 /*
 418  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 419  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 420  * op/pragma/warn/regcomp.
 421  */
 422 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 423 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 424
 425 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 426
 427 /*
 428  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 429  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 430  * "...".
 431  */
 432 #define _FAIL(code) STMT_START {                                        \
 433     const char *ellipses = "";                                          \
 434     IV len = RExC_end - RExC_precomp;                                   \
 435                                                                         \
 436     if (!SIZE_ONLY)                                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 438     if (len > RegexLengthToShowInErrorMessages) {                       \
 439         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 440         len = RegexLengthToShowInErrorMessages - 10;                    \
 441         ellipses = "...";                                               \
 442     }                                                                   \
 443     code;                                                               \
 444 } STMT_END
 445
 446 #define FAIL(msg) _FAIL(                            \
 447     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 448             msg, (int)len, RExC_precomp, ellipses))
 449
 450 #define FAIL2(msg,arg) _FAIL(                       \
 451     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 452             arg, (int)len, RExC_precomp, ellipses))
 453
 454 /*
 455  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 456  */
 457 #define Simple_vFAIL(m) STMT_START {                                    \
 458     const IV offset = RExC_parse - RExC_precomp;                        \
 459     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 460             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 461 } STMT_END
 462
 463 /*
 464  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 465  */
 466 #define vFAIL(m) STMT_START {                           \
 467     if (!SIZE_ONLY)                                     \
 468         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 469     Simple_vFAIL(m);                                    \
 470 } STMT_END
 471
 472 /*
 473  * Like Simple_vFAIL(), but accepts two arguments.
 474  */
 475 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 476     const IV offset = RExC_parse - RExC_precomp;                        \
 477     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 478             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 479 } STMT_END
 480
 481 /*
 482  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 483  */
 484 #define vFAIL2(m,a1) STMT_START {                       \
 485     if (!SIZE_ONLY)                                     \
 486         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 487     Simple_vFAIL2(m, a1);                               \
 488 } STMT_END
 489
 490
 491 /*
 492  * Like Simple_vFAIL(), but accepts three arguments.
 493  */
 494 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 495     const IV offset = RExC_parse - RExC_precomp;                \
 496     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 498 } STMT_END
 499
 500 /*
 501  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 502  */
 503 #define vFAIL3(m,a1,a2) STMT_START {                    \
 504     if (!SIZE_ONLY)                                     \
 505         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 506     Simple_vFAIL3(m, a1, a2);                           \
 507 } STMT_END
 508
 509 /*
 510  * Like Simple_vFAIL(), but accepts four arguments.
 511  */
 512 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 513     const IV offset = RExC_parse - RExC_precomp;                \
 514     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 515             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define ckWARNreg(loc,m) STMT_START {                                   \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 521             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 522 } STMT_END
 523
 524 #define ckWARNregdep(loc,m) STMT_START {                                \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 527             m REPORT_LOCATION,                                          \
 528             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 529 } STMT_END
 530
 531 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 534             m REPORT_LOCATION,                                          \
 535             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 536 } STMT_END
 537
 538 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 539     const IV offset = loc - RExC_precomp;                               \
 540     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 541             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 542 } STMT_END
 543
 544 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 547             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 548 } STMT_END
 549
 550 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 551     const IV offset = loc - RExC_precomp;                               \
 552     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 553             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 554 } STMT_END
 555
 556 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 557     const IV offset = loc - RExC_precomp;                               \
 558     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 559             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 560 } STMT_END
 561
 562 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 563     const IV offset = loc - RExC_precomp;                               \
 564     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 565             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 566 } STMT_END
 567
 568 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 569     const IV offset = loc - RExC_precomp;                               \
 570     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 571             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 572 } STMT_END
 573
 574
 575 /* Allow for side effects in s */
 576 #define REGC(c,s) STMT_START {                  \
 577     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 578 } STMT_END
 579
 580 /* Macros for recording node offsets.   20001227 mjd@plover.com
 581  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 582  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 583  * Element 0 holds the number n.
 584  * Position is 1 indexed.
 585  */
 586 #ifndef RE_TRACK_PATTERN_OFFSETS
 587 #define Set_Node_Offset_To_R(node,byte)
 588 #define Set_Node_Offset(node,byte)
 589 #define Set_Cur_Node_Offset
 590 #define Set_Node_Length_To_R(node,len)
 591 #define Set_Node_Length(node,len)
 592 #define Set_Node_Cur_Length(node)
 593 #define Node_Offset(n)
 594 #define Node_Length(n)
 595 #define Set_Node_Offset_Length(node,offset,len)
 596 #define ProgLen(ri) ri->u.proglen
 597 #define SetProgLen(ri,x) ri->u.proglen = x
 598 #else
 599 #define ProgLen(ri) ri->u.offsets[0]
 600 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 601 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 602     if (! SIZE_ONLY) {                                                  \
 603         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 604                     __LINE__, (int)(node), (int)(byte)));               \
 605         if((node) < 0) {                                                \
 606             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 607         } else {                                                        \
 608             RExC_offsets[2*(node)-1] = (byte);                          \
 609         }                                                               \
 610     }                                                                   \
 611 } STMT_END
 612
 613 #define Set_Node_Offset(node,byte) \
 614     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 615 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 616
 617 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 618     if (! SIZE_ONLY) {                                                  \
 619         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 620                 __LINE__, (int)(node), (int)(len)));                    \
 621         if((node) < 0) {                                                \
 622             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 623         } else {                                                        \
 624             RExC_offsets[2*(node)] = (len);                             \
 625         }                                                               \
 626     }                                                                   \
 627 } STMT_END
 628
 629 #define Set_Node_Length(node,len) \
 630     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 631 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 632 #define Set_Node_Cur_Length(node) \
 633     Set_Node_Length(node, RExC_parse - parse_start)
 634
 635 /* Get offsets and lengths */
 636 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 637 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 638
 639 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 640     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 641     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 642 } STMT_END
 643 #endif
 644
 645 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 646 #define EXPERIMENTAL_INPLACESCAN
 647 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 648
 649 #define DEBUG_STUDYDATA(str,data,depth)                              \
 650 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 651     PerlIO_printf(Perl_debug_log,                                    \
 652         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 653         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 654         (int)(depth)*2, "",                                          \
 655         (IV)((data)->pos_min),                                       \
 656         (IV)((data)->pos_delta),                                     \
 657         (UV)((data)->flags),                                         \
 658         (IV)((data)->whilem_c),                                      \
 659         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 660         is_inf ? "INF " : ""                                         \
 661     );                                                               \
 662     if ((data)->last_found)                                          \
 663         PerlIO_printf(Perl_debug_log,                                \
 664             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 665             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 666             SvPVX_const((data)->last_found),                         \
 667             (IV)((data)->last_end),                                  \
 668             (IV)((data)->last_start_min),                            \
 669             (IV)((data)->last_start_max),                            \
 670             ((data)->longest &&                                      \
 671              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 672             SvPVX_const((data)->longest_fixed),                      \
 673             (IV)((data)->offset_fixed),                              \
 674             ((data)->longest &&                                      \
 675              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 676             SvPVX_const((data)->longest_float),                      \
 677             (IV)((data)->offset_float_min),                          \
 678             (IV)((data)->offset_float_max)                           \
 679         );                                                           \
 680     PerlIO_printf(Perl_debug_log,"\n");                              \
 681 });
 682
 683 static void clear_re(pTHX_ void *r);
 684
 685 /* Mark that we cannot extend a found fixed substring at this point.
 686    Update the longest found anchored substring and the longest found
 687    floating substrings if needed. */
 688
 689 STATIC void
 690 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 691 {
 692     const STRLEN l = CHR_SVLEN(data->last_found);
 693     const STRLEN old_l = CHR_SVLEN(*data->longest);
 694     GET_RE_DEBUG_FLAGS_DECL;
 695
 696     PERL_ARGS_ASSERT_SCAN_COMMIT;
 697
 698     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 699         SvSetMagicSV(*data->longest, data->last_found);
 700         if (*data->longest == data->longest_fixed) {
 701             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 702             if (data->flags & SF_BEFORE_EOL)
 703                 data->flags
 704                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 705             else
 706                 data->flags &= ~SF_FIX_BEFORE_EOL;
 707             data->minlen_fixed=minlenp;
 708             data->lookbehind_fixed=0;
 709         }
 710         else { /* *data->longest == data->longest_float */
 711             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 712             data->offset_float_max = (l
 713                                       ? data->last_start_max
 714                                       : data->pos_min + data->pos_delta);
 715             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 716                 data->offset_float_max = I32_MAX;
 717             if (data->flags & SF_BEFORE_EOL)
 718                 data->flags
 719                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 720             else
 721                 data->flags &= ~SF_FL_BEFORE_EOL;
 722             data->minlen_float=minlenp;
 723             data->lookbehind_float=0;
 724         }
 725     }
 726     SvCUR_set(data->last_found, 0);
 727     {
 728         SV * const sv = data->last_found;
 729         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 730             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 731             if (mg)
 732                 mg->mg_len = 0;
 733         }
 734     }
 735     data->last_end = -1;
 736     data->flags &= ~SF_BEFORE_EOL;
 737     DEBUG_STUDYDATA("commit: ",data,0);
 738 }
 739
 740 /* Can match anything (initialization) */
 741 STATIC void
 742 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 743 {
 744     PERL_ARGS_ASSERT_CL_ANYTHING;
 745
 746     ANYOF_BITMAP_SETALL(cl);
 747     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 748                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 749
 750     /* If any portion of the regex is to operate under locale rules,
 751      * initialization includes it.  The reason this isn't done for all regexes
 752      * is that the optimizer was written under the assumption that locale was
 753      * all-or-nothing.  Given the complexity and lack of documentation in the
 754      * optimizer, and that there are inadequate test cases for locale, so many
 755      * parts of it may not work properly, it is safest to avoid locale unless
 756      * necessary. */
 757     if (RExC_contains_locale) {
 758         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 759         cl->flags |= ANYOF_LOCALE;
 760     }
 761     else {
 762         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 763     }
 764 }
 765
 766 /* Can match anything (initialization) */
 767 STATIC int
 768 S_cl_is_anything(const struct regnode_charclass_class *cl)
 769 {
 770     int value;
 771
 772     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 773
 774     for (value = 0; value <= ANYOF_MAX; value += 2)
 775         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 776             return 1;
 777     if (!(cl->flags & ANYOF_UNICODE_ALL))
 778         return 0;
 779     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 780         return 0;
 781     return 1;
 782 }
 783
 784 /* Can match anything (initialization) */
 785 STATIC void
 786 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 787 {
 788     PERL_ARGS_ASSERT_CL_INIT;
 789
 790     Zero(cl, 1, struct regnode_charclass_class);
 791     cl->type = ANYOF;
 792     cl_anything(pRExC_state, cl);
 793     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 794 }
 795
 796 /* These two functions currently do the exact same thing */
 797 #define cl_init_zero            S_cl_init
 798
 799 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 800  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 801  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 802 STATIC void
 803 S_cl_and(struct regnode_charclass_class *cl,
 804         const struct regnode_charclass_class *and_with)
 805 {
 806     PERL_ARGS_ASSERT_CL_AND;
 807
 808     assert(and_with->type == ANYOF);
 809
 810     /* I (khw) am not sure all these restrictions are necessary XXX */
 811     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 812         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 813         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 814         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 815         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 816         int i;
 817
 818         if (and_with->flags & ANYOF_INVERT)
 819             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 820                 cl->bitmap[i] &= ~and_with->bitmap[i];
 821         else
 822             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 823                 cl->bitmap[i] &= and_with->bitmap[i];
 824     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 825
 826     if (and_with->flags & ANYOF_INVERT) {
 827
 828         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 829          * aren't affected by the inversion.  Those that are affected are
 830          * handled individually below */
 831         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 832         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 833         cl->flags |= affected_flags;
 834
 835         /* We currently don't know how to deal with things that aren't in the
 836          * bitmap, but we know that the intersection is no greater than what
 837          * is already in cl, so let there be false positives that get sorted
 838          * out after the synthetic start class succeeds, and the node is
 839          * matched for real. */
 840
 841         /* The inversion of these two flags indicate that the resulting
 842          * intersection doesn't have them */
 843         if (and_with->flags & ANYOF_UNICODE_ALL) {
 844             cl->flags &= ~ANYOF_UNICODE_ALL;
 845         }
 846         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 847             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 848         }
 849     }
 850     else {   /* and'd node is not inverted */
 851         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 852
 853         if (! ANYOF_NONBITMAP(and_with)) {
 854
 855             /* Here 'and_with' doesn't match anything outside the bitmap
 856              * (except possibly ANYOF_UNICODE_ALL), which means the
 857              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 858              * which case we don't know what the intersection is, but it's no
 859              * greater than what cl already has, so can just leave it alone,
 860              * with possible false positives */
 861             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 862                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 863                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 864             }
 865         }
 866         else if (! ANYOF_NONBITMAP(cl)) {
 867
 868             /* Here, 'and_with' does match something outside the bitmap, and cl
 869              * doesn't have a list of things to match outside the bitmap.  If
 870              * cl can match all code points above 255, the intersection will
 871              * be those above-255 code points that 'and_with' matches.  If cl
 872              * can't match all Unicode code points, it means that it can't
 873              * match anything outside the bitmap (since the 'if' that got us
 874              * into this block tested for that), so we leave the bitmap empty.
 875              */
 876             if (cl->flags & ANYOF_UNICODE_ALL) {
 877                 ARG_SET(cl, ARG(and_with));
 878
 879                 /* and_with's ARG may match things that don't require UTF8.
 880                  * And now cl's will too, in spite of this being an 'and'.  See
 881                  * the comments below about the kludge */
 882                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 883             }
 884         }
 885         else {
 886             /* Here, both 'and_with' and cl match something outside the
 887              * bitmap.  Currently we do not do the intersection, so just match
 888              * whatever cl had at the beginning.  */
 889         }
 890
 891
 892         /* Take the intersection of the two sets of flags.  However, the
 893          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 894          * kludge around the fact that this flag is not treated like the others
 895          * which are initialized in cl_anything().  The way the optimizer works
 896          * is that the synthetic start class (SSC) is initialized to match
 897          * anything, and then the first time a real node is encountered, its
 898          * values are AND'd with the SSC's with the result being the values of
 899          * the real node.  However, there are paths through the optimizer where
 900          * the AND never gets called, so those initialized bits are set
 901          * inappropriately, which is not usually a big deal, as they just cause
 902          * false positives in the SSC, which will just mean a probably
 903          * imperceptible slow down in execution.  However this bit has a
 904          * higher false positive consequence in that it can cause utf8.pm,
 905          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 906          * bigger slowdown and also causes significant extra memory to be used.
 907          * In order to prevent this, the code now takes a different tack.  The
 908          * bit isn't set unless some part of the regular expression needs it,
 909          * but once set it won't get cleared.  This means that these extra
 910          * modules won't get loaded unless there was some path through the
 911          * pattern that would have required them anyway, and  so any false
 912          * positives that occur by not ANDing them out when they could be
 913          * aren't as severe as they would be if we treated this bit like all
 914          * the others */
 915         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 916                                       & ANYOF_NONBITMAP_NON_UTF8;
 917         cl->flags &= and_with->flags;
 918         cl->flags |= outside_bitmap_but_not_utf8;
 919     }
 920 }
 921
 922 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 923  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 924  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 925 STATIC void
 926 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 927 {
 928     PERL_ARGS_ASSERT_CL_OR;
 929
 930     if (or_with->flags & ANYOF_INVERT) {
 931
 932         /* Here, the or'd node is to be inverted.  This means we take the
 933          * complement of everything not in the bitmap, but currently we don't
 934          * know what that is, so give up and match anything */
 935         if (ANYOF_NONBITMAP(or_with)) {
 936             cl_anything(pRExC_state, cl);
 937         }
 938         /* We do not use
 939          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 940          *   <= (B1 | !B2) | (CL1 | !CL2)
 941          * which is wasteful if CL2 is small, but we ignore CL2:
 942          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 943          * XXXX Can we handle case-fold?  Unclear:
 944          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 945          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 946          */
 947         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 948              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 949              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 950             int i;
 951
 952             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 953                 cl->bitmap[i] |= ~or_with->bitmap[i];
 954         } /* XXXX: logic is complicated otherwise */
 955         else {
 956             cl_anything(pRExC_state, cl);
 957         }
 958
 959         /* And, we can just take the union of the flags that aren't affected
 960          * by the inversion */
 961         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 962
 963         /* For the remaining flags:
 964             ANYOF_UNICODE_ALL and inverted means to not match anything above
 965                     255, which means that the union with cl should just be
 966                     what cl has in it, so can ignore this flag
 967             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 968                     is 127-255 to match them, but then invert that, so the
 969                     union with cl should just be what cl has in it, so can
 970                     ignore this flag
 971          */
 972     } else {    /* 'or_with' is not inverted */
 973         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 974         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 975              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 976                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 977             int i;
 978
 979             /* OR char bitmap and class bitmap separately */
 980             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 981                 cl->bitmap[i] |= or_with->bitmap[i];
 982             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 983                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 984                     cl->classflags[i] |= or_with->classflags[i];
 985                 cl->flags |= ANYOF_CLASS;
 986             }
 987         }
 988         else { /* XXXX: logic is complicated, leave it along for a moment. */
 989             cl_anything(pRExC_state, cl);
 990         }
 991
 992         if (ANYOF_NONBITMAP(or_with)) {
 993
 994             /* Use the added node's outside-the-bit-map match if there isn't a
 995              * conflict.  If there is a conflict (both nodes match something
 996              * outside the bitmap, but what they match outside is not the same
 997              * pointer, and hence not easily compared until XXX we extend
 998              * inversion lists this far), give up and allow the start class to
 999              * match everything outside the bitmap.  If that stuff is all above
1000              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1001             if (! ANYOF_NONBITMAP(cl)) {
1002                 ARG_SET(cl, ARG(or_with));
1003             }
1004             else if (ARG(cl) != ARG(or_with)) {
1005
1006                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1007                     cl_anything(pRExC_state, cl);
1008                 }
1009                 else {
1010                     cl->flags |= ANYOF_UNICODE_ALL;
1011                 }
1012             }
1013         }
1014
1015         /* Take the union */
1016         cl->flags |= or_with->flags;
1017     }
1018 }
1019
1020 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1021 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1022 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1023 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1024
1025
1026 #ifdef DEBUGGING
1027 /*
1028    dump_trie(trie,widecharmap,revcharmap)
1029    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1030    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1031
1032    These routines dump out a trie in a somewhat readable format.
1033    The _interim_ variants are used for debugging the interim
1034    tables that are used to generate the final compressed
1035    representation which is what dump_trie expects.
1036
1037    Part of the reason for their existence is to provide a form
1038    of documentation as to how the different representations function.
1039
1040 */
1041
1042 /*
1043   Dumps the final compressed table form of the trie to Perl_debug_log.
1044   Used for debugging make_trie().
1045 */
1046
1047 STATIC void
1048 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1049             AV *revcharmap, U32 depth)
1050 {
1051     U32 state;
1052     SV *sv=sv_newmortal();
1053     int colwidth= widecharmap ? 6 : 4;
1054     U16 word;
1055     GET_RE_DEBUG_FLAGS_DECL;
1056
1057     PERL_ARGS_ASSERT_DUMP_TRIE;
1058
1059     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1060         (int)depth * 2 + 2,"",
1061         "Match","Base","Ofs" );
1062
1063     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1064         SV ** const tmp = av_fetch( revcharmap, state, 0);
1065         if ( tmp ) {
1066             PerlIO_printf( Perl_debug_log, "%*s",
1067                 colwidth,
1068                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1069                             PL_colors[0], PL_colors[1],
1070                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1071                             PERL_PV_ESCAPE_FIRSTCHAR
1072                 )
1073             );
1074         }
1075     }
1076     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1077         (int)depth * 2 + 2,"");
1078
1079     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1080         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1081     PerlIO_printf( Perl_debug_log, "\n");
1082
1083     for( state = 1 ; state < trie->statecount ; state++ ) {
1084         const U32 base = trie->states[ state ].trans.base;
1085
1086         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1087
1088         if ( trie->states[ state ].wordnum ) {
1089             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1090         } else {
1091             PerlIO_printf( Perl_debug_log, "%6s", "" );
1092         }
1093
1094         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1095
1096         if ( base ) {
1097             U32 ofs = 0;
1098
1099             while( ( base + ofs  < trie->uniquecharcount ) ||
1100                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1101                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1102                     ofs++;
1103
1104             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1105
1106             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1107                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1108                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1109                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1110                 {
1111                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1112                     colwidth,
1113                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1114                 } else {
1115                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1116                 }
1117             }
1118
1119             PerlIO_printf( Perl_debug_log, "]");
1120
1121         }
1122         PerlIO_printf( Perl_debug_log, "\n" );
1123     }
1124     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1125     for (word=1; word <= trie->wordcount; word++) {
1126         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1127             (int)word, (int)(trie->wordinfo[word].prev),
1128             (int)(trie->wordinfo[word].len));
1129     }
1130     PerlIO_printf(Perl_debug_log, "\n" );
1131 }
1132 /*
1133   Dumps a fully constructed but uncompressed trie in list form.
1134   List tries normally only are used for construction when the number of
1135   possible chars (trie->uniquecharcount) is very high.
1136   Used for debugging make_trie().
1137 */
1138 STATIC void
1139 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1140                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1141                          U32 depth)
1142 {
1143     U32 state;
1144     SV *sv=sv_newmortal();
1145     int colwidth= widecharmap ? 6 : 4;
1146     GET_RE_DEBUG_FLAGS_DECL;
1147
1148     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1149
1150     /* print out the table precompression.  */
1151     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1152         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1153         "------:-----+-----------------\n" );
1154
1155     for( state=1 ; state < next_alloc ; state ++ ) {
1156         U16 charid;
1157
1158         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1159             (int)depth * 2 + 2,"", (UV)state  );
1160         if ( ! trie->states[ state ].wordnum ) {
1161             PerlIO_printf( Perl_debug_log, "%5s| ","");
1162         } else {
1163             PerlIO_printf( Perl_debug_log, "W%4x| ",
1164                 trie->states[ state ].wordnum
1165             );
1166         }
1167         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1168             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1169             if ( tmp ) {
1170                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1171                     colwidth,
1172                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1173                             PL_colors[0], PL_colors[1],
1174                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1175                             PERL_PV_ESCAPE_FIRSTCHAR
1176                     ) ,
1177                     TRIE_LIST_ITEM(state,charid).forid,
1178                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1179                 );
1180                 if (!(charid % 10))
1181                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1182                         (int)((depth * 2) + 14), "");
1183             }
1184         }
1185         PerlIO_printf( Perl_debug_log, "\n");
1186     }
1187 }
1188
1189 /*
1190   Dumps a fully constructed but uncompressed trie in table form.
1191   This is the normal DFA style state transition table, with a few
1192   twists to facilitate compression later.
1193   Used for debugging make_trie().
1194 */
1195 STATIC void
1196 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1197                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1198                           U32 depth)
1199 {
1200     U32 state;
1201     U16 charid;
1202     SV *sv=sv_newmortal();
1203     int colwidth= widecharmap ? 6 : 4;
1204     GET_RE_DEBUG_FLAGS_DECL;
1205
1206     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1207
1208     /*
1209        print out the table precompression so that we can do a visual check
1210        that they are identical.
1211      */
1212
1213     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1214
1215     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1216         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1217         if ( tmp ) {
1218             PerlIO_printf( Perl_debug_log, "%*s",
1219                 colwidth,
1220                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1221                             PL_colors[0], PL_colors[1],
1222                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1223                             PERL_PV_ESCAPE_FIRSTCHAR
1224                 )
1225             );
1226         }
1227     }
1228
1229     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1230
1231     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1232         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1233     }
1234
1235     PerlIO_printf( Perl_debug_log, "\n" );
1236
1237     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1238
1239         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1240             (int)depth * 2 + 2,"",
1241             (UV)TRIE_NODENUM( state ) );
1242
1243         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1244             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1245             if (v)
1246                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1247             else
1248                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1249         }
1250         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1251             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1252         } else {
1253             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1254             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1255         }
1256     }
1257 }
1258
1259 #endif
1260
1261
1262 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1263   startbranch: the first branch in the whole branch sequence
1264   first      : start branch of sequence of branch-exact nodes.
1265                May be the same as startbranch
1266   last       : Thing following the last branch.
1267                May be the same as tail.
1268   tail       : item following the branch sequence
1269   count      : words in the sequence
1270   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1271   depth      : indent depth
1272
1273 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1274
1275 A trie is an N'ary tree where the branches are determined by digital
1276 decomposition of the key. IE, at the root node you look up the 1st character and
1277 follow that branch repeat until you find the end of the branches. Nodes can be
1278 marked as "accepting" meaning they represent a complete word. Eg:
1279
1280   /he|she|his|hers/
1281
1282 would convert into the following structure. Numbers represent states, letters
1283 following numbers represent valid transitions on the letter from that state, if
1284 the number is in square brackets it represents an accepting state, otherwise it
1285 will be in parenthesis.
1286
1287       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1288       |    |
1289       |   (2)
1290       |    |
1291      (1)   +-i->(6)-+-s->[7]
1292       |
1293       +-s->(3)-+-h->(4)-+-e->[5]
1294
1295       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1296
1297 This shows that when matching against the string 'hers' we will begin at state 1
1298 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1299 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1300 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1301 single traverse. We store a mapping from accepting to state to which word was
1302 matched, and then when we have multiple possibilities we try to complete the
1303 rest of the regex in the order in which they occured in the alternation.
1304
1305 The only prior NFA like behaviour that would be changed by the TRIE support is
1306 the silent ignoring of duplicate alternations which are of the form:
1307
1308  / (DUPE|DUPE) X? (?{ ... }) Y /x
1309
1310 Thus EVAL blocks following a trie may be called a different number of times with
1311 and without the optimisation. With the optimisations dupes will be silently
1312 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1313 the following demonstrates:
1314
1315  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1316
1317 which prints out 'word' three times, but
1318
1319  'words'=~/(word|word|word)(?{ print $1 })S/
1320
1321 which doesnt print it out at all. This is due to other optimisations kicking in.
1322
1323 Example of what happens on a structural level:
1324
1325 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1326
1327    1: CURLYM[1] {1,32767}(18)
1328    5:   BRANCH(8)
1329    6:     EXACT <ac>(16)
1330    8:   BRANCH(11)
1331    9:     EXACT <ad>(16)
1332   11:   BRANCH(14)
1333   12:     EXACT <ab>(16)
1334   16:   SUCCEED(0)
1335   17:   NOTHING(18)
1336   18: END(0)
1337
1338 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1339 and should turn into:
1340
1341    1: CURLYM[1] {1,32767}(18)
1342    5:   TRIE(16)
1343         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1344           <ac>
1345           <ad>
1346           <ab>
1347   16:   SUCCEED(0)
1348   17:   NOTHING(18)
1349   18: END(0)
1350
1351 Cases where tail != last would be like /(?foo|bar)baz/:
1352
1353    1: BRANCH(4)
1354    2:   EXACT <foo>(8)
1355    4: BRANCH(7)
1356    5:   EXACT <bar>(8)
1357    7: TAIL(8)
1358    8: EXACT <baz>(10)
1359   10: END(0)
1360
1361 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1362 and would end up looking like:
1363
1364     1: TRIE(8)
1365       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1366         <foo>
1367         <bar>
1368    7: TAIL(8)
1369    8: EXACT <baz>(10)
1370   10: END(0)
1371
1372     d = uvuni_to_utf8_flags(d, uv, 0);
1373
1374 is the recommended Unicode-aware way of saying
1375
1376     *(d++) = uv;
1377 */
1378
1379 #define TRIE_STORE_REVCHAR(val)                                            \
1380     STMT_START {                                                           \
1381         if (UTF) {                                                         \
1382             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1383             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1384             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1385             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1386             SvPOK_on(zlopp);                                               \
1387             SvUTF8_on(zlopp);                                              \
1388             av_push(revcharmap, zlopp);                                    \
1389         } else {                                                           \
1390             char ooooff = (char)val;                                           \
1391             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1392         }                                                                  \
1393         } STMT_END
1394
1395 #define TRIE_READ_CHAR STMT_START {                                                     \
1396     wordlen++;                                                                          \
1397     if ( UTF ) {                                                                        \
1398         /* if it is UTF then it is either already folded, or does not need folding */   \
1399         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1400     }                                                                                   \
1401     else if (folder == PL_fold_latin1) {                                                \
1402         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1403         if ( foldlen > 0 ) {                                                            \
1404            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1405            foldlen -= len;                                                              \
1406            scan += len;                                                                 \
1407            len = 0;                                                                     \
1408         } else {                                                                        \
1409             len = 1;                                                                    \
1410             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1411             skiplen = UNISKIP(uvc);                                                     \
1412             foldlen -= skiplen;                                                         \
1413             scan = foldbuf + skiplen;                                                   \
1414         }                                                                               \
1415     } else {                                                                            \
1416         /* raw data, will be folded later if needed */                                  \
1417         uvc = (U32)*uc;                                                                 \
1418         len = 1;                                                                        \
1419     }                                                                                   \
1420 } STMT_END
1421
1422
1423
1424 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1425     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1426         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1427         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1428     }                                                           \
1429     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1430     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1431     TRIE_LIST_CUR( state )++;                                   \
1432 } STMT_END
1433
1434 #define TRIE_LIST_NEW(state) STMT_START {                       \
1435     Newxz( trie->states[ state ].trans.list,               \
1436         4, reg_trie_trans_le );                                 \
1437      TRIE_LIST_CUR( state ) = 1;                                \
1438      TRIE_LIST_LEN( state ) = 4;                                \
1439 } STMT_END
1440
1441 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1442     U16 dupe= trie->states[ state ].wordnum;                    \
1443     regnode * const noper_next = regnext( noper );              \
1444                                                                 \
1445     DEBUG_r({                                                   \
1446         /* store the word for dumping */                        \
1447         SV* tmp;                                                \
1448         if (OP(noper) != NOTHING)                               \
1449             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1450         else                                                    \
1451             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1452         av_push( trie_words, tmp );                             \
1453     });                                                         \
1454                                                                 \
1455     curword++;                                                  \
1456     trie->wordinfo[curword].prev   = 0;                         \
1457     trie->wordinfo[curword].len    = wordlen;                   \
1458     trie->wordinfo[curword].accept = state;                     \
1459                                                                 \
1460     if ( noper_next < tail ) {                                  \
1461         if (!trie->jump)                                        \
1462             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1463         trie->jump[curword] = (U16)(noper_next - convert);      \
1464         if (!jumper)                                            \
1465             jumper = noper_next;                                \
1466         if (!nextbranch)                                        \
1467             nextbranch= regnext(cur);                           \
1468     }                                                           \
1469                                                                 \
1470     if ( dupe ) {                                               \
1471         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1472         /* chain, so that when the bits of chain are later    */\
1473         /* linked together, the dups appear in the chain      */\
1474         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1475         trie->wordinfo[dupe].prev = curword;                    \
1476     } else {                                                    \
1477         /* we haven't inserted this word yet.                */ \
1478         trie->states[ state ].wordnum = curword;                \
1479     }                                                           \
1480 } STMT_END
1481
1482
1483 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1484      ( ( base + charid >=  ucharcount                                   \
1485          && base + charid < ubound                                      \
1486          && state == trie->trans[ base - ucharcount + charid ].check    \
1487          && trie->trans[ base - ucharcount + charid ].next )            \
1488            ? trie->trans[ base - ucharcount + charid ].next             \
1489            : ( state==1 ? special : 0 )                                 \
1490       )
1491
1492 #define MADE_TRIE       1
1493 #define MADE_JUMP_TRIE  2
1494 #define MADE_EXACT_TRIE 4
1495
1496 STATIC I32
1497 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1498 {
1499     dVAR;
1500     /* first pass, loop through and scan words */
1501     reg_trie_data *trie;
1502     HV *widecharmap = NULL;
1503     AV *revcharmap = newAV();
1504     regnode *cur;
1505     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1506     STRLEN len = 0;
1507     UV uvc = 0;
1508     U16 curword = 0;
1509     U32 next_alloc = 0;
1510     regnode *jumper = NULL;
1511     regnode *nextbranch = NULL;
1512     regnode *convert = NULL;
1513     U32 *prev_states; /* temp array mapping each state to previous one */
1514     /* we just use folder as a flag in utf8 */
1515     const U8 * folder = NULL;
1516
1517 #ifdef DEBUGGING
1518     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1519     AV *trie_words = NULL;
1520     /* along with revcharmap, this only used during construction but both are
1521      * useful during debugging so we store them in the struct when debugging.
1522      */
1523 #else
1524     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1525     STRLEN trie_charcount=0;
1526 #endif
1527     SV *re_trie_maxbuff;
1528     GET_RE_DEBUG_FLAGS_DECL;
1529
1530     PERL_ARGS_ASSERT_MAKE_TRIE;
1531 #ifndef DEBUGGING
1532     PERL_UNUSED_ARG(depth);
1533 #endif
1534
1535     switch (flags) {
1536         case EXACT: break;
1537         case EXACTFA:
1538         case EXACTFU_SS:
1539         case EXACTFU_TRICKYFOLD:
1540         case EXACTFU: folder = PL_fold_latin1; break;
1541         case EXACTF:  folder = PL_fold; break;
1542         case EXACTFL: folder = PL_fold_locale; break;
1543         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1544     }
1545
1546     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1547     trie->refcount = 1;
1548     trie->startstate = 1;
1549     trie->wordcount = word_count;
1550     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1551     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1552     if (flags == EXACT)
1553         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1554     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1555                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1556
1557     DEBUG_r({
1558         trie_words = newAV();
1559     });
1560
1561     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1562     if (!SvIOK(re_trie_maxbuff)) {
1563         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1564     }
1565     DEBUG_TRIE_COMPILE_r({
1566                 PerlIO_printf( Perl_debug_log,
1567                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1568                   (int)depth * 2 + 2, "",
1569                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1570                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1571                   (int)depth);
1572     });
1573
1574    /* Find the node we are going to overwrite */
1575     if ( first == startbranch && OP( last ) != BRANCH ) {
1576         /* whole branch chain */
1577         convert = first;
1578     } else {
1579         /* branch sub-chain */
1580         convert = NEXTOPER( first );
1581     }
1582
1583     /*  -- First loop and Setup --
1584
1585        We first traverse the branches and scan each word to determine if it
1586        contains widechars, and how many unique chars there are, this is
1587        important as we have to build a table with at least as many columns as we
1588        have unique chars.
1589
1590        We use an array of integers to represent the character codes 0..255
1591        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1592        native representation of the character value as the key and IV's for the
1593        coded index.
1594
1595        *TODO* If we keep track of how many times each character is used we can
1596        remap the columns so that the table compression later on is more
1597        efficient in terms of memory by ensuring the most common value is in the
1598        middle and the least common are on the outside.  IMO this would be better
1599        than a most to least common mapping as theres a decent chance the most
1600        common letter will share a node with the least common, meaning the node
1601        will not be compressible. With a middle is most common approach the worst
1602        case is when we have the least common nodes twice.
1603
1604      */
1605
1606     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1607         regnode *noper = NEXTOPER( cur );
1608         const U8 *uc = (U8*)STRING( noper );
1609         const U8 *e  = uc + STR_LEN( noper );
1610         STRLEN foldlen = 0;
1611         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1612         STRLEN skiplen = 0;
1613         const U8 *scan = (U8*)NULL;
1614         U32 wordlen      = 0;         /* required init */
1615         STRLEN chars = 0;
1616         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1617
1618         if (OP(noper) == NOTHING) {
1619             regnode *noper_next= regnext(noper);
1620             if (noper_next != tail && OP(noper_next) == flags) {
1621                 noper = noper_next;
1622                 uc= (U8*)STRING(noper);
1623                 e= uc + STR_LEN(noper);
1624                 trie->minlen= STR_LEN(noper);
1625             } else {
1626                 trie->minlen= 0;
1627                 continue;
1628             }
1629         }
1630
1631         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1632             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1633                                           regardless of encoding */
1634             if (OP( noper ) == EXACTFU_SS) {
1635                 /* false positives are ok, so just set this */
1636                 TRIE_BITMAP_SET(trie,0xDF);
1637             }
1638         }
1639         for ( ; uc < e ; uc += len ) {
1640             TRIE_CHARCOUNT(trie)++;
1641             TRIE_READ_CHAR;
1642             chars++;
1643             if ( uvc < 256 ) {
1644                 if ( folder ) {
1645                     U8 folded= folder[ (U8) uvc ];
1646                     if ( !trie->charmap[ folded ] ) {
1647                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1648                         TRIE_STORE_REVCHAR( folded );
1649                     }
1650                 }
1651                 if ( !trie->charmap[ uvc ] ) {
1652                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1653                     TRIE_STORE_REVCHAR( uvc );
1654                 }
1655                 if ( set_bit ) {
1656                     /* store the codepoint in the bitmap, and its folded
1657                      * equivalent. */
1658                     TRIE_BITMAP_SET(trie, uvc);
1659
1660                     /* store the folded codepoint */
1661                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1662
1663                     if ( !UTF ) {
1664                         /* store first byte of utf8 representation of
1665                            variant codepoints */
1666                         if (! UNI_IS_INVARIANT(uvc)) {
1667                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1668                         }
1669                     }
1670                     set_bit = 0; /* We've done our bit :-) */
1671                 }
1672             } else {
1673                 SV** svpp;
1674                 if ( !widecharmap )
1675                     widecharmap = newHV();
1676
1677                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1678
1679                 if ( !svpp )
1680                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1681
1682                 if ( !SvTRUE( *svpp ) ) {
1683                     sv_setiv( *svpp, ++trie->uniquecharcount );
1684                     TRIE_STORE_REVCHAR(uvc);
1685                 }
1686             }
1687         }
1688         if( cur == first ) {
1689             trie->minlen = chars;
1690             trie->maxlen = chars;
1691         } else if (chars < trie->minlen) {
1692             trie->minlen = chars;
1693         } else if (chars > trie->maxlen) {
1694             trie->maxlen = chars;
1695         }
1696         if (OP( noper ) == EXACTFU_SS) {
1697             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1698             if (trie->minlen > 1)
1699                 trie->minlen= 1;
1700         }
1701         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1702             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1703              *                - We assume that any such sequence might match a 2 byte string */
1704             if (trie->minlen > 2 )
1705                 trie->minlen= 2;
1706         }
1707
1708     } /* end first pass */
1709     DEBUG_TRIE_COMPILE_r(
1710         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1711                 (int)depth * 2 + 2,"",
1712                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1713                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1714                 (int)trie->minlen, (int)trie->maxlen )
1715     );
1716
1717     /*
1718         We now know what we are dealing with in terms of unique chars and
1719         string sizes so we can calculate how much memory a naive
1720         representation using a flat table  will take. If it's over a reasonable
1721         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1722         conservative but potentially much slower representation using an array
1723         of lists.
1724
1725         At the end we convert both representations into the same compressed
1726         form that will be used in regexec.c for matching with. The latter
1727         is a form that cannot be used to construct with but has memory
1728         properties similar to the list form and access properties similar
1729         to the table form making it both suitable for fast searches and
1730         small enough that its feasable to store for the duration of a program.
1731
1732         See the comment in the code where the compressed table is produced
1733         inplace from the flat tabe representation for an explanation of how
1734         the compression works.
1735
1736     */
1737
1738
1739     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1740     prev_states[1] = 0;
1741
1742     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1743         /*
1744             Second Pass -- Array Of Lists Representation
1745
1746             Each state will be represented by a list of charid:state records
1747             (reg_trie_trans_le) the first such element holds the CUR and LEN
1748             points of the allocated array. (See defines above).
1749
1750             We build the initial structure using the lists, and then convert
1751             it into the compressed table form which allows faster lookups
1752             (but cant be modified once converted).
1753         */
1754
1755         STRLEN transcount = 1;
1756
1757         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1758             "%*sCompiling trie using list compiler\n",
1759             (int)depth * 2 + 2, ""));
1760
1761         trie->states = (reg_trie_state *)
1762             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763                                   sizeof(reg_trie_state) );
1764         TRIE_LIST_NEW(1);
1765         next_alloc = 2;
1766
1767         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768
1769             regnode *noper   = NEXTOPER( cur );
1770             U8 *uc           = (U8*)STRING( noper );
1771             const U8 *e      = uc + STR_LEN( noper );
1772             U32 state        = 1;         /* required init */
1773             U16 charid       = 0;         /* sanity init */
1774             U8 *scan         = (U8*)NULL; /* sanity init */
1775             STRLEN foldlen   = 0;         /* required init */
1776             U32 wordlen      = 0;         /* required init */
1777             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1778             STRLEN skiplen   = 0;
1779
1780             if (OP(noper) == NOTHING) {
1781                 regnode *noper_next= regnext(noper);
1782                 if (noper_next != tail && OP(noper_next) == flags) {
1783                     noper = noper_next;
1784                     uc= (U8*)STRING(noper);
1785                     e= uc + STR_LEN(noper);
1786                 }
1787             }
1788
1789             if (OP(noper) != NOTHING) {
1790                 for ( ; uc < e ; uc += len ) {
1791
1792                     TRIE_READ_CHAR;
1793
1794                     if ( uvc < 256 ) {
1795                         charid = trie->charmap[ uvc ];
1796                     } else {
1797                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1798                         if ( !svpp ) {
1799                             charid = 0;
1800                         } else {
1801                             charid=(U16)SvIV( *svpp );
1802                         }
1803                     }
1804                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1805                     if ( charid ) {
1806
1807                         U16 check;
1808                         U32 newstate = 0;
1809
1810                         charid--;
1811                         if ( !trie->states[ state ].trans.list ) {
1812                             TRIE_LIST_NEW( state );
1813                         }
1814                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1815                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1816                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1817                                 break;
1818                             }
1819                         }
1820                         if ( ! newstate ) {
1821                             newstate = next_alloc++;
1822                             prev_states[newstate] = state;
1823                             TRIE_LIST_PUSH( state, charid, newstate );
1824                             transcount++;
1825                         }
1826                         state = newstate;
1827                     } else {
1828                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1829                     }
1830                 }
1831             }
1832             TRIE_HANDLE_WORD(state);
1833
1834         } /* end second pass */
1835
1836         /* next alloc is the NEXT state to be allocated */
1837         trie->statecount = next_alloc;
1838         trie->states = (reg_trie_state *)
1839             PerlMemShared_realloc( trie->states,
1840                                    next_alloc
1841                                    * sizeof(reg_trie_state) );
1842
1843         /* and now dump it out before we compress it */
1844         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1845                                                          revcharmap, next_alloc,
1846                                                          depth+1)
1847         );
1848
1849         trie->trans = (reg_trie_trans *)
1850             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1851         {
1852             U32 state;
1853             U32 tp = 0;
1854             U32 zp = 0;
1855
1856
1857             for( state=1 ; state < next_alloc ; state ++ ) {
1858                 U32 base=0;
1859
1860                 /*
1861                 DEBUG_TRIE_COMPILE_MORE_r(
1862                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1863                 );
1864                 */
1865
1866                 if (trie->states[state].trans.list) {
1867                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1868                     U16 maxid=minid;
1869                     U16 idx;
1870
1871                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1872                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1873                         if ( forid < minid ) {
1874                             minid=forid;
1875                         } else if ( forid > maxid ) {
1876                             maxid=forid;
1877                         }
1878                     }
1879                     if ( transcount < tp + maxid - minid + 1) {
1880                         transcount *= 2;
1881                         trie->trans = (reg_trie_trans *)
1882                             PerlMemShared_realloc( trie->trans,
1883                                                      transcount
1884                                                      * sizeof(reg_trie_trans) );
1885                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1886                     }
1887                     base = trie->uniquecharcount + tp - minid;
1888                     if ( maxid == minid ) {
1889                         U32 set = 0;
1890                         for ( ; zp < tp ; zp++ ) {
1891                             if ( ! trie->trans[ zp ].next ) {
1892                                 base = trie->uniquecharcount + zp - minid;
1893                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1894                                 trie->trans[ zp ].check = state;
1895                                 set = 1;
1896                                 break;
1897                             }
1898                         }
1899                         if ( !set ) {
1900                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1901                             trie->trans[ tp ].check = state;
1902                             tp++;
1903                             zp = tp;
1904                         }
1905                     } else {
1906                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1907                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1908                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1909                             trie->trans[ tid ].check = state;
1910                         }
1911                         tp += ( maxid - minid + 1 );
1912                     }
1913                     Safefree(trie->states[ state ].trans.list);
1914                 }
1915                 /*
1916                 DEBUG_TRIE_COMPILE_MORE_r(
1917                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1918                 );
1919                 */
1920                 trie->states[ state ].trans.base=base;
1921             }
1922             trie->lasttrans = tp + 1;
1923         }
1924     } else {
1925         /*
1926            Second Pass -- Flat Table Representation.
1927
1928            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1929            We know that we will need Charcount+1 trans at most to store the data
1930            (one row per char at worst case) So we preallocate both structures
1931            assuming worst case.
1932
1933            We then construct the trie using only the .next slots of the entry
1934            structs.
1935
1936            We use the .check field of the first entry of the node temporarily to
1937            make compression both faster and easier by keeping track of how many non
1938            zero fields are in the node.
1939
1940            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1941            transition.
1942
1943            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1944            number representing the first entry of the node, and state as a
1945            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1946            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1947            are 2 entrys per node. eg:
1948
1949              A B       A B
1950           1. 2 4    1. 3 7
1951           2. 0 3    3. 0 5
1952           3. 0 0    5. 0 0
1953           4. 0 0    7. 0 0
1954
1955            The table is internally in the right hand, idx form. However as we also
1956            have to deal with the states array which is indexed by nodenum we have to
1957            use TRIE_NODENUM() to convert.
1958
1959         */
1960         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1961             "%*sCompiling trie using table compiler\n",
1962             (int)depth * 2 + 2, ""));
1963
1964         trie->trans = (reg_trie_trans *)
1965             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1966                                   * trie->uniquecharcount + 1,
1967                                   sizeof(reg_trie_trans) );
1968         trie->states = (reg_trie_state *)
1969             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1970                                   sizeof(reg_trie_state) );
1971         next_alloc = trie->uniquecharcount + 1;
1972
1973
1974         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1975
1976             regnode *noper   = NEXTOPER( cur );
1977             const U8 *uc     = (U8*)STRING( noper );
1978             const U8 *e      = uc + STR_LEN( noper );
1979
1980             U32 state        = 1;         /* required init */
1981
1982             U16 charid       = 0;         /* sanity init */
1983             U32 accept_state = 0;         /* sanity init */
1984             U8 *scan         = (U8*)NULL; /* sanity init */
1985
1986             STRLEN foldlen   = 0;         /* required init */
1987             U32 wordlen      = 0;         /* required init */
1988             STRLEN skiplen   = 0;
1989             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1990
1991             if (OP(noper) == NOTHING) {
1992                 regnode *noper_next= regnext(noper);
1993                 if (noper_next != tail && OP(noper_next) == flags) {
1994                     noper = noper_next;
1995                     uc= (U8*)STRING(noper);
1996                     e= uc + STR_LEN(noper);
1997                 }
1998             }
1999
2000             if ( OP(noper) != NOTHING ) {
2001                 for ( ; uc < e ; uc += len ) {
2002
2003                     TRIE_READ_CHAR;
2004
2005                     if ( uvc < 256 ) {
2006                         charid = trie->charmap[ uvc ];
2007                     } else {
2008                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2009                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2010                     }
2011                     if ( charid ) {
2012                         charid--;
2013                         if ( !trie->trans[ state + charid ].next ) {
2014                             trie->trans[ state + charid ].next = next_alloc;
2015                             trie->trans[ state ].check++;
2016                             prev_states[TRIE_NODENUM(next_alloc)]
2017                                     = TRIE_NODENUM(state);
2018                             next_alloc += trie->uniquecharcount;
2019                         }
2020                         state = trie->trans[ state + charid ].next;
2021                     } else {
2022                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2023                     }
2024                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2025                 }
2026             }
2027             accept_state = TRIE_NODENUM( state );
2028             TRIE_HANDLE_WORD(accept_state);
2029
2030         } /* end second pass */
2031
2032         /* and now dump it out before we compress it */
2033         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2034                                                           revcharmap,
2035                                                           next_alloc, depth+1));
2036
2037         {
2038         /*
2039            * Inplace compress the table.*
2040
2041            For sparse data sets the table constructed by the trie algorithm will
2042            be mostly 0/FAIL transitions or to put it another way mostly empty.
2043            (Note that leaf nodes will not contain any transitions.)
2044
2045            This algorithm compresses the tables by eliminating most such
2046            transitions, at the cost of a modest bit of extra work during lookup:
2047
2048            - Each states[] entry contains a .base field which indicates the
2049            index in the state[] array wheres its transition data is stored.
2050
2051            - If .base is 0 there are no valid transitions from that node.
2052
2053            - If .base is nonzero then charid is added to it to find an entry in
2054            the trans array.
2055
2056            -If trans[states[state].base+charid].check!=state then the
2057            transition is taken to be a 0/Fail transition. Thus if there are fail
2058            transitions at the front of the node then the .base offset will point
2059            somewhere inside the previous nodes data (or maybe even into a node
2060            even earlier), but the .check field determines if the transition is
2061            valid.
2062
2063            XXX - wrong maybe?
2064            The following process inplace converts the table to the compressed
2065            table: We first do not compress the root node 1,and mark all its
2066            .check pointers as 1 and set its .base pointer as 1 as well. This
2067            allows us to do a DFA construction from the compressed table later,
2068            and ensures that any .base pointers we calculate later are greater
2069            than 0.
2070
2071            - We set 'pos' to indicate the first entry of the second node.
2072
2073            - We then iterate over the columns of the node, finding the first and
2074            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2075            and set the .check pointers accordingly, and advance pos
2076            appropriately and repreat for the next node. Note that when we copy
2077            the next pointers we have to convert them from the original
2078            NODEIDX form to NODENUM form as the former is not valid post
2079            compression.
2080
2081            - If a node has no transitions used we mark its base as 0 and do not
2082            advance the pos pointer.
2083
2084            - If a node only has one transition we use a second pointer into the
2085            structure to fill in allocated fail transitions from other states.
2086            This pointer is independent of the main pointer and scans forward
2087            looking for null transitions that are allocated to a state. When it
2088            finds one it writes the single transition into the "hole".  If the
2089            pointer doesnt find one the single transition is appended as normal.
2090
2091            - Once compressed we can Renew/realloc the structures to release the
2092            excess space.
2093
2094            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2095            specifically Fig 3.47 and the associated pseudocode.
2096
2097            demq
2098         */
2099         const U32 laststate = TRIE_NODENUM( next_alloc );
2100         U32 state, charid;
2101         U32 pos = 0, zp=0;
2102         trie->statecount = laststate;
2103
2104         for ( state = 1 ; state < laststate ; state++ ) {
2105             U8 flag = 0;
2106             const U32 stateidx = TRIE_NODEIDX( state );
2107             const U32 o_used = trie->trans[ stateidx ].check;
2108             U32 used = trie->trans[ stateidx ].check;
2109             trie->trans[ stateidx ].check = 0;
2110
2111             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2112                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2113                     if ( trie->trans[ stateidx + charid ].next ) {
2114                         if (o_used == 1) {
2115                             for ( ; zp < pos ; zp++ ) {
2116                                 if ( ! trie->trans[ zp ].next ) {
2117                                     break;
2118                                 }
2119                             }
2120                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2121                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2122                             trie->trans[ zp ].check = state;
2123                             if ( ++zp > pos ) pos = zp;
2124                             break;
2125                         }
2126                         used--;
2127                     }
2128                     if ( !flag ) {
2129                         flag = 1;
2130                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2131                     }
2132                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2133                     trie->trans[ pos ].check = state;
2134                     pos++;
2135                 }
2136             }
2137         }
2138         trie->lasttrans = pos + 1;
2139         trie->states = (reg_trie_state *)
2140             PerlMemShared_realloc( trie->states, laststate
2141                                    * sizeof(reg_trie_state) );
2142         DEBUG_TRIE_COMPILE_MORE_r(
2143                 PerlIO_printf( Perl_debug_log,
2144                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2145                     (int)depth * 2 + 2,"",
2146                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2147                     (IV)next_alloc,
2148                     (IV)pos,
2149                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2150             );
2151
2152         } /* end table compress */
2153     }
2154     DEBUG_TRIE_COMPILE_MORE_r(
2155             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2156                 (int)depth * 2 + 2, "",
2157                 (UV)trie->statecount,
2158                 (UV)trie->lasttrans)
2159     );
2160     /* resize the trans array to remove unused space */
2161     trie->trans = (reg_trie_trans *)
2162         PerlMemShared_realloc( trie->trans, trie->lasttrans
2163                                * sizeof(reg_trie_trans) );
2164
2165     {   /* Modify the program and insert the new TRIE node */
2166         U8 nodetype =(U8)(flags & 0xFF);
2167         char *str=NULL;
2168
2169 #ifdef DEBUGGING
2170         regnode *optimize = NULL;
2171 #ifdef RE_TRACK_PATTERN_OFFSETS
2172
2173         U32 mjd_offset = 0;
2174         U32 mjd_nodelen = 0;
2175 #endif /* RE_TRACK_PATTERN_OFFSETS */
2176 #endif /* DEBUGGING */
2177         /*
2178            This means we convert either the first branch or the first Exact,
2179            depending on whether the thing following (in 'last') is a branch
2180            or not and whther first is the startbranch (ie is it a sub part of
2181            the alternation or is it the whole thing.)
2182            Assuming its a sub part we convert the EXACT otherwise we convert
2183            the whole branch sequence, including the first.
2184          */
2185         /* Find the node we are going to overwrite */
2186         if ( first != startbranch || OP( last ) == BRANCH ) {
2187             /* branch sub-chain */
2188             NEXT_OFF( first ) = (U16)(last - first);
2189 #ifdef RE_TRACK_PATTERN_OFFSETS
2190             DEBUG_r({
2191                 mjd_offset= Node_Offset((convert));
2192                 mjd_nodelen= Node_Length((convert));
2193             });
2194 #endif
2195             /* whole branch chain */
2196         }
2197 #ifdef RE_TRACK_PATTERN_OFFSETS
2198         else {
2199             DEBUG_r({
2200                 const  regnode *nop = NEXTOPER( convert );
2201                 mjd_offset= Node_Offset((nop));
2202                 mjd_nodelen= Node_Length((nop));
2203             });
2204         }
2205         DEBUG_OPTIMISE_r(
2206             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2207                 (int)depth * 2 + 2, "",
2208                 (UV)mjd_offset, (UV)mjd_nodelen)
2209         );
2210 #endif
2211         /* But first we check to see if there is a common prefix we can
2212            split out as an EXACT and put in front of the TRIE node.  */
2213         trie->startstate= 1;
2214         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2215             U32 state;
2216             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2217                 U32 ofs = 0;
2218                 I32 idx = -1;
2219                 U32 count = 0;
2220                 const U32 base = trie->states[ state ].trans.base;
2221
2222                 if ( trie->states[state].wordnum )
2223                         count = 1;
2224
2225                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2226                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2227                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2228                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2229                     {
2230                         if ( ++count > 1 ) {
2231                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2232                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2233                             if ( state == 1 ) break;
2234                             if ( count == 2 ) {
2235                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2236                                 DEBUG_OPTIMISE_r(
2237                                     PerlIO_printf(Perl_debug_log,
2238                                         "%*sNew Start State=%"UVuf" Class: [",
2239                                         (int)depth * 2 + 2, "",
2240                                         (UV)state));
2241                                 if (idx >= 0) {
2242                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2243                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2244
2245                                     TRIE_BITMAP_SET(trie,*ch);
2246                                     if ( folder )
2247                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2248                                     DEBUG_OPTIMISE_r(
2249                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2250                                     );
2251                                 }
2252                             }
2253                             TRIE_BITMAP_SET(trie,*ch);
2254                             if ( folder )
2255                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2256                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2257                         }
2258                         idx = ofs;
2259                     }
2260                 }
2261                 if ( count == 1 ) {
2262                     SV **tmp = av_fetch( revcharmap, idx, 0);
2263                     STRLEN len;
2264                     char *ch = SvPV( *tmp, len );
2265                     DEBUG_OPTIMISE_r({
2266                         SV *sv=sv_newmortal();
2267                         PerlIO_printf( Perl_debug_log,
2268                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2269                             (int)depth * 2 + 2, "",
2270                             (UV)state, (UV)idx,
2271                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2272                                 PL_colors[0], PL_colors[1],
2273                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2274                                 PERL_PV_ESCAPE_FIRSTCHAR
2275                             )
2276                         );
2277                     });
2278                     if ( state==1 ) {
2279                         OP( convert ) = nodetype;
2280                         str=STRING(convert);
2281                         STR_LEN(convert)=0;
2282                     }
2283                     STR_LEN(convert) += len;
2284                     while (len--)
2285                         *str++ = *ch++;
2286                 } else {
2287 #ifdef DEBUGGING
2288                     if (state>1)
2289                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2290 #endif
2291                     break;
2292                 }
2293             }
2294             trie->prefixlen = (state-1);
2295             if (str) {
2296                 regnode *n = convert+NODE_SZ_STR(convert);
2297                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2298                 trie->startstate = state;
2299                 trie->minlen -= (state - 1);
2300                 trie->maxlen -= (state - 1);
2301 #ifdef DEBUGGING
2302                /* At least the UNICOS C compiler choked on this
2303                 * being argument to DEBUG_r(), so let's just have
2304                 * it right here. */
2305                if (
2306 #ifdef PERL_EXT_RE_BUILD
2307                    1
2308 #else
2309                    DEBUG_r_TEST
2310 #endif
2311                    ) {
2312                    regnode *fix = convert;
2313                    U32 word = trie->wordcount;
2314                    mjd_nodelen++;
2315                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2316                    while( ++fix < n ) {
2317                        Set_Node_Offset_Length(fix, 0, 0);
2318                    }
2319                    while (word--) {
2320                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2321                        if (tmp) {
2322                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2323                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2324                            else
2325                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2326                        }
2327                    }
2328                }
2329 #endif
2330                 if (trie->maxlen) {
2331                     convert = n;
2332                 } else {
2333                     NEXT_OFF(convert) = (U16)(tail - convert);
2334                     DEBUG_r(optimize= n);
2335                 }
2336             }
2337         }
2338         if (!jumper)
2339             jumper = last;
2340         if ( trie->maxlen ) {
2341             NEXT_OFF( convert ) = (U16)(tail - convert);
2342             ARG_SET( convert, data_slot );
2343             /* Store the offset to the first unabsorbed branch in
2344                jump[0], which is otherwise unused by the jump logic.
2345                We use this when dumping a trie and during optimisation. */
2346             if (trie->jump)
2347                 trie->jump[0] = (U16)(nextbranch - convert);
2348
2349             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2350              *   and there is a bitmap
2351              *   and the first "jump target" node we found leaves enough room
2352              * then convert the TRIE node into a TRIEC node, with the bitmap
2353              * embedded inline in the opcode - this is hypothetically faster.
2354              */
2355             if ( !trie->states[trie->startstate].wordnum
2356                  && trie->bitmap
2357                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2358             {
2359                 OP( convert ) = TRIEC;
2360                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2361                 PerlMemShared_free(trie->bitmap);
2362                 trie->bitmap= NULL;
2363             } else
2364                 OP( convert ) = TRIE;
2365
2366             /* store the type in the flags */
2367             convert->flags = nodetype;
2368             DEBUG_r({
2369             optimize = convert
2370                       + NODE_STEP_REGNODE
2371                       + regarglen[ OP( convert ) ];
2372             });
2373             /* XXX We really should free up the resource in trie now,
2374                    as we won't use them - (which resources?) dmq */
2375         }
2376         /* needed for dumping*/
2377         DEBUG_r(if (optimize) {
2378             regnode *opt = convert;
2379
2380             while ( ++opt < optimize) {
2381                 Set_Node_Offset_Length(opt,0,0);
2382             }
2383             /*
2384                 Try to clean up some of the debris left after the
2385                 optimisation.
2386              */
2387             while( optimize < jumper ) {
2388                 mjd_nodelen += Node_Length((optimize));
2389                 OP( optimize ) = OPTIMIZED;
2390                 Set_Node_Offset_Length(optimize,0,0);
2391                 optimize++;
2392             }
2393             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2394         });
2395     } /* end node insert */
2396
2397     /*  Finish populating the prev field of the wordinfo array.  Walk back
2398      *  from each accept state until we find another accept state, and if
2399      *  so, point the first word's .prev field at the second word. If the
2400      *  second already has a .prev field set, stop now. This will be the
2401      *  case either if we've already processed that word's accept state,
2402      *  or that state had multiple words, and the overspill words were
2403      *  already linked up earlier.
2404      */
2405     {
2406         U16 word;
2407         U32 state;
2408         U16 prev;
2409
2410         for (word=1; word <= trie->wordcount; word++) {
2411             prev = 0;
2412             if (trie->wordinfo[word].prev)
2413                 continue;
2414             state = trie->wordinfo[word].accept;
2415             while (state) {
2416                 state = prev_states[state];
2417                 if (!state)
2418                     break;
2419                 prev = trie->states[state].wordnum;
2420                 if (prev)
2421                     break;
2422             }
2423             trie->wordinfo[word].prev = prev;
2424         }
2425         Safefree(prev_states);
2426     }
2427
2428
2429     /* and now dump out the compressed format */
2430     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2431
2432     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2433 #ifdef DEBUGGING
2434     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2435     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2436 #else
2437     SvREFCNT_dec(revcharmap);
2438 #endif
2439     return trie->jump
2440            ? MADE_JUMP_TRIE
2441            : trie->startstate>1
2442              ? MADE_EXACT_TRIE
2443              : MADE_TRIE;
2444 }
2445
2446 STATIC void
2447 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2448 {
2449 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2450
2451    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2452    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2453    ISBN 0-201-10088-6
2454
2455    We find the fail state for each state in the trie, this state is the longest proper
2456    suffix of the current state's 'word' that is also a proper prefix of another word in our
2457    trie. State 1 represents the word '' and is thus the default fail state. This allows
2458    the DFA not to have to restart after its tried and failed a word at a given point, it
2459    simply continues as though it had been matching the other word in the first place.
2460    Consider
2461       'abcdgu'=~/abcdefg|cdgu/
2462    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2463    fail, which would bring us to the state representing 'd' in the second word where we would
2464    try 'g' and succeed, proceeding to match 'cdgu'.
2465  */
2466  /* add a fail transition */
2467     const U32 trie_offset = ARG(source);
2468     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2469     U32 *q;
2470     const U32 ucharcount = trie->uniquecharcount;
2471     const U32 numstates = trie->statecount;
2472     const U32 ubound = trie->lasttrans + ucharcount;
2473     U32 q_read = 0;
2474     U32 q_write = 0;
2475     U32 charid;
2476     U32 base = trie->states[ 1 ].trans.base;
2477     U32 *fail;
2478     reg_ac_data *aho;
2479     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2480     GET_RE_DEBUG_FLAGS_DECL;
2481
2482     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2483 #ifndef DEBUGGING
2484     PERL_UNUSED_ARG(depth);
2485 #endif
2486
2487
2488     ARG_SET( stclass, data_slot );
2489     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2490     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2491     aho->trie=trie_offset;
2492     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2493     Copy( trie->states, aho->states, numstates, reg_trie_state );
2494     Newxz( q, numstates, U32);
2495     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2496     aho->refcount = 1;
2497     fail = aho->fail;
2498     /* initialize fail[0..1] to be 1 so that we always have
2499        a valid final fail state */
2500     fail[ 0 ] = fail[ 1 ] = 1;
2501
2502     for ( charid = 0; charid < ucharcount ; charid++ ) {
2503         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2504         if ( newstate ) {
2505             q[ q_write ] = newstate;
2506             /* set to point at the root */
2507             fail[ q[ q_write++ ] ]=1;
2508         }
2509     }
2510     while ( q_read < q_write) {
2511         const U32 cur = q[ q_read++ % numstates ];
2512         base = trie->states[ cur ].trans.base;
2513
2514         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2515             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2516             if (ch_state) {
2517                 U32 fail_state = cur;
2518                 U32 fail_base;
2519                 do {
2520                     fail_state = fail[ fail_state ];
2521                     fail_base = aho->states[ fail_state ].trans.base;
2522                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2523
2524                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2525                 fail[ ch_state ] = fail_state;
2526                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2527                 {
2528                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2529                 }
2530                 q[ q_write++ % numstates] = ch_state;
2531             }
2532         }
2533     }
2534     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2535        when we fail in state 1, this allows us to use the
2536        charclass scan to find a valid start char. This is based on the principle
2537        that theres a good chance the string being searched contains lots of stuff
2538        that cant be a start char.
2539      */
2540     fail[ 0 ] = fail[ 1 ] = 0;
2541     DEBUG_TRIE_COMPILE_r({
2542         PerlIO_printf(Perl_debug_log,
2543                       "%*sStclass Failtable (%"UVuf" states): 0",
2544                       (int)(depth * 2), "", (UV)numstates
2545         );
2546         for( q_read=1; q_read<numstates; q_read++ ) {
2547             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2548         }
2549         PerlIO_printf(Perl_debug_log, "\n");
2550     });
2551     Safefree(q);
2552     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2553 }
2554
2555
2556 /*
2557  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2558  * These need to be revisited when a newer toolchain becomes available.
2559  */
2560 #if defined(__sparc64__) && defined(__GNUC__)
2561 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2562 #       undef  SPARC64_GCC_WORKAROUND
2563 #       define SPARC64_GCC_WORKAROUND 1
2564 #   endif
2565 #endif
2566
2567 #define DEBUG_PEEP(str,scan,depth) \
2568     DEBUG_OPTIMISE_r({if (scan){ \
2569        SV * const mysv=sv_newmortal(); \
2570        regnode *Next = regnext(scan); \
2571        regprop(RExC_rx, mysv, scan); \
2572        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2573        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2574        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2575    }});
2576
2577
2578 /* The below joins as many adjacent EXACTish nodes as possible into a single
2579  * one, and looks for problematic sequences of characters whose folds vs.
2580  * non-folds have sufficiently different lengths, that the optimizer would be
2581  * fooled into rejecting legitimate matches of them, and the trie construction
2582  * code can't cope with them.  The joining is only done if:
2583  * 1) there is room in the current conglomerated node to entirely contain the
2584  *    next one.
2585  * 2) they are the exact same node type
2586  *
2587  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2588  * these get optimized out
2589  *
2590  * If there are problematic code sequences, *min_subtract is set to the delta
2591  * that the minimum size of the node can be less than its actual size.  And,
2592  * the node type of the result is changed to reflect that it contains these
2593  * sequences.
2594  *
2595  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2596  * and contains LATIN SMALL LETTER SHARP S
2597  *
2598  * This is as good a place as any to discuss the design of handling these
2599  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2600  * are three code points in Unicode whose folded lengths differ so much from
2601  * the un-folded lengths that it causes problems for the optimizer and trie
2602  * construction.  Why only these are problematic, and not others where lengths
2603  * also differ is something I (khw) do not understand.  New versions of Unicode
2604  * might add more such code points.  Hopefully the logic in fold_grind.t that
2605  * figures out what to test (in part by verifying that each size-combination
2606  * gets tested) will catch any that do come along, so they can be added to the
2607  * special handling below.  The chances of new ones are actually rather small,
2608  * as most, if not all, of the world's scripts that have casefolding have
2609  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2610  * made to allow compatibility with pre-existing standards, and almost all of
2611  * those have already been dealt with.  These would otherwise be the most
2612  * likely candidates for generating further tricky sequences.  In other words,
2613  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2614  * with pre-existing standards, and there aren't many of those left.
2615  *
2616  * The previous designs for dealing with these involved assigning a special
2617  * node for them.  This approach doesn't work, as evidenced by this example:
2618  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2619  * Both these fold to "sss", but if the pattern is parsed to create a node of
2620  * that would match just the \xDF, it won't be able to handle the case where a
2621  * successful match would have to cross the node's boundary.  The new approach
2622  * that hopefully generally solves the problem generates an EXACTFU_SS node
2623  * that is "sss".
2624  *
2625  * There are a number of components to the approach (a lot of work for just
2626  * three code points!):
2627  * 1)   This routine examines each EXACTFish node that could contain the
2628  *      problematic sequences.  It returns in *min_subtract how much to
2629  *      subtract from the the actual length of the string to get a real minimum
2630  *      for one that could match it.  This number is usually 0 except for the
2631  *      problematic sequences.  This delta is used by the caller to adjust the
2632  *      min length of the match, and the delta between min and max, so that the
2633  *      optimizer doesn't reject these possibilities based on size constraints.
2634  * 2)   These sequences are not currently correctly handled by the trie code
2635  *      either, so it changes the joined node type to ops that are not handled
2636  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2637  * 3)   This is sufficient for the two Greek sequences (described below), but
2638  *      the one involving the Sharp s (\xDF) needs more.  The node type
2639  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2640  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2641  *      case where there is a possible fold length change.  That means that a
2642  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2643  *      itself with length changes, and so can be processed faster.  regexec.c
2644  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2645  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2646  *      However, probably mostly for historical reasons, the pre-folding isn't
2647  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2648  *      nodes, as what they fold to isn't known until runtime.)  The fold
2649  *      possibilities for the non-UTF8 patterns are quite simple, except for
2650  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2651  *      are members of a fold-pair, and arrays are set up for all of them
2652  *      that quickly find the other member of the pair.  It might actually
2653  *      be faster to pre-fold these, but it isn't currently done, except for
2654  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2655  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2656  *      issues described in the next item.
2657  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2658  *      'ss' or not is not knowable at compile time.  It will match iff the
2659  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2660  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2661  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2662  *      described in item 3).  An assumption that the optimizer part of
2663  *      regexec.c (probably unwittingly) makes is that a character in the
2664  *      pattern corresponds to at most a single character in the target string.
2665  *      (And I do mean character, and not byte here, unlike other parts of the
2666  *      documentation that have never been updated to account for multibyte
2667  *      Unicode.)  This assumption is wrong only in this case, as all other
2668  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2669  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2670  *      reluctant to try to change this assumption, so instead the code punts.
2671  *      This routine examines EXACTF nodes for the sharp s, and returns a
2672  *      boolean indicating whether or not the node is an EXACTF node that
2673  *      contains a sharp s.  When it is true, the caller sets a flag that later
2674  *      causes the optimizer in this file to not set values for the floating
2675  *      and fixed string lengths, and thus avoids the optimizer code in
2676  *      regexec.c that makes the invalid assumption.  Thus, there is no
2677  *      optimization based on string lengths for EXACTF nodes that contain the
2678  *      sharp s.  This only happens for /id rules (which means the pattern
2679  *      isn't in UTF-8).
2680  */
2681
2682 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2683     if (PL_regkind[OP(scan)] == EXACT) \
2684         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2685
2686 STATIC U32
2687 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2688     /* Merge several consecutive EXACTish nodes into one. */
2689     regnode *n = regnext(scan);
2690     U32 stringok = 1;
2691     regnode *next = scan + NODE_SZ_STR(scan);
2692     U32 merged = 0;
2693     U32 stopnow = 0;
2694 #ifdef DEBUGGING
2695     regnode *stop = scan;
2696     GET_RE_DEBUG_FLAGS_DECL;
2697 #else
2698     PERL_UNUSED_ARG(depth);
2699 #endif
2700
2701     PERL_ARGS_ASSERT_JOIN_EXACT;
2702 #ifndef EXPERIMENTAL_INPLACESCAN
2703     PERL_UNUSED_ARG(flags);
2704     PERL_UNUSED_ARG(val);
2705 #endif
2706     DEBUG_PEEP("join",scan,depth);
2707
2708     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2709      * EXACT ones that are mergeable to the current one. */
2710     while (n
2711            && (PL_regkind[OP(n)] == NOTHING
2712                || (stringok && OP(n) == OP(scan)))
2713            && NEXT_OFF(n)
2714            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2715     {
2716
2717         if (OP(n) == TAIL || n > next)
2718             stringok = 0;
2719         if (PL_regkind[OP(n)] == NOTHING) {
2720             DEBUG_PEEP("skip:",n,depth);
2721             NEXT_OFF(scan) += NEXT_OFF(n);
2722             next = n + NODE_STEP_REGNODE;
2723 #ifdef DEBUGGING
2724             if (stringok)
2725                 stop = n;
2726 #endif
2727             n = regnext(n);
2728         }
2729         else if (stringok) {
2730             const unsigned int oldl = STR_LEN(scan);
2731             regnode * const nnext = regnext(n);
2732
2733             if (oldl + STR_LEN(n) > U8_MAX)
2734                 break;
2735
2736             DEBUG_PEEP("merg",n,depth);
2737             merged++;
2738
2739             NEXT_OFF(scan) += NEXT_OFF(n);
2740             STR_LEN(scan) += STR_LEN(n);
2741             next = n + NODE_SZ_STR(n);
2742             /* Now we can overwrite *n : */
2743             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2744 #ifdef DEBUGGING
2745             stop = next - 1;
2746 #endif
2747             n = nnext;
2748             if (stopnow) break;
2749         }
2750
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752         if (flags && !NEXT_OFF(n)) {
2753             DEBUG_PEEP("atch", val, depth);
2754             if (reg_off_by_arg[OP(n)]) {
2755                 ARG_SET(n, val - n);
2756             }
2757             else {
2758                 NEXT_OFF(n) = val - n;
2759             }
2760             stopnow = 1;
2761         }
2762 #endif
2763     }
2764
2765     *min_subtract = 0;
2766     *has_exactf_sharp_s = FALSE;
2767
2768     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2769      * can now analyze for sequences of problematic code points.  (Prior to
2770      * this final joining, sequences could have been split over boundaries, and
2771      * hence missed).  The sequences only happen in folding, hence for any
2772      * non-EXACT EXACTish node */
2773     if (OP(scan) != EXACT) {
2774         U8 *s;
2775         U8 * s0 = (U8*) STRING(scan);
2776         U8 * const s_end = s0 + STR_LEN(scan);
2777
2778         /* The below is perhaps overboard, but this allows us to save a test
2779          * each time through the loop at the expense of a mask.  This is
2780          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2781          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2782          * This uses an exclusive 'or' to find that bit and then inverts it to
2783          * form a mask, with just a single 0, in the bit position where 'S' and
2784          * 's' differ. */
2785         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2786         const U8 s_masked = 's' & S_or_s_mask;
2787
2788         /* One pass is made over the node's string looking for all the
2789          * possibilities.  to avoid some tests in the loop, there are two main
2790          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2791          * non-UTF-8 */
2792         if (UTF) {
2793
2794             /* There are two problematic Greek code points in Unicode
2795              * casefolding
2796              *
2797              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2798              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2799              *
2800              * which casefold to
2801              *
2802              * Unicode                      UTF-8
2803              *
2804              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2805              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2806              *
2807              * This means that in case-insensitive matching (or "loose
2808              * matching", as Unicode calls it), an EXACTF of length six (the
2809              * UTF-8 encoded byte length of the above casefolded versions) can
2810              * match a target string of length two (the byte length of UTF-8
2811              * encoded U+0390 or U+03B0).  This would rather mess up the
2812              * minimum length computation.  (there are other code points that
2813              * also fold to these two sequences, but the delta is smaller)
2814              *
2815              * If these sequences are found, the minimum length is decreased by
2816              * four (six minus two).
2817              *
2818              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2819              * LETTER SHARP S.  We decrease the min length by 1 for each
2820              * occurrence of 'ss' found */
2821
2822 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2823 #           define U390_first_byte 0xb4
2824             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2825 #           define U3B0_first_byte 0xb5
2826             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2827 #else
2828 #           define U390_first_byte 0xce
2829             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2830 #           define U3B0_first_byte 0xcf
2831             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2832 #endif
2833             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2834                                                  yields a net of 0 */
2835             /* Examine the string for one of the problematic sequences */
2836             for (s = s0;
2837                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2838                                  * sequence we are looking for is 2 */
2839                  s += UTF8SKIP(s))
2840             {
2841
2842                 /* Look for the first byte in each problematic sequence */
2843                 switch (*s) {
2844                     /* We don't have to worry about other things that fold to
2845                      * 's' (such as the long s, U+017F), as all above-latin1
2846                      * code points have been pre-folded */
2847                     case 's':
2848                     case 'S':
2849
2850                         /* Current character is an 's' or 'S'.  If next one is
2851                          * as well, we have the dreaded sequence */
2852                         if (((*(s+1) & S_or_s_mask) == s_masked)
2853                             /* These two node types don't have special handling
2854                              * for 'ss' */
2855                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2856                         {
2857                             *min_subtract += 1;
2858                             OP(scan) = EXACTFU_SS;
2859                             s++;    /* No need to look at this character again */
2860                         }
2861                         break;
2862
2863                     case U390_first_byte:
2864                         if (s_end - s >= len
2865
2866                             /* The 1's are because are skipping comparing the
2867                              * first byte */
2868                             && memEQ(s + 1, U390_tail, len - 1))
2869                         {
2870                             goto greek_sequence;
2871                         }
2872                         break;
2873
2874                     case U3B0_first_byte:
2875                         if (! (s_end - s >= len
2876                                && memEQ(s + 1, U3B0_tail, len - 1)))
2877                         {
2878                             break;
2879                         }
2880                       greek_sequence:
2881                         *min_subtract += 4;
2882
2883                         /* This can't currently be handled by trie's, so change
2884                          * the node type to indicate this.  If EXACTFA and
2885                          * EXACTFL were ever to be handled by trie's, this
2886                          * would have to be changed.  If this node has already
2887                          * been changed to EXACTFU_SS in this loop, leave it as
2888                          * is.  (I (khw) think it doesn't matter in regexec.c
2889                          * for UTF patterns, but no need to change it */
2890                         if (OP(scan) == EXACTFU) {
2891                             OP(scan) = EXACTFU_TRICKYFOLD;
2892                         }
2893                         s += 6; /* We already know what this sequence is.  Skip
2894                                    the rest of it */
2895                         break;
2896                 }
2897             }
2898         }
2899         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2900
2901             /* Here, the pattern is not UTF-8.  We need to look only for the
2902              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2903              * in the final position.  Otherwise we can stop looking 1 byte
2904              * earlier because have to find both the first and second 's' */
2905             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2906
2907             for (s = s0; s < upper; s++) {
2908                 switch (*s) {
2909                     case 'S':
2910                     case 's':
2911                         if (s_end - s > 1
2912                             && ((*(s+1) & S_or_s_mask) == s_masked))
2913                         {
2914                             *min_subtract += 1;
2915
2916                             /* EXACTF nodes need to know that the minimum
2917                              * length changed so that a sharp s in the string
2918                              * can match this ss in the pattern, but they
2919                              * remain EXACTF nodes, as they are not trie'able,
2920                              * so don't have to invent a new node type to
2921                              * exclude them from the trie code */
2922                             if (OP(scan) != EXACTF) {
2923                                 OP(scan) = EXACTFU_SS;
2924                             }
2925                             s++;
2926                         }
2927                         break;
2928                     case LATIN_SMALL_LETTER_SHARP_S:
2929                         if (OP(scan) == EXACTF) {
2930                             *has_exactf_sharp_s = TRUE;
2931                         }
2932                         break;
2933                 }
2934             }
2935         }
2936     }
2937
2938 #ifdef DEBUGGING
2939     /* Allow dumping but overwriting the collection of skipped
2940      * ops and/or strings with fake optimized ops */
2941     n = scan + NODE_SZ_STR(scan);
2942     while (n <= stop) {
2943         OP(n) = OPTIMIZED;
2944         FLAGS(n) = 0;
2945         NEXT_OFF(n) = 0;
2946         n++;
2947     }
2948 #endif
2949     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2950     return stopnow;
2951 }
2952
2953 /* REx optimizer.  Converts nodes into quicker variants "in place".
2954    Finds fixed substrings.  */
2955
2956 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2957    to the position after last scanned or to NULL. */
2958
2959 #define INIT_AND_WITHP \
2960     assert(!and_withp); \
2961     Newx(and_withp,1,struct regnode_charclass_class); \
2962     SAVEFREEPV(and_withp)
2963
2964 /* this is a chain of data about sub patterns we are processing that
2965    need to be handled separately/specially in study_chunk. Its so
2966    we can simulate recursion without losing state.  */
2967 struct scan_frame;
2968 typedef struct scan_frame {
2969     regnode *last;  /* last node to process in this frame */
2970     regnode *next;  /* next node to process when last is reached */
2971     struct scan_frame *prev; /*previous frame*/
2972     I32 stop; /* what stopparen do we use */
2973 } scan_frame;
2974
2975
2976 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2977
2978 #define CASE_SYNST_FNC(nAmE)                                       \
2979 case nAmE:                                                         \
2980     if (flags & SCF_DO_STCLASS_AND) {                              \
2981             for (value = 0; value < 256; value++)                  \
2982                 if (!is_ ## nAmE ## _cp(value))                       \
2983                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2984     }                                                              \
2985     else {                                                         \
2986             for (value = 0; value < 256; value++)                  \
2987                 if (is_ ## nAmE ## _cp(value))                        \
2988                     ANYOF_BITMAP_SET(data->start_class, value);    \
2989     }                                                              \
2990     break;                                                         \
2991 case N ## nAmE:                                                    \
2992     if (flags & SCF_DO_STCLASS_AND) {                              \
2993             for (value = 0; value < 256; value++)                   \
2994                 if (is_ ## nAmE ## _cp(value))                         \
2995                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2996     }                                                               \
2997     else {                                                          \
2998             for (value = 0; value < 256; value++)                   \
2999                 if (!is_ ## nAmE ## _cp(value))                        \
3000                     ANYOF_BITMAP_SET(data->start_class, value);     \
3001     }                                                               \
3002     break
3003
3004
3005
3006 STATIC I32
3007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3008                         I32 *minlenp, I32 *deltap,
3009                         regnode *last,
3010                         scan_data_t *data,
3011                         I32 stopparen,
3012                         U8* recursed,
3013                         struct regnode_charclass_class *and_withp,
3014                         U32 flags, U32 depth)
3015                         /* scanp: Start here (read-write). */
3016                         /* deltap: Write maxlen-minlen here. */
3017                         /* last: Stop before this one. */
3018                         /* data: string data about the pattern */
3019                         /* stopparen: treat close N as END */
3020                         /* recursed: which subroutines have we recursed into */
3021                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3022 {
3023     dVAR;
3024     I32 min = 0, pars = 0, code;
3025     regnode *scan = *scanp, *next;
3026     I32 delta = 0;
3027     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3028     int is_inf_internal = 0;            /* The studied chunk is infinite */
3029     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3030     scan_data_t data_fake;
3031     SV *re_trie_maxbuff = NULL;
3032     regnode *first_non_open = scan;
3033     I32 stopmin = I32_MAX;
3034     scan_frame *frame = NULL;
3035     GET_RE_DEBUG_FLAGS_DECL;
3036
3037     PERL_ARGS_ASSERT_STUDY_CHUNK;
3038
3039 #ifdef DEBUGGING
3040     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3041 #endif
3042
3043     if ( depth == 0 ) {
3044         while (first_non_open && OP(first_non_open) == OPEN)
3045             first_non_open=regnext(first_non_open);
3046     }
3047
3048
3049   fake_study_recurse:
3050     while ( scan && OP(scan) != END && scan < last ){
3051         UV min_subtract = 0;    /* How much to subtract from the minimum node
3052                                    length to get a real minimum (because the
3053                                    folded version may be shorter) */
3054         bool has_exactf_sharp_s = FALSE;
3055         /* Peephole optimizer: */
3056         DEBUG_STUDYDATA("Peep:", data,depth);
3057         DEBUG_PEEP("Peep",scan,depth);
3058
3059         /* Its not clear to khw or hv why this is done here, and not in the
3060          * clauses that deal with EXACT nodes.  khw's guess is that it's
3061          * because of a previous design */
3062         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3063
3064         /* Follow the next-chain of the current node and optimize
3065            away all the NOTHINGs from it.  */
3066         if (OP(scan) != CURLYX) {
3067             const int max = (reg_off_by_arg[OP(scan)]
3068                        ? I32_MAX
3069                        /* I32 may be smaller than U16 on CRAYs! */
3070                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3071             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3072             int noff;
3073             regnode *n = scan;
3074
3075             /* Skip NOTHING and LONGJMP. */
3076             while ((n = regnext(n))
3077                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3078                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3079                    && off + noff < max)
3080                 off += noff;
3081             if (reg_off_by_arg[OP(scan)])
3082                 ARG(scan) = off;
3083             else
3084                 NEXT_OFF(scan) = off;
3085         }
3086
3087
3088
3089         /* The principal pseudo-switch.  Cannot be a switch, since we
3090            look into several different things.  */
3091         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3092                    || OP(scan) == IFTHEN) {
3093             next = regnext(scan);
3094             code = OP(scan);
3095             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3096
3097             if (OP(next) == code || code == IFTHEN) {
3098                 /* NOTE - There is similar code to this block below for handling
3099                    TRIE nodes on a re-study.  If you change stuff here check there
3100                    too. */
3101                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3102                 struct regnode_charclass_class accum;
3103                 regnode * const startbranch=scan;
3104
3105                 if (flags & SCF_DO_SUBSTR)
3106                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3107                 if (flags & SCF_DO_STCLASS)
3108                     cl_init_zero(pRExC_state, &accum);
3109
3110                 while (OP(scan) == code) {
3111                     I32 deltanext, minnext, f = 0, fake;
3112                     struct regnode_charclass_class this_class;
3113
3114                     num++;
3115                     data_fake.flags = 0;
3116                     if (data) {
3117                         data_fake.whilem_c = data->whilem_c;
3118                         data_fake.last_closep = data->last_closep;
3119                     }
3120                     else
3121                         data_fake.last_closep = &fake;
3122
3123                     data_fake.pos_delta = delta;
3124                     next = regnext(scan);
3125                     scan = NEXTOPER(scan);
3126                     if (code != BRANCH)
3127                         scan = NEXTOPER(scan);
3128                     if (flags & SCF_DO_STCLASS) {
3129                         cl_init(pRExC_state, &this_class);
3130                         data_fake.start_class = &this_class;
3131                         f = SCF_DO_STCLASS_AND;
3132                     }
3133                     if (flags & SCF_WHILEM_VISITED_POS)
3134                         f |= SCF_WHILEM_VISITED_POS;
3135
3136                     /* we suppose the run is continuous, last=next...*/
3137                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3138                                           next, &data_fake,
3139                                           stopparen, recursed, NULL, f,depth+1);
3140                     if (min1 > minnext)
3141                         min1 = minnext;
3142                     if (max1 < minnext + deltanext)
3143                         max1 = minnext + deltanext;
3144                     if (deltanext == I32_MAX)
3145                         is_inf = is_inf_internal = 1;
3146                     scan = next;
3147                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3148                         pars++;
3149                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3150                         if ( stopmin > minnext)
3151                             stopmin = min + min1;
3152                         flags &= ~SCF_DO_SUBSTR;
3153                         if (data)
3154                             data->flags |= SCF_SEEN_ACCEPT;
3155                     }
3156                     if (data) {
3157                         if (data_fake.flags & SF_HAS_EVAL)
3158                             data->flags |= SF_HAS_EVAL;
3159                         data->whilem_c = data_fake.whilem_c;
3160                     }
3161                     if (flags & SCF_DO_STCLASS)
3162                         cl_or(pRExC_state, &accum, &this_class);
3163                 }
3164                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3165                     min1 = 0;
3166                 if (flags & SCF_DO_SUBSTR) {
3167                     data->pos_min += min1;
3168                     data->pos_delta += max1 - min1;
3169                     if (max1 != min1 || is_inf)
3170                         data->longest = &(data->longest_float);
3171                 }
3172                 min += min1;
3173                 delta += max1 - min1;
3174                 if (flags & SCF_DO_STCLASS_OR) {
3175                     cl_or(pRExC_state, data->start_class, &accum);
3176                     if (min1) {
3177                         cl_and(data->start_class, and_withp);
3178                         flags &= ~SCF_DO_STCLASS;
3179                     }
3180                 }
3181                 else if (flags & SCF_DO_STCLASS_AND) {
3182                     if (min1) {
3183                         cl_and(data->start_class, &accum);
3184                         flags &= ~SCF_DO_STCLASS;
3185                     }
3186                     else {
3187                         /* Switch to OR mode: cache the old value of
3188                          * data->start_class */
3189                         INIT_AND_WITHP;
3190                         StructCopy(data->start_class, and_withp,
3191                                    struct regnode_charclass_class);
3192                         flags &= ~SCF_DO_STCLASS_AND;
3193                         StructCopy(&accum, data->start_class,
3194                                    struct regnode_charclass_class);
3195                         flags |= SCF_DO_STCLASS_OR;
3196                         data->start_class->flags |= ANYOF_EOS;
3197                     }
3198                 }
3199
3200                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201                 /* demq.
3202
3203                    Assuming this was/is a branch we are dealing with: 'scan' now
3204                    points at the item that follows the branch sequence, whatever
3205                    it is. We now start at the beginning of the sequence and look
3206                    for subsequences of
3207
3208                    BRANCH->EXACT=>x1
3209                    BRANCH->EXACT=>x2
3210                    tail
3211
3212                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3213
3214                    If we can find such a subsequence we need to turn the first
3215                    element into a trie and then add the subsequent branch exact
3216                    strings to the trie.
3217
3218                    We have two cases
3219
3220                      1. patterns where the whole set of branches can be converted.
3221
3222                      2. patterns where only a subset can be converted.
3223
3224                    In case 1 we can replace the whole set with a single regop
3225                    for the trie. In case 2 we need to keep the start and end
3226                    branches so
3227
3228                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229                      becomes BRANCH TRIE; BRANCH X;
3230
3231                   There is an additional case, that being where there is a
3232                   common prefix, which gets split out into an EXACT like node
3233                   preceding the TRIE node.
3234
3235                   If x(1..n)==tail then we can do a simple trie, if not we make
3236                   a "jump" trie, such that when we match the appropriate word
3237                   we "jump" to the appropriate tail node. Essentially we turn
3238                   a nested if into a case structure of sorts.
3239
3240                 */
3241
3242                     int made=0;
3243                     if (!re_trie_maxbuff) {
3244                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245                         if (!SvIOK(re_trie_maxbuff))
3246                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3247                     }
3248                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3249                         regnode *cur;
3250                         regnode *first = (regnode *)NULL;
3251                         regnode *last = (regnode *)NULL;
3252                         regnode *tail = scan;
3253                         U8 trietype = 0;
3254                         U32 count=0;
3255
3256 #ifdef DEBUGGING
3257                         SV * const mysv = sv_newmortal();       /* for dumping */
3258 #endif
3259                         /* var tail is used because there may be a TAIL
3260                            regop in the way. Ie, the exacts will point to the
3261                            thing following the TAIL, but the last branch will
3262                            point at the TAIL. So we advance tail. If we
3263                            have nested (?:) we may have to move through several
3264                            tails.
3265                          */
3266
3267                         while ( OP( tail ) == TAIL ) {
3268                             /* this is the TAIL generated by (?:) */
3269                             tail = regnext( tail );
3270                         }
3271
3272
3273                         DEBUG_TRIE_COMPILE_r({
3274                             regprop(RExC_rx, mysv, tail );
3275                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276                                 (int)depth * 2 + 2, "",
3277                                 "Looking for TRIE'able sequences. Tail node is: ",
3278                                 SvPV_nolen_const( mysv )
3279                             );
3280                         });
3281
3282                         /*
3283
3284                             Step through the branches
3285                                 cur represents each branch,
3286                                 noper is the first thing to be matched as part of that branch
3287                                 noper_next is the regnext() of that node.
3288
3289                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290                             via a "jump trie" but we also support building with NOJUMPTRIE,
3291                             which restricts the trie logic to structures like /FOO|BAR/.
3292
3293                             If noper is a trieable nodetype then the branch is a possible optimization
3294                             target. If we are building under NOJUMPTRIE then we require that noper_next
3295                             is the same as scan (our current position in the regex program).
3296
3297                             Once we have two or more consecutive such branches we can create a
3298                             trie of the EXACT's contents and stitch it in place into the program.
3299
3300                             If the sequence represents all of the branches in the alternation we
3301                             replace the entire thing with a single TRIE node.
3302
3303                             Otherwise when it is a subsequence we need to stitch it in place and
3304                             replace only the relevant branches. This means the first branch has
3305                             to remain as it is used by the alternation logic, and its next pointer,
3306                             and needs to be repointed at the item on the branch chain following
3307                             the last branch we have optimized away.
3308
3309                             This could be either a BRANCH, in which case the subsequence is internal,
3310                             or it could be the item following the branch sequence in which case the
3311                             subsequence is at the end (which does not necessarily mean the first node
3312                             is the start of the alternation).
3313
3314                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315
3316                                 optype          |  trietype
3317                                 ----------------+-----------
3318                                 NOTHING         | NOTHING
3319                                 EXACT           | EXACT
3320                                 EXACTFU         | EXACTFU
3321                                 EXACTFU_SS      | EXACTFU
3322                                 EXACTFU_TRICKYFOLD | EXACTFU
3323                                 EXACTFA         | 0
3324
3325
3326                         */
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3328                        ( EXACT == (X) )   ? EXACT :        \
3329                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3330                        0 )
3331
3332                         /* dont use tail as the end marker for this traverse */
3333                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334                             regnode * const noper = NEXTOPER( cur );
3335                             U8 noper_type = OP( noper );
3336                             U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338                             regnode * const noper_next = regnext( noper );
3339                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 #endif
3342
3343                             DEBUG_TRIE_COMPILE_r({
3344                                 regprop(RExC_rx, mysv, cur);
3345                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3347
3348                                 regprop(RExC_rx, mysv, noper);
3349                                 PerlIO_printf( Perl_debug_log, " -> %s",
3350                                     SvPV_nolen_const(mysv));
3351
3352                                 if ( noper_next ) {
3353                                   regprop(RExC_rx, mysv, noper_next );
3354                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355                                     SvPV_nolen_const(mysv));
3356                                 }
3357                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3360                                 );
3361                             });
3362
3363                             /* Is noper a trieable nodetype that can be merged with the
3364                              * current trie (if there is one)? */
3365                             if ( noper_trietype
3366                                   &&
3367                                   (
3368                                         ( noper_trietype == NOTHING)
3369                                         || ( trietype == NOTHING )
3370                                         || ( trietype == noper_trietype )
3371                                   )
3372 #ifdef NOJUMPTRIE
3373                                   && noper_next == tail
3374 #endif
3375                                   && count < U16_MAX)
3376                             {
3377                                 /* Handle mergable triable node
3378                                  * Either we are the first node in a new trieable sequence,
3379                                  * in which case we do some bookkeeping, otherwise we update
3380                                  * the end pointer. */
3381                                 if ( !first ) {
3382                                     first = cur;
3383                                     trietype = noper_trietype;
3384                                     if ( noper_trietype == NOTHING ) {
3385 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3386                                         regnode * const noper_next = regnext( noper );
3387                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3388                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3389 #endif
3390
3391                                         if ( noper_next_trietype )
3392                                             trietype = noper_next_trietype;
3393                                     }
3394                                 } else {
3395                                     if ( trietype == NOTHING )
3396                                         trietype = noper_trietype;
3397                                     last = cur;
3398                                 }
3399                                 if (first)
3400                                     count++;
3401                             } /* end handle mergable triable node */
3402                             else {
3403                                 /* handle unmergable node -
3404                                  * noper may either be a triable node which can not be tried
3405                                  * together with the current trie, or a non triable node */
3406                                 if ( last ) {
3407                                     /* If last is set and trietype is not NOTHING then we have found
3408                                      * at least two triable branch sequences in a row of a similar
3409                                      * trietype so we can turn them into a trie. If/when we
3410                                      * allow NOTHING to start a trie sequence this condition will be
3411                                      * required, and it isn't expensive so we leave it in for now. */
3412                                     if ( trietype != NOTHING )
3413                                         make_trie( pRExC_state,
3414                                                 startbranch, first, cur, tail, count,
3415                                                 trietype, depth+1 );
3416                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3417                                 }
3418                                 if ( noper_trietype
3419 #ifdef NOJUMPTRIE
3420                                      && noper_next == tail
3421 #endif
3422                                 ){
3423                                     /* noper is triable, so we can start a new trie sequence */
3424                                     count = 1;
3425                                     first = cur;
3426                                     trietype = noper_trietype;
3427                                 } else if (first) {
3428                                     /* if we already saw a first but the current node is not triable then we have
3429                                      * to reset the first information. */
3430                                     count = 0;
3431                                     first = NULL;
3432                                     trietype = 0;
3433                                 }
3434                             } /* end handle unmergable node */
3435                         } /* loop over branches */
3436                         DEBUG_TRIE_COMPILE_r({
3437                             regprop(RExC_rx, mysv, cur);
3438                             PerlIO_printf( Perl_debug_log,
3439                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3440                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3441
3442                         });
3443                         if ( last ) {
3444                             if ( trietype != NOTHING ) {
3445                                 /* the last branch of the sequence was part of a trie,
3446                                  * so we have to construct it here outside of the loop
3447                                  */
3448                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3449 #ifdef TRIE_STUDY_OPT
3450                                 if ( ((made == MADE_EXACT_TRIE &&
3451                                      startbranch == first)
3452                                      || ( first_non_open == first )) &&
3453                                      depth==0 ) {
3454                                     flags |= SCF_TRIE_RESTUDY;
3455                                     if ( startbranch == first
3456                                          && scan == tail )
3457                                     {
3458                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3459                                     }
3460                                 }
3461 #endif
3462                             } else {
3463                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3464                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3465                                  */
3466                                 if ( startbranch == first ) {
3467                                     regnode *opt;
3468                                     /* the entire thing is a NOTHING sequence, something like this:
3469                                      * (?:|) So we can turn it into a plain NOTHING op. */
3470                                     DEBUG_TRIE_COMPILE_r({
3471                                         regprop(RExC_rx, mysv, cur);
3472                                         PerlIO_printf( Perl_debug_log,
3473                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3474                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3475
3476                                     });
3477                                     OP(startbranch)= NOTHING;
3478                                     NEXT_OFF(startbranch)= tail - startbranch;
3479                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3480                                         OP(opt)= OPTIMIZED;
3481                                 }
3482                             }
3483                         } /* end if ( last) */
3484                     } /* TRIE_MAXBUF is non zero */
3485
3486                 } /* do trie */
3487
3488             }
3489             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3490                 scan = NEXTOPER(NEXTOPER(scan));
3491             } else                      /* single branch is optimized. */
3492                 scan = NEXTOPER(scan);
3493             continue;
3494         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3495             scan_frame *newframe = NULL;
3496             I32 paren;
3497             regnode *start;
3498             regnode *end;
3499
3500             if (OP(scan) != SUSPEND) {
3501             /* set the pointer */
3502                 if (OP(scan) == GOSUB) {
3503                     paren = ARG(scan);
3504                     RExC_recurse[ARG2L(scan)] = scan;
3505                     start = RExC_open_parens[paren-1];
3506                     end   = RExC_close_parens[paren-1];
3507                 } else {
3508                     paren = 0;
3509                     start = RExC_rxi->program + 1;
3510                     end   = RExC_opend;
3511                 }
3512                 if (!recursed) {
3513                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3514                     SAVEFREEPV(recursed);
3515                 }
3516                 if (!PAREN_TEST(recursed,paren+1)) {
3517                     PAREN_SET(recursed,paren+1);
3518                     Newx(newframe,1,scan_frame);
3519                 } else {
3520                     if (flags & SCF_DO_SUBSTR) {
3521                         SCAN_COMMIT(pRExC_state,data,minlenp);
3522                         data->longest = &(data->longest_float);
3523                     }
3524                     is_inf = is_inf_internal = 1;
3525                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3526                         cl_anything(pRExC_state, data->start_class);
3527                     flags &= ~SCF_DO_STCLASS;
3528                 }
3529             } else {
3530                 Newx(newframe,1,scan_frame);
3531                 paren = stopparen;
3532                 start = scan+2;
3533                 end = regnext(scan);
3534             }
3535             if (newframe) {
3536                 assert(start);
3537                 assert(end);
3538                 SAVEFREEPV(newframe);
3539                 newframe->next = regnext(scan);
3540                 newframe->last = last;
3541                 newframe->stop = stopparen;
3542                 newframe->prev = frame;
3543
3544                 frame = newframe;
3545                 scan =  start;
3546                 stopparen = paren;
3547                 last = end;
3548
3549                 continue;
3550             }
3551         }
3552         else if (OP(scan) == EXACT) {
3553             I32 l = STR_LEN(scan);
3554             UV uc;
3555             if (UTF) {
3556                 const U8 * const s = (U8*)STRING(scan);
3557                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3558                 l = utf8_length(s, s + l);
3559             } else {
3560                 uc = *((U8*)STRING(scan));
3561             }
3562             min += l;
3563             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3564                 /* The code below prefers earlier match for fixed
3565                    offset, later match for variable offset.  */
3566                 if (data->last_end == -1) { /* Update the start info. */
3567                     data->last_start_min = data->pos_min;
3568                     data->last_start_max = is_inf
3569                         ? I32_MAX : data->pos_min + data->pos_delta;
3570                 }
3571                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3572                 if (UTF)
3573                     SvUTF8_on(data->last_found);
3574                 {
3575                     SV * const sv = data->last_found;
3576                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3577                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3578                     if (mg && mg->mg_len >= 0)
3579                         mg->mg_len += utf8_length((U8*)STRING(scan),
3580                                                   (U8*)STRING(scan)+STR_LEN(scan));
3581                 }
3582                 data->last_end = data->pos_min + l;
3583                 data->pos_min += l; /* As in the first entry. */
3584                 data->flags &= ~SF_BEFORE_EOL;
3585             }
3586             if (flags & SCF_DO_STCLASS_AND) {
3587                 /* Check whether it is compatible with what we know already! */
3588                 int compat = 1;
3589
3590
3591                 /* If compatible, we or it in below.  It is compatible if is
3592                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3593                  * it's for a locale.  Even if there isn't unicode semantics
3594                  * here, at runtime there may be because of matching against a
3595                  * utf8 string, so accept a possible false positive for
3596                  * latin1-range folds */
3597                 if (uc >= 0x100 ||
3598                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3599                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3600                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3601                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3602                     )
3603                 {
3604                     compat = 0;
3605                 }
3606                 ANYOF_CLASS_ZERO(data->start_class);
3607                 ANYOF_BITMAP_ZERO(data->start_class);
3608                 if (compat)
3609                     ANYOF_BITMAP_SET(data->start_class, uc);
3610                 else if (uc >= 0x100) {
3611                     int i;
3612
3613                     /* Some Unicode code points fold to the Latin1 range; as
3614                      * XXX temporary code, instead of figuring out if this is
3615                      * one, just assume it is and set all the start class bits
3616                      * that could be some such above 255 code point's fold
3617                      * which will generate fals positives.  As the code
3618                      * elsewhere that does compute the fold settles down, it
3619                      * can be extracted out and re-used here */
3620                     for (i = 0; i < 256; i++){
3621                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3622                             ANYOF_BITMAP_SET(data->start_class, i);
3623                         }
3624                     }
3625                 }
3626                 data->start_class->flags &= ~ANYOF_EOS;
3627                 if (uc < 0x100)
3628                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3629             }
3630             else if (flags & SCF_DO_STCLASS_OR) {
3631                 /* false positive possible if the class is case-folded */
3632                 if (uc < 0x100)
3633                     ANYOF_BITMAP_SET(data->start_class, uc);
3634                 else
3635                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3636                 data->start_class->flags &= ~ANYOF_EOS;
3637                 cl_and(data->start_class, and_withp);
3638             }
3639             flags &= ~SCF_DO_STCLASS;
3640         }
3641         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3642             I32 l = STR_LEN(scan);
3643             UV uc = *((U8*)STRING(scan));
3644
3645             /* Search for fixed substrings supports EXACT only. */
3646             if (flags & SCF_DO_SUBSTR) {
3647                 assert(data);
3648                 SCAN_COMMIT(pRExC_state, data, minlenp);
3649             }
3650             if (UTF) {
3651                 const U8 * const s = (U8 *)STRING(scan);
3652                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3653                 l = utf8_length(s, s + l);
3654             }
3655             else if (has_exactf_sharp_s) {
3656                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3657             }
3658             min += l - min_subtract;
3659             if (min < 0) {
3660                 min = 0;
3661             }
3662             delta += min_subtract;
3663             if (flags & SCF_DO_SUBSTR) {
3664                 data->pos_min += l - min_subtract;
3665                 if (data->pos_min < 0) {
3666                     data->pos_min = 0;
3667                 }
3668                 data->pos_delta += min_subtract;
3669                 if (min_subtract) {
3670                     data->longest = &(data->longest_float);
3671                 }
3672             }
3673             if (flags & SCF_DO_STCLASS_AND) {
3674                 /* Check whether it is compatible with what we know already! */
3675                 int compat = 1;
3676                 if (uc >= 0x100 ||
3677                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3678                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3679                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3680                 {
3681                     compat = 0;
3682                 }
3683                 ANYOF_CLASS_ZERO(data->start_class);
3684                 ANYOF_BITMAP_ZERO(data->start_class);
3685                 if (compat) {
3686                     ANYOF_BITMAP_SET(data->start_class, uc);
3687                     data->start_class->flags &= ~ANYOF_EOS;
3688                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3689                     if (OP(scan) == EXACTFL) {
3690                         /* XXX This set is probably no longer necessary, and
3691                          * probably wrong as LOCALE now is on in the initial
3692                          * state */
3693                         data->start_class->flags |= ANYOF_LOCALE;
3694                     }
3695                     else {
3696
3697                         /* Also set the other member of the fold pair.  In case
3698                          * that unicode semantics is called for at runtime, use
3699                          * the full latin1 fold.  (Can't do this for locale,
3700                          * because not known until runtime) */
3701                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3702
3703                         /* All other (EXACTFL handled above) folds except under
3704                          * /iaa that include s, S, and sharp_s also may include
3705                          * the others */
3706                         if (OP(scan) != EXACTFA) {
3707                             if (uc == 's' || uc == 'S') {
3708                                 ANYOF_BITMAP_SET(data->start_class,
3709                                                  LATIN_SMALL_LETTER_SHARP_S);
3710                             }
3711                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712                                 ANYOF_BITMAP_SET(data->start_class, 's');
3713                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3714                             }
3715                         }
3716                     }
3717                 }
3718                 else if (uc >= 0x100) {
3719                     int i;
3720                     for (i = 0; i < 256; i++){
3721                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722                             ANYOF_BITMAP_SET(data->start_class, i);
3723                         }
3724                     }
3725                 }
3726             }
3727             else if (flags & SCF_DO_STCLASS_OR) {
3728                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3729                     /* false positive possible if the class is case-folded.
3730                        Assume that the locale settings are the same... */
3731                     if (uc < 0x100) {
3732                         ANYOF_BITMAP_SET(data->start_class, uc);
3733                         if (OP(scan) != EXACTFL) {
3734
3735                             /* And set the other member of the fold pair, but
3736                              * can't do that in locale because not known until
3737                              * run-time */
3738                             ANYOF_BITMAP_SET(data->start_class,
3739                                              PL_fold_latin1[uc]);
3740
3741                             /* All folds except under /iaa that include s, S,
3742                              * and sharp_s also may include the others */
3743                             if (OP(scan) != EXACTFA) {
3744                                 if (uc == 's' || uc == 'S') {
3745                                     ANYOF_BITMAP_SET(data->start_class,
3746                                                    LATIN_SMALL_LETTER_SHARP_S);
3747                                 }
3748                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749                                     ANYOF_BITMAP_SET(data->start_class, 's');
3750                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3751                                 }
3752                             }
3753                         }
3754                     }
3755                     data->start_class->flags &= ~ANYOF_EOS;
3756                 }
3757                 cl_and(data->start_class, and_withp);
3758             }
3759             flags &= ~SCF_DO_STCLASS;
3760         }
3761         else if (REGNODE_VARIES(OP(scan))) {
3762             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763             I32 f = flags, pos_before = 0;
3764             regnode * const oscan = scan;
3765             struct regnode_charclass_class this_class;
3766             struct regnode_charclass_class *oclass = NULL;
3767             I32 next_is_eval = 0;
3768
3769             switch (PL_regkind[OP(scan)]) {
3770             case WHILEM:                /* End of (?:...)* . */
3771                 scan = NEXTOPER(scan);
3772                 goto finish;
3773             case PLUS:
3774                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775                     next = NEXTOPER(scan);
3776                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3777                         mincount = 1;
3778                         maxcount = REG_INFTY;
3779                         next = regnext(scan);
3780                         scan = NEXTOPER(scan);
3781                         goto do_curly;
3782                     }
3783                 }
3784                 if (flags & SCF_DO_SUBSTR)
3785                     data->pos_min++;
3786                 min++;
3787                 /* Fall through. */
3788             case STAR:
3789                 if (flags & SCF_DO_STCLASS) {
3790                     mincount = 0;
3791                     maxcount = REG_INFTY;
3792                     next = regnext(scan);
3793                     scan = NEXTOPER(scan);
3794                     goto do_curly;
3795                 }
3796                 is_inf = is_inf_internal = 1;
3797                 scan = regnext(scan);
3798                 if (flags & SCF_DO_SUBSTR) {
3799                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800                     data->longest = &(data->longest_float);
3801                 }
3802                 goto optimize_curly_tail;
3803             case CURLY:
3804                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805                     && (scan->flags == stopparen))
3806                 {
3807                     mincount = 1;
3808                     maxcount = 1;
3809                 } else {
3810                     mincount = ARG1(scan);
3811                     maxcount = ARG2(scan);
3812                 }
3813                 next = regnext(scan);
3814                 if (OP(scan) == CURLYX) {
3815                     I32 lp = (data ? *(data->last_closep) : 0);
3816                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3817                 }
3818                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819                 next_is_eval = (OP(scan) == EVAL);
3820               do_curly:
3821                 if (flags & SCF_DO_SUBSTR) {
3822                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823                     pos_before = data->pos_min;
3824                 }
3825                 if (data) {
3826                     fl = data->flags;
3827                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3828                     if (is_inf)
3829                         data->flags |= SF_IS_INF;
3830                 }
3831                 if (flags & SCF_DO_STCLASS) {
3832                     cl_init(pRExC_state, &this_class);
3833                     oclass = data->start_class;
3834                     data->start_class = &this_class;
3835                     f |= SCF_DO_STCLASS_AND;
3836                     f &= ~SCF_DO_STCLASS_OR;
3837                 }
3838                 /* Exclude from super-linear cache processing any {n,m}
3839                    regops for which the combination of input pos and regex
3840                    pos is not enough information to determine if a match
3841                    will be possible.
3842
3843                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844                    regex pos at the \s*, the prospects for a match depend not
3845                    only on the input position but also on how many (bar\s*)
3846                    repeats into the {4,8} we are. */
3847                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848                     f &= ~SCF_WHILEM_VISITED_POS;
3849
3850                 /* This will finish on WHILEM, setting scan, or on NULL: */
3851                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852                                       last, data, stopparen, recursed, NULL,
3853                                       (mincount == 0
3854                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3855
3856                 if (flags & SCF_DO_STCLASS)
3857                     data->start_class = oclass;
3858                 if (mincount == 0 || minnext == 0) {
3859                     if (flags & SCF_DO_STCLASS_OR) {
3860                         cl_or(pRExC_state, data->start_class, &this_class);
3861                     }
3862                     else if (flags & SCF_DO_STCLASS_AND) {
3863                         /* Switch to OR mode: cache the old value of
3864                          * data->start_class */
3865                         INIT_AND_WITHP;
3866                         StructCopy(data->start_class, and_withp,
3867                                    struct regnode_charclass_class);
3868                         flags &= ~SCF_DO_STCLASS_AND;
3869                         StructCopy(&this_class, data->start_class,
3870                                    struct regnode_charclass_class);
3871                         flags |= SCF_DO_STCLASS_OR;
3872                         data->start_class->flags |= ANYOF_EOS;
3873                     }
3874                 } else {                /* Non-zero len */
3875                     if (flags & SCF_DO_STCLASS_OR) {
3876                         cl_or(pRExC_state, data->start_class, &this_class);
3877                         cl_and(data->start_class, and_withp);
3878                     }
3879                     else if (flags & SCF_DO_STCLASS_AND)
3880                         cl_and(data->start_class, &this_class);
3881                     flags &= ~SCF_DO_STCLASS;
3882                 }
3883                 if (!scan)              /* It was not CURLYX, but CURLY. */
3884                     scan = next;
3885                 if ( /* ? quantifier ok, except for (?{ ... }) */
3886                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3887                     && (minnext == 0) && (deltanext == 0)
3888                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3890                 {
3891                     ckWARNreg(RExC_parse,
3892                               "Quantifier unexpected on zero-length expression");
3893                 }
3894
3895                 min += minnext * mincount;
3896                 is_inf_internal |= ((maxcount == REG_INFTY
3897                                      && (minnext + deltanext) > 0)
3898                                     || deltanext == I32_MAX);
3899                 is_inf |= is_inf_internal;
3900                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3901
3902                 /* Try powerful optimization CURLYX => CURLYN. */
3903                 if (  OP(oscan) == CURLYX && data
3904                       && data->flags & SF_IN_PAR
3905                       && !(data->flags & SF_HAS_EVAL)
3906                       && !deltanext && minnext == 1 ) {
3907                     /* Try to optimize to CURLYN.  */
3908                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3909                     regnode * const nxt1 = nxt;
3910 #ifdef DEBUGGING
3911                     regnode *nxt2;
3912 #endif
3913
3914                     /* Skip open. */
3915                     nxt = regnext(nxt);
3916                     if (!REGNODE_SIMPLE(OP(nxt))
3917                         && !(PL_regkind[OP(nxt)] == EXACT
3918                              && STR_LEN(nxt) == 1))
3919                         goto nogo;
3920 #ifdef DEBUGGING
3921                     nxt2 = nxt;
3922 #endif
3923                     nxt = regnext(nxt);
3924                     if (OP(nxt) != CLOSE)
3925                         goto nogo;
3926                     if (RExC_open_parens) {
3927                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3928                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3929                     }
3930                     /* Now we know that nxt2 is the only contents: */
3931                     oscan->flags = (U8)ARG(nxt);
3932                     OP(oscan) = CURLYN;
3933                     OP(nxt1) = NOTHING; /* was OPEN. */
3934
3935 #ifdef DEBUGGING
3936                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3937                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3938                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3939                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3940                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3941                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3942 #endif
3943                 }
3944               nogo:
3945
3946                 /* Try optimization CURLYX => CURLYM. */
3947                 if (  OP(oscan) == CURLYX && data
3948                       && !(data->flags & SF_HAS_PAR)
3949                       && !(data->flags & SF_HAS_EVAL)
3950                       && !deltanext     /* atom is fixed width */
3951                       && minnext != 0   /* CURLYM can't handle zero width */
3952                 ) {
3953                     /* XXXX How to optimize if data == 0? */
3954                     /* Optimize to a simpler form.  */
3955                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3956                     regnode *nxt2;
3957
3958                     OP(oscan) = CURLYM;
3959                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3960                             && (OP(nxt2) != WHILEM))
3961                         nxt = nxt2;
3962                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3963                     /* Need to optimize away parenths. */
3964                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3965                         /* Set the parenth number.  */
3966                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3967
3968                         oscan->flags = (U8)ARG(nxt);
3969                         if (RExC_open_parens) {
3970                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3971                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3972                         }
3973                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3974                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3975
3976 #ifdef DEBUGGING
3977                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3978                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3979                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3980                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3981 #endif
3982 #if 0
3983                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3984                             regnode *nnxt = regnext(nxt1);
3985                             if (nnxt == nxt) {
3986                                 if (reg_off_by_arg[OP(nxt1)])
3987                                     ARG_SET(nxt1, nxt2 - nxt1);
3988                                 else if (nxt2 - nxt1 < U16_MAX)
3989                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3990                                 else
3991                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3992                             }
3993                             nxt1 = nnxt;
3994                         }
3995 #endif
3996                         /* Optimize again: */
3997                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3998                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3999                     }
4000                     else
4001                         oscan->flags = 0;
4002                 }
4003                 else if ((OP(oscan) == CURLYX)
4004                          && (flags & SCF_WHILEM_VISITED_POS)
4005                          /* See the comment on a similar expression above.
4006                             However, this time it's not a subexpression
4007                             we care about, but the expression itself. */
4008                          && (maxcount == REG_INFTY)
4009                          && data && ++data->whilem_c < 16) {
4010                     /* This stays as CURLYX, we can put the count/of pair. */
4011                     /* Find WHILEM (as in regexec.c) */
4012                     regnode *nxt = oscan + NEXT_OFF(oscan);
4013
4014                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4015                         nxt += ARG(nxt);
4016                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4017                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4018                 }
4019                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4020                     pars++;
4021                 if (flags & SCF_DO_SUBSTR) {
4022                     SV *last_str = NULL;
4023                     int counted = mincount != 0;
4024
4025                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4026 #if defined(SPARC64_GCC_WORKAROUND)
4027                         I32 b = 0;
4028                         STRLEN l = 0;
4029                         const char *s = NULL;
4030                         I32 old = 0;
4031
4032                         if (pos_before >= data->last_start_min)
4033                             b = pos_before;
4034                         else
4035                             b = data->last_start_min;
4036
4037                         l = 0;
4038                         s = SvPV_const(data->last_found, l);
4039                         old = b - data->last_start_min;
4040
4041 #else
4042                         I32 b = pos_before >= data->last_start_min
4043                             ? pos_before : data->last_start_min;
4044                         STRLEN l;
4045                         const char * const s = SvPV_const(data->last_found, l);
4046                         I32 old = b - data->last_start_min;
4047 #endif
4048
4049                         if (UTF)
4050                             old = utf8_hop((U8*)s, old) - (U8*)s;
4051                         l -= old;
4052                         /* Get the added string: */
4053                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4054                         if (deltanext == 0 && pos_before == b) {
4055                             /* What was added is a constant string */
4056                             if (mincount > 1) {
4057                                 SvGROW(last_str, (mincount * l) + 1);
4058                                 repeatcpy(SvPVX(last_str) + l,
4059                                           SvPVX_const(last_str), l, mincount - 1);
4060                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4061                                 /* Add additional parts. */
4062                                 SvCUR_set(data->last_found,
4063                                           SvCUR(data->last_found) - l);
4064                                 sv_catsv(data->last_found, last_str);
4065                                 {
4066                                     SV * sv = data->last_found;
4067                                     MAGIC *mg =
4068                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4069                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4070                                     if (mg && mg->mg_len >= 0)
4071                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4072                                 }
4073                                 data->last_end += l * (mincount - 1);
4074                             }
4075                         } else {
4076                             /* start offset must point into the last copy */
4077                             data->last_start_min += minnext * (mincount - 1);
4078                             data->last_start_max += is_inf ? I32_MAX
4079                                 : (maxcount - 1) * (minnext + data->pos_delta);
4080                         }
4081                     }
4082                     /* It is counted once already... */
4083                     data->pos_min += minnext * (mincount - counted);
4084                     data->pos_delta += - counted * deltanext +
4085                         (minnext + deltanext) * maxcount - minnext * mincount;
4086                     if (mincount != maxcount) {
4087                          /* Cannot extend fixed substrings found inside
4088                             the group.  */
4089                         SCAN_COMMIT(pRExC_state,data,minlenp);
4090                         if (mincount && last_str) {
4091                             SV * const sv = data->last_found;
4092                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4093                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4094
4095                             if (mg)
4096                                 mg->mg_len = -1;
4097                             sv_setsv(sv, last_str);
4098                             data->last_end = data->pos_min;
4099                             data->last_start_min =
4100                                 data->pos_min - CHR_SVLEN(last_str);
4101                             data->last_start_max = is_inf
4102                                 ? I32_MAX
4103                                 : data->pos_min + data->pos_delta
4104                                 - CHR_SVLEN(last_str);
4105                         }
4106                         data->longest = &(data->longest_float);
4107                     }
4108                     SvREFCNT_dec(last_str);
4109                 }
4110                 if (data && (fl & SF_HAS_EVAL))
4111                     data->flags |= SF_HAS_EVAL;
4112               optimize_curly_tail:
4113                 if (OP(oscan) != CURLYX) {
4114                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4115                            && NEXT_OFF(next))
4116                         NEXT_OFF(oscan) += NEXT_OFF(next);
4117                 }
4118                 continue;
4119             default:                    /* REF, ANYOFV, and CLUMP only? */
4120                 if (flags & SCF_DO_SUBSTR) {
4121                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4122                     data->longest = &(data->longest_float);
4123                 }
4124                 is_inf = is_inf_internal = 1;
4125                 if (flags & SCF_DO_STCLASS_OR)
4126                     cl_anything(pRExC_state, data->start_class);
4127                 flags &= ~SCF_DO_STCLASS;
4128                 break;
4129             }
4130         }
4131         else if (OP(scan) == LNBREAK) {
4132             if (flags & SCF_DO_STCLASS) {
4133                 int value = 0;
4134                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4135                 if (flags & SCF_DO_STCLASS_AND) {
4136                     for (value = 0; value < 256; value++)
4137                         if (!is_VERTWS_cp(value))
4138                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4139                 }
4140                 else {
4141                     for (value = 0; value < 256; value++)
4142                         if (is_VERTWS_cp(value))
4143                             ANYOF_BITMAP_SET(data->start_class, value);
4144                 }
4145                 if (flags & SCF_DO_STCLASS_OR)
4146                     cl_and(data->start_class, and_withp);
4147                 flags &= ~SCF_DO_STCLASS;
4148             }
4149             min += 1;
4150             delta += 1;
4151             if (flags & SCF_DO_SUBSTR) {
4152                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4153                 data->pos_min += 1;
4154                 data->pos_delta += 1;
4155                 data->longest = &(data->longest_float);
4156             }
4157         }
4158         else if (REGNODE_SIMPLE(OP(scan))) {
4159             int value = 0;
4160
4161             if (flags & SCF_DO_SUBSTR) {
4162                 SCAN_COMMIT(pRExC_state,data,minlenp);
4163                 data->pos_min++;
4164             }
4165             min++;
4166             if (flags & SCF_DO_STCLASS) {
4167                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4168
4169                 /* Some of the logic below assumes that switching
4170                    locale on will only add false positives. */
4171                 switch (PL_regkind[OP(scan)]) {
4172                 case SANY:
4173                 default:
4174                   do_default:
4175                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4176                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4177                         cl_anything(pRExC_state, data->start_class);
4178                     break;
4179                 case REG_ANY:
4180                     if (OP(scan) == SANY)
4181                         goto do_default;
4182                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4183                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4184                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4185                         cl_anything(pRExC_state, data->start_class);
4186                     }
4187                     if (flags & SCF_DO_STCLASS_AND || !value)
4188                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4189                     break;
4190                 case ANYOF:
4191                     if (flags & SCF_DO_STCLASS_AND)
4192                         cl_and(data->start_class,
4193                                (struct regnode_charclass_class*)scan);
4194                     else
4195                         cl_or(pRExC_state, data->start_class,
4196                               (struct regnode_charclass_class*)scan);
4197                     break;
4198                 case ALNUM:
4199                     if (flags & SCF_DO_STCLASS_AND) {
4200                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4201                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4202                             if (OP(scan) == ALNUMU) {
4203                                 for (value = 0; value < 256; value++) {
4204                                     if (!isWORDCHAR_L1(value)) {
4205                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4206                                     }
4207                                 }
4208                             } else {
4209                                 for (value = 0; value < 256; value++) {
4210                                     if (!isALNUM(value)) {
4211                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4212                                     }
4213                                 }
4214                             }
4215                         }
4216                     }
4217                     else {
4218                         if (data->start_class->flags & ANYOF_LOCALE)
4219                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4220
4221                         /* Even if under locale, set the bits for non-locale
4222                          * in case it isn't a true locale-node.  This will
4223                          * create false positives if it truly is locale */
4224                         if (OP(scan) == ALNUMU) {
4225                             for (value = 0; value < 256; value++) {
4226                                 if (isWORDCHAR_L1(value)) {
4227                                     ANYOF_BITMAP_SET(data->start_class, value);
4228                                 }
4229                             }
4230                         } else {
4231                             for (value = 0; value < 256; value++) {
4232                                 if (isALNUM(value)) {
4233                                     ANYOF_BITMAP_SET(data->start_class, value);
4234                                 }
4235                             }
4236                         }
4237                     }
4238                     break;
4239                 case NALNUM:
4240                     if (flags & SCF_DO_STCLASS_AND) {
4241                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4242                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4243                             if (OP(scan) == NALNUMU) {
4244                                 for (value = 0; value < 256; value++) {
4245                                     if (isWORDCHAR_L1(value)) {
4246                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4247                                     }
4248                                 }
4249                             } else {
4250                                 for (value = 0; value < 256; value++) {
4251                                     if (isALNUM(value)) {
4252                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4253                                     }
4254                                 }
4255                             }
4256                         }
4257                     }
4258                     else {
4259                         if (data->start_class->flags & ANYOF_LOCALE)
4260                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4261
4262                         /* Even if under locale, set the bits for non-locale in
4263                          * case it isn't a true locale-node.  This will create
4264                          * false positives if it truly is locale */
4265                         if (OP(scan) == NALNUMU) {
4266                             for (value = 0; value < 256; value++) {
4267                                 if (! isWORDCHAR_L1(value)) {
4268                                     ANYOF_BITMAP_SET(data->start_class, value);
4269                                 }
4270                             }
4271                         } else {
4272                             for (value = 0; value < 256; value++) {
4273                                 if (! isALNUM(value)) {
4274                                     ANYOF_BITMAP_SET(data->start_class, value);
4275                                 }
4276                             }
4277                         }
4278                     }
4279                     break;
4280                 case SPACE:
4281                     if (flags & SCF_DO_STCLASS_AND) {
4282                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4283                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4284                             if (OP(scan) == SPACEU) {
4285                                 for (value = 0; value < 256; value++) {
4286                                     if (!isSPACE_L1(value)) {
4287                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4288                                     }
4289                                 }
4290                             } else {
4291                                 for (value = 0; value < 256; value++) {
4292                                     if (!isSPACE(value)) {
4293                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4294                                     }
4295                                 }
4296                             }
4297                         }
4298                     }
4299                     else {
4300                         if (data->start_class->flags & ANYOF_LOCALE) {
4301                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4302                         }
4303                         if (OP(scan) == SPACEU) {
4304                             for (value = 0; value < 256; value++) {
4305                                 if (isSPACE_L1(value)) {
4306                                     ANYOF_BITMAP_SET(data->start_class, value);
4307                                 }
4308                             }
4309                         } else {
4310                             for (value = 0; value < 256; value++) {
4311                                 if (isSPACE(value)) {
4312                                     ANYOF_BITMAP_SET(data->start_class, value);
4313                                 }
4314                             }
4315                         }
4316                     }
4317                     break;
4318                 case NSPACE:
4319                     if (flags & SCF_DO_STCLASS_AND) {
4320                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4321                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4322                             if (OP(scan) == NSPACEU) {
4323                                 for (value = 0; value < 256; value++) {
4324                                     if (isSPACE_L1(value)) {
4325                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4326                                     }
4327                                 }
4328                             } else {
4329                                 for (value = 0; value < 256; value++) {
4330                                     if (isSPACE(value)) {
4331                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4332                                     }
4333                                 }
4334                             }
4335                         }
4336                     }
4337                     else {
4338                         if (data->start_class->flags & ANYOF_LOCALE)
4339                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4340                         if (OP(scan) == NSPACEU) {
4341                             for (value = 0; value < 256; value++) {
4342                                 if (!isSPACE_L1(value)) {
4343                                     ANYOF_BITMAP_SET(data->start_class, value);
4344                                 }
4345                             }
4346                         }
4347                         else {
4348                             for (value = 0; value < 256; value++) {
4349                                 if (!isSPACE(value)) {
4350                                     ANYOF_BITMAP_SET(data->start_class, value);
4351                                 }
4352                             }
4353                         }
4354                     }
4355                     break;
4356                 case DIGIT:
4357                     if (flags & SCF_DO_STCLASS_AND) {
4358                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4359                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4360                             for (value = 0; value < 256; value++)
4361                                 if (!isDIGIT(value))
4362                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4363                         }
4364                     }
4365                     else {
4366                         if (data->start_class->flags & ANYOF_LOCALE)
4367                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4368                         for (value = 0; value < 256; value++)
4369                             if (isDIGIT(value))
4370                                 ANYOF_BITMAP_SET(data->start_class, value);
4371                     }
4372                     break;
4373                 case NDIGIT:
4374                     if (flags & SCF_DO_STCLASS_AND) {
4375                         if (!(data->start_class->flags & ANYOF_LOCALE))
4376                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4377                         for (value = 0; value < 256; value++)
4378                             if (isDIGIT(value))
4379                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4380                     }
4381                     else {
4382                         if (data->start_class->flags & ANYOF_LOCALE)
4383                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4384                         for (value = 0; value < 256; value++)
4385                             if (!isDIGIT(value))
4386                                 ANYOF_BITMAP_SET(data->start_class, value);
4387                     }
4388                     break;
4389                 CASE_SYNST_FNC(VERTWS);
4390                 CASE_SYNST_FNC(HORIZWS);
4391
4392                 }
4393                 if (flags & SCF_DO_STCLASS_OR)
4394                     cl_and(data->start_class, and_withp);
4395                 flags &= ~SCF_DO_STCLASS;
4396             }
4397         }
4398         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4399             data->flags |= (OP(scan) == MEOL
4400                             ? SF_BEFORE_MEOL
4401                             : SF_BEFORE_SEOL);
4402         }
4403         else if (  PL_regkind[OP(scan)] == BRANCHJ
4404                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4405                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4406                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4407             if ( OP(scan) == UNLESSM &&
4408                  scan->flags == 0 &&
4409                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4410                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4411             ) {
4412                 regnode *opt;
4413                 regnode *upto= regnext(scan);
4414                 DEBUG_PARSE_r({
4415                     SV * const mysv_val=sv_newmortal();
4416                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4417
4418                     /*DEBUG_PARSE_MSG("opfail");*/
4419                     regprop(RExC_rx, mysv_val, upto);
4420                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4421                                   SvPV_nolen_const(mysv_val),
4422                                   (IV)REG_NODE_NUM(upto),
4423                                   (IV)(upto - scan)
4424                     );
4425                 });
4426                 OP(scan) = OPFAIL;
4427                 NEXT_OFF(scan) = upto - scan;
4428                 for (opt= scan + 1; opt < upto ; opt++)
4429                     OP(opt) = OPTIMIZED;
4430                 scan= upto;
4431                 continue;
4432             }
4433             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4434                 || OP(scan) == UNLESSM )
4435             {
4436                 /* Negative Lookahead/lookbehind
4437                    In this case we can't do fixed string optimisation.
4438                 */
4439
4440                 I32 deltanext, minnext, fake = 0;
4441                 regnode *nscan;
4442                 struct regnode_charclass_class intrnl;
4443                 int f = 0;
4444
4445                 data_fake.flags = 0;
4446                 if (data) {
4447                     data_fake.whilem_c = data->whilem_c;
4448                     data_fake.last_closep = data->last_closep;
4449                 }
4450                 else
4451                     data_fake.last_closep = &fake;
4452                 data_fake.pos_delta = delta;
4453                 if ( flags & SCF_DO_STCLASS && !scan->flags
4454                      && OP(scan) == IFMATCH ) { /* Lookahead */
4455                     cl_init(pRExC_state, &intrnl);
4456                     data_fake.start_class = &intrnl;
4457                     f |= SCF_DO_STCLASS_AND;
4458                 }
4459                 if (flags & SCF_WHILEM_VISITED_POS)
4460                     f |= SCF_WHILEM_VISITED_POS;
4461                 next = regnext(scan);
4462                 nscan = NEXTOPER(NEXTOPER(scan));
4463                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4464                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4465                 if (scan->flags) {
4466                     if (deltanext) {
4467                         FAIL("Variable length lookbehind not implemented");
4468                     }
4469                     else if (minnext > (I32)U8_MAX) {
4470                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4471                     }
4472                     scan->flags = (U8)minnext;
4473                 }
4474                 if (data) {
4475                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4476                         pars++;
4477                     if (data_fake.flags & SF_HAS_EVAL)
4478                         data->flags |= SF_HAS_EVAL;
4479                     data->whilem_c = data_fake.whilem_c;
4480                 }
4481                 if (f & SCF_DO_STCLASS_AND) {
4482                     if (flags & SCF_DO_STCLASS_OR) {
4483                         /* OR before, AND after: ideally we would recurse with
4484                          * data_fake to get the AND applied by study of the
4485                          * remainder of the pattern, and then derecurse;
4486                          * *** HACK *** for now just treat as "no information".
4487                          * See [perl #56690].
4488                          */
4489                         cl_init(pRExC_state, data->start_class);
4490                     }  else {
4491                         /* AND before and after: combine and continue */
4492                         const int was = (data->start_class->flags & ANYOF_EOS);
4493
4494                         cl_and(data->start_class, &intrnl);
4495                         if (was)
4496                             data->start_class->flags |= ANYOF_EOS;
4497                     }
4498                 }
4499             }
4500 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4501             else {
4502                 /* Positive Lookahead/lookbehind
4503                    In this case we can do fixed string optimisation,
4504                    but we must be careful about it. Note in the case of
4505                    lookbehind the positions will be offset by the minimum
4506                    length of the pattern, something we won't know about
4507                    until after the recurse.
4508                 */
4509                 I32 deltanext, fake = 0;
4510                 regnode *nscan;
4511                 struct regnode_charclass_class intrnl;
4512                 int f = 0;
4513                 /* We use SAVEFREEPV so that when the full compile
4514                     is finished perl will clean up the allocated
4515                     minlens when it's all done. This way we don't
4516                     have to worry about freeing them when we know
4517                     they wont be used, which would be a pain.
4518                  */
4519                 I32 *minnextp;
4520                 Newx( minnextp, 1, I32 );
4521                 SAVEFREEPV(minnextp);
4522
4523                 if (data) {
4524                     StructCopy(data, &data_fake, scan_data_t);
4525                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4526                         f |= SCF_DO_SUBSTR;
4527                         if (scan->flags)
4528                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4529                         data_fake.last_found=newSVsv(data->last_found);
4530                     }
4531                 }
4532                 else
4533                     data_fake.last_closep = &fake;
4534                 data_fake.flags = 0;
4535                 data_fake.pos_delta = delta;
4536                 if (is_inf)
4537                     data_fake.flags |= SF_IS_INF;
4538                 if ( flags & SCF_DO_STCLASS && !scan->flags
4539                      && OP(scan) == IFMATCH ) { /* Lookahead */
4540                     cl_init(pRExC_state, &intrnl);
4541                     data_fake.start_class = &intrnl;
4542                     f |= SCF_DO_STCLASS_AND;
4543                 }
4544                 if (flags & SCF_WHILEM_VISITED_POS)
4545                     f |= SCF_WHILEM_VISITED_POS;
4546                 next = regnext(scan);
4547                 nscan = NEXTOPER(NEXTOPER(scan));
4548
4549                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4550                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4551                 if (scan->flags) {
4552                     if (deltanext) {
4553                         FAIL("Variable length lookbehind not implemented");
4554                     }
4555                     else if (*minnextp > (I32)U8_MAX) {
4556                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4557                     }
4558                     scan->flags = (U8)*minnextp;
4559                 }
4560
4561                 *minnextp += min;
4562
4563                 if (f & SCF_DO_STCLASS_AND) {
4564                     const int was = (data->start_class->flags & ANYOF_EOS);
4565
4566                     cl_and(data->start_class, &intrnl);
4567                     if (was)
4568                         data->start_class->flags |= ANYOF_EOS;
4569                 }
4570                 if (data) {
4571                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4572                         pars++;
4573                     if (data_fake.flags & SF_HAS_EVAL)
4574                         data->flags |= SF_HAS_EVAL;
4575                     data->whilem_c = data_fake.whilem_c;
4576                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4577                         if (RExC_rx->minlen<*minnextp)
4578                             RExC_rx->minlen=*minnextp;
4579                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4580                         SvREFCNT_dec(data_fake.last_found);
4581
4582                         if ( data_fake.minlen_fixed != minlenp )
4583                         {
4584                             data->offset_fixed= data_fake.offset_fixed;
4585                             data->minlen_fixed= data_fake.minlen_fixed;
4586                             data->lookbehind_fixed+= scan->flags;
4587                         }
4588                         if ( data_fake.minlen_float != minlenp )
4589                         {
4590                             data->minlen_float= data_fake.minlen_float;
4591                             data->offset_float_min=data_fake.offset_float_min;
4592                             data->offset_float_max=data_fake.offset_float_max;
4593                             data->lookbehind_float+= scan->flags;
4594                         }
4595                     }
4596                 }
4597             }
4598 #endif
4599         }
4600         else if (OP(scan) == OPEN) {
4601             if (stopparen != (I32)ARG(scan))
4602                 pars++;
4603         }
4604         else if (OP(scan) == CLOSE) {
4605             if (stopparen == (I32)ARG(scan)) {
4606                 break;
4607             }
4608             if ((I32)ARG(scan) == is_par) {
4609                 next = regnext(scan);
4610
4611                 if ( next && (OP(next) != WHILEM) && next < last)
4612                     is_par = 0;         /* Disable optimization */
4613             }
4614             if (data)
4615                 *(data->last_closep) = ARG(scan);
4616         }
4617         else if (OP(scan) == EVAL) {
4618                 if (data)
4619                     data->flags |= SF_HAS_EVAL;
4620         }
4621         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4622             if (flags & SCF_DO_SUBSTR) {
4623                 SCAN_COMMIT(pRExC_state,data,minlenp);
4624                 flags &= ~SCF_DO_SUBSTR;
4625             }
4626             if (data && OP(scan)==ACCEPT) {
4627                 data->flags |= SCF_SEEN_ACCEPT;
4628                 if (stopmin > min)
4629                     stopmin = min;
4630             }
4631         }
4632         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4633         {
4634                 if (flags & SCF_DO_SUBSTR) {
4635                     SCAN_COMMIT(pRExC_state,data,minlenp);
4636                     data->longest = &(data->longest_float);
4637                 }
4638                 is_inf = is_inf_internal = 1;
4639                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4640                     cl_anything(pRExC_state, data->start_class);
4641                 flags &= ~SCF_DO_STCLASS;
4642         }
4643         else if (OP(scan) == GPOS) {
4644             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4645                 !(delta || is_inf || (data && data->pos_delta)))
4646             {
4647                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4648                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4649                 if (RExC_rx->gofs < (U32)min)
4650                     RExC_rx->gofs = min;
4651             } else {
4652                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4653                 RExC_rx->gofs = 0;
4654             }
4655         }
4656 #ifdef TRIE_STUDY_OPT
4657 #ifdef FULL_TRIE_STUDY
4658         else if (PL_regkind[OP(scan)] == TRIE) {
4659             /* NOTE - There is similar code to this block above for handling
4660                BRANCH nodes on the initial study.  If you change stuff here
4661                check there too. */
4662             regnode *trie_node= scan;
4663             regnode *tail= regnext(scan);
4664             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4665             I32 max1 = 0, min1 = I32_MAX;
4666             struct regnode_charclass_class accum;
4667
4668             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4669                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4670             if (flags & SCF_DO_STCLASS)
4671                 cl_init_zero(pRExC_state, &accum);
4672
4673             if (!trie->jump) {
4674                 min1= trie->minlen;
4675                 max1= trie->maxlen;
4676             } else {
4677                 const regnode *nextbranch= NULL;
4678                 U32 word;
4679
4680                 for ( word=1 ; word <= trie->wordcount ; word++)
4681                 {
4682                     I32 deltanext=0, minnext=0, f = 0, fake;
4683                     struct regnode_charclass_class this_class;
4684
4685                     data_fake.flags = 0;
4686                     if (data) {
4687                         data_fake.whilem_c = data->whilem_c;
4688                         data_fake.last_closep = data->last_closep;
4689                     }
4690                     else
4691                         data_fake.last_closep = &fake;
4692                     data_fake.pos_delta = delta;
4693                     if (flags & SCF_DO_STCLASS) {
4694                         cl_init(pRExC_state, &this_class);
4695                         data_fake.start_class = &this_class;
4696                         f = SCF_DO_STCLASS_AND;
4697                     }
4698                     if (flags & SCF_WHILEM_VISITED_POS)
4699                         f |= SCF_WHILEM_VISITED_POS;
4700
4701                     if (trie->jump[word]) {
4702                         if (!nextbranch)
4703                             nextbranch = trie_node + trie->jump[0];
4704                         scan= trie_node + trie->jump[word];
4705                         /* We go from the jump point to the branch that follows
4706                            it. Note this means we need the vestigal unused branches
4707                            even though they arent otherwise used.
4708                          */
4709                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4710                             &deltanext, (regnode *)nextbranch, &data_fake,
4711                             stopparen, recursed, NULL, f,depth+1);
4712                     }
4713                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4714                         nextbranch= regnext((regnode*)nextbranch);
4715
4716                     if (min1 > (I32)(minnext + trie->minlen))
4717                         min1 = minnext + trie->minlen;
4718                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4719                         max1 = minnext + deltanext + trie->maxlen;
4720                     if (deltanext == I32_MAX)
4721                         is_inf = is_inf_internal = 1;
4722
4723                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4724                         pars++;
4725                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4726                         if ( stopmin > min + min1)
4727                             stopmin = min + min1;
4728                         flags &= ~SCF_DO_SUBSTR;
4729                         if (data)
4730                             data->flags |= SCF_SEEN_ACCEPT;
4731                     }
4732                     if (data) {
4733                         if (data_fake.flags & SF_HAS_EVAL)
4734                             data->flags |= SF_HAS_EVAL;
4735                         data->whilem_c = data_fake.whilem_c;
4736                     }
4737                     if (flags & SCF_DO_STCLASS)
4738                         cl_or(pRExC_state, &accum, &this_class);
4739                 }
4740             }
4741             if (flags & SCF_DO_SUBSTR) {
4742                 data->pos_min += min1;
4743                 data->pos_delta += max1 - min1;
4744                 if (max1 != min1 || is_inf)
4745                     data->longest = &(data->longest_float);
4746             }
4747             min += min1;
4748             delta += max1 - min1;
4749             if (flags & SCF_DO_STCLASS_OR) {
4750                 cl_or(pRExC_state, data->start_class, &accum);
4751                 if (min1) {
4752                     cl_and(data->start_class, and_withp);
4753                     flags &= ~SCF_DO_STCLASS;
4754                 }
4755             }
4756             else if (flags & SCF_DO_STCLASS_AND) {
4757                 if (min1) {
4758                     cl_and(data->start_class, &accum);
4759                     flags &= ~SCF_DO_STCLASS;
4760                 }
4761                 else {
4762                     /* Switch to OR mode: cache the old value of
4763                      * data->start_class */
4764                     INIT_AND_WITHP;
4765                     StructCopy(data->start_class, and_withp,
4766                                struct regnode_charclass_class);
4767                     flags &= ~SCF_DO_STCLASS_AND;
4768                     StructCopy(&accum, data->start_class,
4769                                struct regnode_charclass_class);
4770                     flags |= SCF_DO_STCLASS_OR;
4771                     data->start_class->flags |= ANYOF_EOS;
4772                 }
4773             }
4774             scan= tail;
4775             continue;
4776         }
4777 #else
4778         else if (PL_regkind[OP(scan)] == TRIE) {
4779             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4780             U8*bang=NULL;
4781
4782             min += trie->minlen;
4783             delta += (trie->maxlen - trie->minlen);
4784             flags &= ~SCF_DO_STCLASS; /* xxx */
4785             if (flags & SCF_DO_SUBSTR) {
4786                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4787                 data->pos_min += trie->minlen;
4788                 data->pos_delta += (trie->maxlen - trie->minlen);
4789                 if (trie->maxlen != trie->minlen)
4790                     data->longest = &(data->longest_float);
4791             }
4792             if (trie->jump) /* no more substrings -- for now /grr*/
4793                 flags &= ~SCF_DO_SUBSTR;
4794         }
4795 #endif /* old or new */
4796 #endif /* TRIE_STUDY_OPT */
4797
4798         /* Else: zero-length, ignore. */
4799         scan = regnext(scan);
4800     }
4801     if (frame) {
4802         last = frame->last;
4803         scan = frame->next;
4804         stopparen = frame->stop;
4805         frame = frame->prev;
4806         goto fake_study_recurse;
4807     }
4808
4809   finish:
4810     assert(!frame);
4811     DEBUG_STUDYDATA("pre-fin:",data,depth);
4812
4813     *scanp = scan;
4814     *deltap = is_inf_internal ? I32_MAX : delta;
4815     if (flags & SCF_DO_SUBSTR && is_inf)
4816         data->pos_delta = I32_MAX - data->pos_min;
4817     if (is_par > (I32)U8_MAX)
4818         is_par = 0;
4819     if (is_par && pars==1 && data) {
4820         data->flags |= SF_IN_PAR;
4821         data->flags &= ~SF_HAS_PAR;
4822     }
4823     else if (pars && data) {
4824         data->flags |= SF_HAS_PAR;
4825         data->flags &= ~SF_IN_PAR;
4826     }
4827     if (flags & SCF_DO_STCLASS_OR)
4828         cl_and(data->start_class, and_withp);
4829     if (flags & SCF_TRIE_RESTUDY)
4830         data->flags |=  SCF_TRIE_RESTUDY;
4831
4832     DEBUG_STUDYDATA("post-fin:",data,depth);
4833
4834     return min < stopmin ? min : stopmin;
4835 }
4836
4837 STATIC U32
4838 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4839 {
4840     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4841
4842     PERL_ARGS_ASSERT_ADD_DATA;
4843
4844     Renewc(RExC_rxi->data,
4845            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4846            char, struct reg_data);
4847     if(count)
4848         Renew(RExC_rxi->data->what, count + n, U8);
4849     else
4850         Newx(RExC_rxi->data->what, n, U8);
4851     RExC_rxi->data->count = count + n;
4852     Copy(s, RExC_rxi->data->what + count, n, U8);
4853     return count;
4854 }
4855
4856 /*XXX: todo make this not included in a non debugging perl */
4857 #ifndef PERL_IN_XSUB_RE
4858 void
4859 Perl_reginitcolors(pTHX)
4860 {
4861     dVAR;
4862     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4863     if (s) {
4864         char *t = savepv(s);
4865         int i = 0;
4866         PL_colors[0] = t;
4867         while (++i < 6) {
4868             t = strchr(t, '\t');
4869             if (t) {
4870                 *t = '\0';
4871                 PL_colors[i] = ++t;
4872             }
4873             else
4874                 PL_colors[i] = t = (char *)"";
4875         }
4876     } else {
4877         int i = 0;
4878         while (i < 6)
4879             PL_colors[i++] = (char *)"";
4880     }
4881     PL_colorset = 1;
4882 }
4883 #endif
4884
4885
4886 #ifdef TRIE_STUDY_OPT
4887 #define CHECK_RESTUDY_GOTO                                  \
4888         if (                                                \
4889               (data.flags & SCF_TRIE_RESTUDY)               \
4890               && ! restudied++                              \
4891         )     goto reStudy
4892 #else
4893 #define CHECK_RESTUDY_GOTO
4894 #endif
4895
4896 /*
4897  * pregcomp - compile a regular expression into internal code
4898  *
4899  * Decides which engine's compiler to call based on the hint currently in
4900  * scope
4901  */
4902
4903 #ifndef PERL_IN_XSUB_RE
4904
4905 /* return the currently in-scope regex engine (or the default if none)  */
4906
4907 regexp_engine const *
4908 Perl_current_re_engine(pTHX)
4909 {
4910     dVAR;
4911
4912     if (IN_PERL_COMPILETIME) {
4913         HV * const table = GvHV(PL_hintgv);
4914         SV **ptr;
4915
4916         if (!table)
4917             return &PL_core_reg_engine;
4918         ptr = hv_fetchs(table, "regcomp", FALSE);
4919         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4920             return &PL_core_reg_engine;
4921         return INT2PTR(regexp_engine*,SvIV(*ptr));
4922     }
4923     else {
4924         SV *ptr;
4925         if (!PL_curcop->cop_hints_hash)
4926             return &PL_core_reg_engine;
4927         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4928         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4929             return &PL_core_reg_engine;
4930         return INT2PTR(regexp_engine*,SvIV(ptr));
4931     }
4932 }
4933
4934
4935 REGEXP *
4936 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4937 {
4938     dVAR;
4939     regexp_engine const *eng = current_re_engine();
4940     GET_RE_DEBUG_FLAGS_DECL;
4941
4942     PERL_ARGS_ASSERT_PREGCOMP;
4943
4944     /* Dispatch a request to compile a regexp to correct regexp engine. */
4945     DEBUG_COMPILE_r({
4946         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4947                         PTR2UV(eng));
4948     });
4949     return CALLREGCOMP_ENG(eng, pattern, flags);
4950 }
4951 #endif
4952
4953 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4954  * pattern rather than a list of OPs */
4955
4956 REGEXP *
4957 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4958 {
4959     SV *pat = pattern; /* defeat constness! */
4960     PERL_ARGS_ASSERT_RE_COMPILE;
4961     return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4962                                 NULL, NULL, rx_flags, 0);
4963 }
4964
4965
4966 /*
4967  * Perl_re_op_compile - the perl internal RE engine's function to compile a
4968  * regular expression into internal code.
4969  * The pattern may be passed either as:
4970  *    a list of SVs (patternp plus pat_count)
4971  *    a list of OPs (expr)
4972  * If both are passed, the SV list is used, but the OP list indicates
4973  * which SVs are actually pre-compiled code blocks
4974  *
4975  * The SVs in the list have magic and qr overloading applied to them (and
4976  * the list may be modified in-place with replacement SVs in the latter
4977  * case).
4978  *
4979  * If the pattern hasn't changed from old_re, then old_re will be
4980  * returned.
4981  *
4982  * eng is the current engine. If that engine has an op_comp method, then
4983  * handle directly (i.e. we assume that op_comp was us); otherwise, just
4984  * do the initial concatenation of arguments and pass on to the external
4985  * engine.
4986  *
4987  * If is_bare_re is not null, set it to a boolean indicating whether the
4988  * arg list reduced (after overloading) to a single bare regex which has
4989  * been returned (i.e. /$qr/).
4990  *
4991  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
4992  *
4993  * pm_flags contains the PMf_* flags from the calling PMOP. Currently
4994  * we're only interested in PMf_HAS_CV and PMf_IS_QR.
4995  *
4996  * We can't allocate space until we know how big the compiled form will be,
4997  * but we can't compile it (and thus know how big it is) until we've got a
4998  * place to put the code.  So we cheat:  we compile it twice, once with code
4999  * generation turned off and size counting turned on, and once "for real".
5000  * This also means that we don't allocate space until we are sure that the
5001  * thing really will compile successfully, and we never have to move the
5002  * code and thus invalidate pointers into it.  (Note that it has to be in
5003  * one piece because free() must be able to free it all.) [NB: not true in perl]
5004  *
5005  * Beware that the optimization-preparation code in here knows about some
5006  * of the structure of the compiled regexp.  [I'll say.]
5007  */
5008
5009 REGEXP *
5010 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5011                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5012                      int *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5013 {
5014     dVAR;
5015     REGEXP *rx;
5016     struct regexp *r;
5017     register regexp_internal *ri;
5018     STRLEN plen;
5019     char  * VOL exp;
5020     char* xend;
5021     regnode *scan;
5022     I32 flags;
5023     I32 minlen = 0;
5024     U32 rx_flags;
5025     SV * VOL pat;
5026
5027     /* these are all flags - maybe they should be turned
5028      * into a single int with different bit masks */
5029     I32 sawlookahead = 0;
5030     I32 sawplus = 0;
5031     I32 sawopen = 0;
5032     bool used_setjump = FALSE;
5033     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5034     bool code_is_utf8 = 0;
5035     bool recompile = 0;
5036     U8 jump_ret = 0;
5037     dJMPENV;
5038     scan_data_t data;
5039     RExC_state_t RExC_state;
5040     RExC_state_t * const pRExC_state = &RExC_state;
5041 #ifdef TRIE_STUDY_OPT
5042     int restudied;
5043     RExC_state_t copyRExC_state;
5044 #endif
5045     GET_RE_DEBUG_FLAGS_DECL;
5046
5047     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5048
5049     DEBUG_r(if (!PL_colorset) reginitcolors());
5050
5051 #ifndef PERL_IN_XSUB_RE
5052     /* Initialize these here instead of as-needed, as is quick and avoids
5053      * having to test them each time otherwise */
5054     if (! PL_AboveLatin1) {
5055         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5056         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5057         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5058
5059         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5060         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5061
5062         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5063         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5064
5065         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5066         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5067
5068         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5069
5070         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5071         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5072
5073         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5074
5075         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5076         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5077
5078         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5079         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5080
5081         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5082         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5083
5084         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5085         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5086
5087         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5088         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5089
5090         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5091         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5092
5093         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5094         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5095
5096         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5097         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5098
5099         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5100
5101         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5102         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5103
5104         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5105         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5106     }
5107 #endif
5108
5109     pRExC_state->code_blocks = NULL;
5110     pRExC_state->num_code_blocks = 0;
5111
5112     if (is_bare_re)
5113         *is_bare_re = 0;
5114
5115     if (expr && (expr->op_type == OP_LIST ||
5116                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5117
5118         /* is the source UTF8, and how many code blocks are there? */
5119         OP *o;
5120         int ncode = 0;
5121
5122         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5123             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5124                 code_is_utf8 = 1;
5125             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5126                 /* count of DO blocks */
5127                 ncode++;
5128         }
5129         if (ncode) {
5130             pRExC_state->num_code_blocks = ncode;
5131             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5132         }
5133     }
5134
5135     if (pat_count) {
5136         /* handle a list of SVs */
5137
5138         SV **svp;
5139
5140         /* apply magic and RE overloading to each arg */
5141         for (svp = patternp; svp < patternp + pat_count; svp++) {
5142             SV *rx = *svp;
5143             SvGETMAGIC(rx);
5144             if (SvROK(rx) && SvAMAGIC(rx)) {
5145                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5146                 if (sv) {
5147                     if (SvROK(sv))
5148                         sv = SvRV(sv);
5149                     if (SvTYPE(sv) != SVt_REGEXP)
5150                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5151                     *svp = sv;
5152                 }
5153             }
5154         }
5155
5156         if (pat_count > 1) {
5157             /* concat multiple args and find any code block indexes */
5158
5159             OP *o = NULL;
5160             int n = 0;
5161             bool utf8 = 0;
5162
5163             if (pRExC_state->num_code_blocks) {
5164                 o = cLISTOPx(expr)->op_first;
5165                 assert(o->op_type == OP_PUSHMARK);
5166                 o = o->op_sibling;
5167             }
5168
5169             pat = newSVpvn("", 0);
5170             SAVEFREESV(pat);
5171
5172             /* determine if the pattern is going to be utf8 (needed
5173              * in advance to align code block indices correctly).
5174              * XXX This could fail to be detected for an arg with
5175              * overloading but not concat overloading; but the main effect
5176              * in this obscure case is to need a 'use re eval' for a
5177              * literal code block */
5178             for (svp = patternp; svp < patternp + pat_count; svp++) {
5179                 if (SvUTF8(*svp))
5180                     utf8 = 1;
5181             }
5182             if (utf8)
5183                 SvUTF8_on(pat);
5184
5185             for (svp = patternp; svp < patternp + pat_count; svp++) {
5186                 SV *sv, *msv = *svp;
5187                 SV *rx;
5188                 bool code = 0;
5189                 if (o) {
5190                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5191                         assert(n < pRExC_state->num_code_blocks);
5192                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5193                         pRExC_state->code_blocks[n].block = o;
5194                         pRExC_state->code_blocks[n].src_regex = NULL;
5195                         n++;
5196                         code = 1;
5197                         o = o->op_sibling; /* skip CONST */
5198                         assert(o);
5199                     }
5200                     o = o->op_sibling;;
5201                 }
5202
5203                 /* extract any code blocks within any embedded qr//'s */
5204                 rx = msv;
5205                 if (SvROK(rx))
5206                     rx = SvRV(rx);
5207                 if (SvTYPE(rx) == SVt_REGEXP
5208                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5209                 {
5210
5211                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5212                     if (ri->num_code_blocks) {
5213                         int i;
5214                         /* the presence of an embedded qr// with code means
5215                          * we should always recompile: the text of the
5216                          * qr// may not have changed, but it may be a
5217                          * different closure than last time */
5218                         recompile = 1;
5219                         Renew(pRExC_state->code_blocks,
5220                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5221                             struct reg_code_block);
5222                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5223                         for (i=0; i < ri->num_code_blocks; i++) {
5224                             struct reg_code_block *src, *dst;
5225                             STRLEN offset =  SvCUR(pat)
5226                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5227                             assert(n < pRExC_state->num_code_blocks);
5228                             src = &ri->code_blocks[i];
5229                             dst = &pRExC_state->code_blocks[n];
5230                             dst->start      = src->start + offset;
5231                             dst->end        = src->end   + offset;
5232                             dst->block      = src->block;
5233                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5234                                                     src->src_regex
5235                                                         ? src->src_regex
5236                                                         : (REGEXP*)rx);
5237                             n++;
5238                         }
5239                     }
5240                 }
5241
5242                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5243                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5244                 {
5245                     sv_setsv(pat, sv);
5246                     /* overloading involved: all bets are off over literal
5247                      * code. Pretend we haven't seen it */
5248                     pRExC_state->num_code_blocks -= n;
5249                     n = 0;
5250
5251                 }
5252                 else {
5253                     sv_catsv_nomg(pat, msv);
5254                     if (code)
5255                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5256                 }
5257             }
5258             SvSETMAGIC(pat);
5259         }
5260         else
5261             pat = *patternp;
5262
5263         /* handle bare regex: foo =~ $re */
5264         {
5265             SV *re = pat;
5266             if (SvROK(re))
5267                 re = SvRV(re);
5268             if (SvTYPE(re) == SVt_REGEXP) {
5269                 if (is_bare_re)
5270                     *is_bare_re = 1;
5271                 SvREFCNT_inc(re);
5272                 Safefree(pRExC_state->code_blocks);
5273                 return (REGEXP*)re;
5274             }
5275         }
5276     }
5277     else {
5278         /* not a list of SVs, so must be a list of OPs */
5279         assert(expr);
5280         if (expr->op_type == OP_LIST) {
5281             int i = -1;
5282             bool is_code = 0;
5283             OP *o;
5284
5285             pat = newSVpvn("", 0);
5286             SAVEFREESV(pat);
5287             if (code_is_utf8)
5288                 SvUTF8_on(pat);
5289
5290             /* given a list of CONSTs and DO blocks in expr, append all
5291              * the CONSTs to pat, and record the start and end of each
5292              * code block in code_blocks[] (each DO{} op is followed by an
5293              * OP_CONST containing the corresponding literal '(?{...})
5294              * text)
5295              */
5296             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297                 if (o->op_type == OP_CONST) {
5298                     sv_catsv(pat, cSVOPo_sv);
5299                     if (is_code) {
5300                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5301                         is_code = 0;
5302                     }
5303                 }
5304                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5305                     assert(i+1 < pRExC_state->num_code_blocks);
5306                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5307                     pRExC_state->code_blocks[i].block = o;
5308                     pRExC_state->code_blocks[i].src_regex = NULL;
5309                     is_code = 1;
5310                 }
5311             }
5312         }
5313         else {
5314             assert(expr->op_type == OP_CONST);
5315             pat = cSVOPx_sv(expr);
5316         }
5317     }
5318
5319     exp = SvPV_nomg(pat, plen);
5320
5321     if (!eng->op_comp) {
5322         if ((SvUTF8(pat) && IN_BYTES)
5323                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5324         {
5325             /* make a temporary copy; either to convert to bytes,
5326              * or to avoid repeating get-magic / overloaded stringify */
5327             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5328                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5329         }
5330         Safefree(pRExC_state->code_blocks);
5331         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5332     }
5333
5334     /* ignore the utf8ness if the pattern is 0 length */
5335     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5336     RExC_uni_semantics = 0;
5337     RExC_contains_locale = 0;
5338
5339     /****************** LONG JUMP TARGET HERE***********************/
5340     /* Longjmp back to here if have to switch in midstream to utf8 */
5341     if (! RExC_orig_utf8) {
5342         JMPENV_PUSH(jump_ret);
5343         used_setjump = TRUE;
5344     }
5345
5346     if (jump_ret == 0) {    /* First time through */
5347         xend = exp + plen;
5348
5349         DEBUG_COMPILE_r({
5350             SV *dsv= sv_newmortal();
5351             RE_PV_QUOTED_DECL(s, RExC_utf8,
5352                 dsv, exp, plen, 60);
5353             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5354                            PL_colors[4],PL_colors[5],s);
5355         });
5356     }
5357     else {  /* longjumped back */
5358         U8 *src, *dst;
5359         int n=0;
5360         STRLEN s = 0, d = 0;
5361         bool do_end = 0;
5362
5363         /* If the cause for the longjmp was other than changing to utf8, pop
5364          * our own setjmp, and longjmp to the correct handler */
5365         if (jump_ret != UTF8_LONGJMP) {
5366             JMPENV_POP;
5367             JMPENV_JUMP(jump_ret);
5368         }
5369
5370         GET_RE_DEBUG_FLAGS;
5371
5372         /* It's possible to write a regexp in ascii that represents Unicode
5373         codepoints outside of the byte range, such as via \x{100}. If we
5374         detect such a sequence we have to convert the entire pattern to utf8
5375         and then recompile, as our sizing calculation will have been based
5376         on 1 byte == 1 character, but we will need to use utf8 to encode
5377         at least some part of the pattern, and therefore must convert the whole
5378         thing.
5379         -- dmq */
5380         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5381             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5382
5383         /* upgrade pattern to UTF8, and if there are code blocks,
5384          * recalculate the indices.
5385          * This is essentially an unrolled Perl_bytes_to_utf8() */
5386
5387         src = (U8*)SvPV_nomg(pat, plen);
5388         Newx(dst, plen * 2 + 1, U8);
5389
5390         while (s < plen) {
5391             const UV uv = NATIVE_TO_ASCII(src[s]);
5392             if (UNI_IS_INVARIANT(uv))
5393                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5394             else {
5395                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5396                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5397             }
5398             if (n < pRExC_state->num_code_blocks) {
5399                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5400                     pRExC_state->code_blocks[n].start = d;
5401                     assert(dst[d] == '(');
5402                     do_end = 1;
5403                 }
5404                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5405                     pRExC_state->code_blocks[n].end = d;
5406                     assert(dst[d] == ')');
5407                     do_end = 0;
5408                     n++;
5409                 }
5410             }
5411             s++;
5412             d++;
5413         }
5414         dst[d] = '\0';
5415         plen = d;
5416         exp = (char*) dst;
5417         xend = exp + plen;
5418         SAVEFREEPV(exp);
5419         RExC_orig_utf8 = RExC_utf8 = 1;
5420     }
5421
5422     /* return old regex if pattern hasn't changed */
5423
5424     if (   old_re
5425         && !recompile
5426         && !!RX_UTF8(old_re) == !!RExC_utf8
5427         && RX_PRECOMP(old_re)
5428         && RX_PRELEN(old_re) == plen
5429         && memEQ(RX_PRECOMP(old_re), exp, plen))
5430     {
5431         /* see if there are any run-time code blocks */
5432         int n = 0;
5433         STRLEN s;
5434         bool runtime = 0;
5435         for (s = 0; s < plen; s++) {
5436             if (n < pRExC_state->num_code_blocks
5437                 && s == pRExC_state->code_blocks[n].start)
5438             {
5439                 s = pRExC_state->code_blocks[n].end;
5440                 n++;
5441                 continue;
5442             }
5443             if (exp[s] == '(' && exp[s+1] == '?' &&
5444                 (exp[s+2] == '{' || (exp[s+2] == '?' && exp[s+3] == '{')))
5445             {
5446                 runtime = 1;
5447                 break;
5448             }
5449         }
5450         /* with runtime code, always recompile */
5451         if (!runtime) {
5452             ReREFCNT_inc(old_re);
5453             if (used_setjump) {
5454                 JMPENV_POP;
5455             }
5456             Safefree(pRExC_state->code_blocks);
5457             return old_re;
5458         }
5459     }
5460
5461 #ifdef TRIE_STUDY_OPT
5462     restudied = 0;
5463 #endif
5464
5465     rx_flags = orig_rx_flags;
5466
5467     if (initial_charset == REGEX_LOCALE_CHARSET) {
5468         RExC_contains_locale = 1;
5469     }
5470     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5471
5472         /* Set to use unicode semantics if the pattern is in utf8 and has the
5473          * 'depends' charset specified, as it means unicode when utf8  */
5474         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5475     }
5476
5477     RExC_precomp = exp;
5478     RExC_flags = rx_flags;
5479     RExC_pm_flags = pm_flags;
5480     RExC_sawback = 0;
5481
5482     RExC_seen = 0;
5483     RExC_in_lookbehind = 0;
5484     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5485     RExC_seen_evals = 0;
5486     RExC_extralen = 0;
5487     RExC_override_recoding = 0;
5488
5489     /* First pass: determine size, legality. */
5490     RExC_parse = exp;
5491     RExC_start = exp;
5492     RExC_end = xend;
5493     RExC_naughty = 0;
5494     RExC_npar = 1;
5495     RExC_nestroot = 0;
5496     RExC_size = 0L;
5497     RExC_emit = &PL_regdummy;
5498     RExC_whilem_seen = 0;
5499     RExC_open_parens = NULL;
5500     RExC_close_parens = NULL;
5501     RExC_opend = NULL;
5502     RExC_paren_names = NULL;
5503 #ifdef DEBUGGING
5504     RExC_paren_name_list = NULL;
5505 #endif
5506     RExC_recurse = NULL;
5507     RExC_recurse_count = 0;
5508     pRExC_state->code_index = 0;
5509
5510 #if 0 /* REGC() is (currently) a NOP at the first pass.
5511        * Clever compilers notice this and complain. --jhi */
5512     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5513 #endif
5514     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
5515     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5516         RExC_precomp = NULL;
5517         Safefree(pRExC_state->code_blocks);
5518         return(NULL);
5519     }
5520
5521     /* Here, finished first pass.  Get rid of any added setjmp */
5522     if (used_setjump) {
5523         JMPENV_POP;
5524     }
5525
5526     DEBUG_PARSE_r({
5527         PerlIO_printf(Perl_debug_log,
5528             "Required size %"IVdf" nodes\n"
5529             "Starting second pass (creation)\n",
5530             (IV)RExC_size);
5531         RExC_lastnum=0;
5532         RExC_lastparse=NULL;
5533     });
5534
5535     /* The first pass could have found things that force Unicode semantics */
5536     if ((RExC_utf8 || RExC_uni_semantics)
5537          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5538     {
5539         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5540     }
5541
5542     /* Small enough for pointer-storage convention?
5543        If extralen==0, this means that we will not need long jumps. */
5544     if (RExC_size >= 0x10000L && RExC_extralen)
5545         RExC_size += RExC_extralen;
5546     else
5547         RExC_extralen = 0;
5548     if (RExC_whilem_seen > 15)
5549         RExC_whilem_seen = 15;
5550
5551     /* Allocate space and zero-initialize. Note, the two step process
5552        of zeroing when in debug mode, thus anything assigned has to
5553        happen after that */
5554     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5555     r = (struct regexp*)SvANY(rx);
5556     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5557          char, regexp_internal);
5558     if ( r == NULL || ri == NULL )
5559         FAIL("Regexp out of space");
5560 #ifdef DEBUGGING
5561     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5562     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5563 #else
5564     /* bulk initialize base fields with 0. */
5565     Zero(ri, sizeof(regexp_internal), char);
5566 #endif
5567
5568     /* non-zero initialization begins here */
5569     RXi_SET( r, ri );
5570     r->engine= eng;
5571     r->extflags = rx_flags;
5572     if (pm_flags & PMf_IS_QR) {
5573         ri->code_blocks = pRExC_state->code_blocks;
5574         ri->num_code_blocks = pRExC_state->num_code_blocks;
5575     }
5576     else
5577         SAVEFREEPV(pRExC_state->code_blocks);
5578
5579     {
5580         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5581         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5582
5583         /* The caret is output if there are any defaults: if not all the STD
5584          * flags are set, or if no character set specifier is needed */
5585         bool has_default =
5586                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5587                     || ! has_charset);
5588         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5589         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5590                             >> RXf_PMf_STD_PMMOD_SHIFT);
5591         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5592         char *p;
5593         /* Allocate for the worst case, which is all the std flags are turned
5594          * on.  If more precision is desired, we could do a population count of
5595          * the flags set.  This could be done with a small lookup table, or by
5596          * shifting, masking and adding, or even, when available, assembly
5597          * language for a machine-language population count.
5598          * We never output a minus, as all those are defaults, so are
5599          * covered by the caret */
5600         const STRLEN wraplen = plen + has_p + has_runon
5601             + has_default       /* If needs a caret */
5602
5603                 /* If needs a character set specifier */
5604             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5605             + (sizeof(STD_PAT_MODS) - 1)
5606             + (sizeof("(?:)") - 1);
5607
5608         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5609         SvPOK_on(rx);
5610         if (RExC_utf8)
5611             SvFLAGS(rx) |= SVf_UTF8;
5612         *p++='('; *p++='?';
5613
5614         /* If a default, cover it using the caret */
5615         if (has_default) {
5616             *p++= DEFAULT_PAT_MOD;
5617         }
5618         if (has_charset) {
5619             STRLEN len;
5620             const char* const name = get_regex_charset_name(r->extflags, &len);
5621             Copy(name, p, len, char);
5622             p += len;
5623         }
5624         if (has_p)
5625             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5626         {
5627             char ch;
5628             while((ch = *fptr++)) {
5629                 if(reganch & 1)
5630                     *p++ = ch;
5631                 reganch >>= 1;
5632             }
5633         }
5634
5635         *p++ = ':';
5636         Copy(RExC_precomp, p, plen, char);
5637         assert ((RX_WRAPPED(rx) - p) < 16);
5638         r->pre_prefix = p - RX_WRAPPED(rx);
5639         p += plen;
5640         if (has_runon)
5641             *p++ = '\n';
5642         *p++ = ')';
5643         *p = 0;
5644         SvCUR_set(rx, p - SvPVX_const(rx));
5645     }
5646
5647     r->intflags = 0;
5648     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5649
5650     if (RExC_seen & REG_SEEN_RECURSE) {
5651         Newxz(RExC_open_parens, RExC_npar,regnode *);
5652         SAVEFREEPV(RExC_open_parens);
5653         Newxz(RExC_close_parens,RExC_npar,regnode *);
5654         SAVEFREEPV(RExC_close_parens);
5655     }
5656
5657     /* Useful during FAIL. */
5658 #ifdef RE_TRACK_PATTERN_OFFSETS
5659     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5660     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5661                           "%s %"UVuf" bytes for offset annotations.\n",
5662                           ri->u.offsets ? "Got" : "Couldn't get",
5663                           (UV)((2*RExC_size+1) * sizeof(U32))));
5664 #endif
5665     SetProgLen(ri,RExC_size);
5666     RExC_rx_sv = rx;
5667     RExC_rx = r;
5668     RExC_rxi = ri;
5669
5670     /* Second pass: emit code. */
5671     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5672     RExC_pm_flags = pm_flags;
5673     RExC_parse = exp;
5674     RExC_end = xend;
5675     RExC_naughty = 0;
5676     RExC_npar = 1;
5677     RExC_emit_start = ri->program;
5678     RExC_emit = ri->program;
5679     RExC_emit_bound = ri->program + RExC_size + 1;
5680     pRExC_state->code_index = 0;
5681
5682     /* Store the count of eval-groups for security checks: */
5683     RExC_rx->seen_evals = RExC_seen_evals;
5684     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5685     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5686         ReREFCNT_dec(rx);
5687         return(NULL);
5688     }
5689     /* XXXX To minimize changes to RE engine we always allocate
5690        3-units-long substrs field. */
5691     Newx(r->substrs, 1, struct reg_substr_data);
5692     if (RExC_recurse_count) {
5693         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5694         SAVEFREEPV(RExC_recurse);
5695     }
5696
5697 reStudy:
5698     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5699     Zero(r->substrs, 1, struct reg_substr_data);
5700
5701 #ifdef TRIE_STUDY_OPT
5702     if (!restudied) {
5703         StructCopy(&zero_scan_data, &data, scan_data_t);
5704         copyRExC_state = RExC_state;
5705     } else {
5706         U32 seen=RExC_seen;
5707         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5708
5709         RExC_state = copyRExC_state;
5710         if (seen & REG_TOP_LEVEL_BRANCHES)
5711             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5712         else
5713             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5714         if (data.last_found) {
5715             SvREFCNT_dec(data.longest_fixed);
5716             SvREFCNT_dec(data.longest_float);
5717             SvREFCNT_dec(data.last_found);
5718         }
5719         StructCopy(&zero_scan_data, &data, scan_data_t);
5720     }
5721 #else
5722     StructCopy(&zero_scan_data, &data, scan_data_t);
5723 #endif
5724
5725     /* Dig out information for optimizations. */
5726     r->extflags = RExC_flags; /* was pm_op */
5727     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5728
5729     if (UTF)
5730         SvUTF8_on(rx);  /* Unicode in it? */
5731     ri->regstclass = NULL;
5732     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5733         r->intflags |= PREGf_NAUGHTY;
5734     scan = ri->program + 1;             /* First BRANCH. */
5735
5736     /* testing for BRANCH here tells us whether there is "must appear"
5737        data in the pattern. If there is then we can use it for optimisations */
5738     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5739         I32 fake;
5740         STRLEN longest_float_length, longest_fixed_length;
5741         struct regnode_charclass_class ch_class; /* pointed to by data */
5742         int stclass_flag;
5743         I32 last_close = 0; /* pointed to by data */
5744         regnode *first= scan;
5745         regnode *first_next= regnext(first);
5746         /*
5747          * Skip introductions and multiplicators >= 1
5748          * so that we can extract the 'meat' of the pattern that must
5749          * match in the large if() sequence following.
5750          * NOTE that EXACT is NOT covered here, as it is normally
5751          * picked up by the optimiser separately.
5752          *
5753          * This is unfortunate as the optimiser isnt handling lookahead
5754          * properly currently.
5755          *
5756          */
5757         while ((OP(first) == OPEN && (sawopen = 1)) ||
5758                /* An OR of *one* alternative - should not happen now. */
5759             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5760             /* for now we can't handle lookbehind IFMATCH*/
5761             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5762             (OP(first) == PLUS) ||
5763             (OP(first) == MINMOD) ||
5764                /* An {n,m} with n>0 */
5765             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5766             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5767         {
5768                 /*
5769                  * the only op that could be a regnode is PLUS, all the rest
5770                  * will be regnode_1 or regnode_2.
5771                  *
5772                  */
5773                 if (OP(first) == PLUS)
5774                     sawplus = 1;
5775                 else
5776                     first += regarglen[OP(first)];
5777
5778                 first = NEXTOPER(first);
5779                 first_next= regnext(first);
5780         }
5781
5782         /* Starting-point info. */
5783       again:
5784         DEBUG_PEEP("first:",first,0);
5785         /* Ignore EXACT as we deal with it later. */
5786         if (PL_regkind[OP(first)] == EXACT) {
5787             if (OP(first) == EXACT)
5788                 NOOP;   /* Empty, get anchored substr later. */
5789             else
5790                 ri->regstclass = first;
5791         }
5792 #ifdef TRIE_STCLASS
5793         else if (PL_regkind[OP(first)] == TRIE &&
5794                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5795         {
5796             regnode *trie_op;
5797             /* this can happen only on restudy */
5798             if ( OP(first) == TRIE ) {
5799                 struct regnode_1 *trieop = (struct regnode_1 *)
5800                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5801                 StructCopy(first,trieop,struct regnode_1);
5802                 trie_op=(regnode *)trieop;
5803             } else {
5804                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5805                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5806                 StructCopy(first,trieop,struct regnode_charclass);
5807                 trie_op=(regnode *)trieop;
5808             }
5809             OP(trie_op)+=2;
5810             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5811             ri->regstclass = trie_op;
5812         }
5813 #endif
5814         else if (REGNODE_SIMPLE(OP(first)))
5815             ri->regstclass = first;
5816         else if (PL_regkind[OP(first)] == BOUND ||
5817                  PL_regkind[OP(first)] == NBOUND)
5818             ri->regstclass = first;
5819         else if (PL_regkind[OP(first)] == BOL) {
5820             r->extflags |= (OP(first) == MBOL
5821                            ? RXf_ANCH_MBOL
5822                            : (OP(first) == SBOL
5823                               ? RXf_ANCH_SBOL
5824                               : RXf_ANCH_BOL));
5825             first = NEXTOPER(first);
5826             goto again;
5827         }
5828         else if (OP(first) == GPOS) {
5829             r->extflags |= RXf_ANCH_GPOS;
5830             first = NEXTOPER(first);
5831             goto again;
5832         }
5833         else if ((!sawopen || !RExC_sawback) &&
5834             (OP(first) == STAR &&
5835             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5836             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5837         {
5838             /* turn .* into ^.* with an implied $*=1 */
5839             const int type =
5840                 (OP(NEXTOPER(first)) == REG_ANY)
5841                     ? RXf_ANCH_MBOL
5842                     : RXf_ANCH_SBOL;
5843             r->extflags |= type;
5844             r->intflags |= PREGf_IMPLICIT;
5845             first = NEXTOPER(first);
5846             goto again;
5847         }
5848         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5849             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5850             /* x+ must match at the 1st pos of run of x's */
5851             r->intflags |= PREGf_SKIP;
5852
5853         /* Scan is after the zeroth branch, first is atomic matcher. */
5854 #ifdef TRIE_STUDY_OPT
5855         DEBUG_PARSE_r(
5856             if (!restudied)
5857                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5858                               (IV)(first - scan + 1))
5859         );
5860 #else
5861         DEBUG_PARSE_r(
5862             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5863                 (IV)(first - scan + 1))
5864         );
5865 #endif
5866
5867
5868         /*
5869         * If there's something expensive in the r.e., find the
5870         * longest literal string that must appear and make it the
5871         * regmust.  Resolve ties in favor of later strings, since
5872         * the regstart check works with the beginning of the r.e.
5873         * and avoiding duplication strengthens checking.  Not a
5874         * strong reason, but sufficient in the absence of others.
5875         * [Now we resolve ties in favor of the earlier string if
5876         * it happens that c_offset_min has been invalidated, since the
5877         * earlier string may buy us something the later one won't.]
5878         */
5879
5880         data.longest_fixed = newSVpvs("");
5881         data.longest_float = newSVpvs("");
5882         data.last_found = newSVpvs("");
5883         data.longest = &(data.longest_fixed);
5884         first = scan;
5885         if (!ri->regstclass) {
5886             cl_init(pRExC_state, &ch_class);
5887             data.start_class = &ch_class;
5888             stclass_flag = SCF_DO_STCLASS_AND;
5889         } else                          /* XXXX Check for BOUND? */
5890             stclass_flag = 0;
5891         data.last_closep = &last_close;
5892
5893         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5894             &data, -1, NULL, NULL,
5895             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5896
5897
5898         CHECK_RESTUDY_GOTO;
5899
5900
5901         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5902              && data.last_start_min == 0 && data.last_end > 0
5903              && !RExC_seen_zerolen
5904              && !(RExC_seen & REG_SEEN_VERBARG)
5905              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5906             r->extflags |= RXf_CHECK_ALL;
5907         scan_commit(pRExC_state, &data,&minlen,0);
5908         SvREFCNT_dec(data.last_found);
5909
5910         /* Note that code very similar to this but for anchored string
5911            follows immediately below, changes may need to be made to both.
5912            Be careful.
5913          */
5914         longest_float_length = CHR_SVLEN(data.longest_float);
5915         if (longest_float_length
5916             || (data.flags & SF_FL_BEFORE_EOL
5917                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5918                     || (RExC_flags & RXf_PMf_MULTILINE))))
5919         {
5920             I32 t,ml;
5921
5922             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5923             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5924                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5925                     && data.offset_fixed == data.offset_float_min
5926                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5927                     goto remove_float;          /* As in (a)+. */
5928
5929             /* copy the information about the longest float from the reg_scan_data
5930                over to the program. */
5931             if (SvUTF8(data.longest_float)) {
5932                 r->float_utf8 = data.longest_float;
5933                 r->float_substr = NULL;
5934             } else {
5935                 r->float_substr = data.longest_float;
5936                 r->float_utf8 = NULL;
5937             }
5938             /* float_end_shift is how many chars that must be matched that
5939                follow this item. We calculate it ahead of time as once the
5940                lookbehind offset is added in we lose the ability to correctly
5941                calculate it.*/
5942             ml = data.minlen_float ? *(data.minlen_float)
5943                                    : (I32)longest_float_length;
5944             r->float_end_shift = ml - data.offset_float_min
5945                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5946                 + data.lookbehind_float;
5947             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5948             r->float_max_offset = data.offset_float_max;
5949             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5950                 r->float_max_offset -= data.lookbehind_float;
5951
5952             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5953                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5954                            || (RExC_flags & RXf_PMf_MULTILINE)));
5955             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5956         }
5957         else {
5958           remove_float:
5959             r->float_substr = r->float_utf8 = NULL;
5960             SvREFCNT_dec(data.longest_float);
5961             longest_float_length = 0;
5962         }
5963
5964         /* Note that code very similar to this but for floating string
5965            is immediately above, changes may need to be made to both.
5966            Be careful.
5967          */
5968         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5969
5970         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5971         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5972             && (longest_fixed_length
5973                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5974                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5975                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5976         {
5977             I32 t,ml;
5978
5979             /* copy the information about the longest fixed
5980                from the reg_scan_data over to the program. */
5981             if (SvUTF8(data.longest_fixed)) {
5982                 r->anchored_utf8 = data.longest_fixed;
5983                 r->anchored_substr = NULL;
5984             } else {
5985                 r->anchored_substr = data.longest_fixed;
5986                 r->anchored_utf8 = NULL;
5987             }
5988             /* fixed_end_shift is how many chars that must be matched that
5989                follow this item. We calculate it ahead of time as once the
5990                lookbehind offset is added in we lose the ability to correctly
5991                calculate it.*/
5992             ml = data.minlen_fixed ? *(data.minlen_fixed)
5993                                    : (I32)longest_fixed_length;
5994             r->anchored_end_shift = ml - data.offset_fixed
5995                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5996                 + data.lookbehind_fixed;
5997             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5998
5999             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6000                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
6001                      || (RExC_flags & RXf_PMf_MULTILINE)));
6002             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6003         }
6004         else {
6005             r->anchored_substr = r->anchored_utf8 = NULL;
6006             SvREFCNT_dec(data.longest_fixed);
6007             longest_fixed_length = 0;
6008         }
6009         if (ri->regstclass
6010             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6011             ri->regstclass = NULL;
6012
6013         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6014             && stclass_flag
6015             && !(data.start_class->flags & ANYOF_EOS)
6016             && !cl_is_anything(data.start_class))
6017         {
6018             const U32 n = add_data(pRExC_state, 1, "f");
6019             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6020
6021             Newx(RExC_rxi->data->data[n], 1,
6022                 struct regnode_charclass_class);
6023             StructCopy(data.start_class,
6024                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6025                        struct regnode_charclass_class);
6026             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6027             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6028             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6029                       regprop(r, sv, (regnode*)data.start_class);
6030                       PerlIO_printf(Perl_debug_log,
6031                                     "synthetic stclass \"%s\".\n",
6032                                     SvPVX_const(sv));});
6033         }
6034
6035         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6036         if (longest_fixed_length > longest_float_length) {
6037             r->check_end_shift = r->anchored_end_shift;
6038             r->check_substr = r->anchored_substr;
6039             r->check_utf8 = r->anchored_utf8;
6040             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6041             if (r->extflags & RXf_ANCH_SINGLE)
6042                 r->extflags |= RXf_NOSCAN;
6043         }
6044         else {
6045             r->check_end_shift = r->float_end_shift;
6046             r->check_substr = r->float_substr;
6047             r->check_utf8 = r->float_utf8;
6048             r->check_offset_min = r->float_min_offset;
6049             r->check_offset_max = r->float_max_offset;
6050         }
6051         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6052            This should be changed ASAP!  */
6053         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6054             r->extflags |= RXf_USE_INTUIT;
6055             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6056                 r->extflags |= RXf_INTUIT_TAIL;
6057         }
6058         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6059         if ( (STRLEN)minlen < longest_float_length )
6060             minlen= longest_float_length;
6061         if ( (STRLEN)minlen < longest_fixed_length )
6062             minlen= longest_fixed_length;
6063         */
6064     }
6065     else {
6066         /* Several toplevels. Best we can is to set minlen. */
6067         I32 fake;
6068         struct regnode_charclass_class ch_class;
6069         I32 last_close = 0;
6070
6071         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6072
6073         scan = ri->program + 1;
6074         cl_init(pRExC_state, &ch_class);
6075         data.start_class = &ch_class;
6076         data.last_closep = &last_close;
6077
6078
6079         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6080             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6081
6082         CHECK_RESTUDY_GOTO;
6083
6084         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6085                 = r->float_substr = r->float_utf8 = NULL;
6086
6087         if (!(data.start_class->flags & ANYOF_EOS)
6088             && !cl_is_anything(data.start_class))
6089         {
6090             const U32 n = add_data(pRExC_state, 1, "f");
6091             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6092
6093             Newx(RExC_rxi->data->data[n], 1,
6094                 struct regnode_charclass_class);
6095             StructCopy(data.start_class,
6096                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6097                        struct regnode_charclass_class);
6098             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6099             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6100             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6101                       regprop(r, sv, (regnode*)data.start_class);
6102                       PerlIO_printf(Perl_debug_log,
6103                                     "synthetic stclass \"%s\".\n",
6104                                     SvPVX_const(sv));});
6105         }
6106     }
6107
6108     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6109        the "real" pattern. */
6110     DEBUG_OPTIMISE_r({
6111         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6112                       (IV)minlen, (IV)r->minlen);
6113     });
6114     r->minlenret = minlen;
6115     if (r->minlen < minlen)
6116         r->minlen = minlen;
6117
6118     if (RExC_seen & REG_SEEN_GPOS)
6119         r->extflags |= RXf_GPOS_SEEN;
6120     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6121         r->extflags |= RXf_LOOKBEHIND_SEEN;
6122     if (RExC_seen & REG_SEEN_EVAL)
6123         r->extflags |= RXf_EVAL_SEEN;
6124     if (RExC_seen & REG_SEEN_CANY)
6125         r->extflags |= RXf_CANY_SEEN;
6126     if (RExC_seen & REG_SEEN_VERBARG)
6127         r->intflags |= PREGf_VERBARG_SEEN;
6128     if (RExC_seen & REG_SEEN_CUTGROUP)
6129         r->intflags |= PREGf_CUTGROUP_SEEN;
6130     if (RExC_paren_names)
6131         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6132     else
6133         RXp_PAREN_NAMES(r) = NULL;
6134
6135 #ifdef STUPID_PATTERN_CHECKS
6136     if (RX_PRELEN(rx) == 0)
6137         r->extflags |= RXf_NULL;
6138     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6139         /* XXX: this should happen BEFORE we compile */
6140         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6141     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6142         r->extflags |= RXf_WHITE;
6143     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6144         r->extflags |= RXf_START_ONLY;
6145 #else
6146     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6147             /* XXX: this should happen BEFORE we compile */
6148             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6149     else {
6150         regnode *first = ri->program + 1;
6151         U8 fop = OP(first);
6152
6153         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6154             r->extflags |= RXf_NULL;
6155         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6156             r->extflags |= RXf_START_ONLY;
6157         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6158                              && OP(regnext(first)) == END)
6159             r->extflags |= RXf_WHITE;
6160     }
6161 #endif
6162 #ifdef DEBUGGING
6163     if (RExC_paren_names) {
6164         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6165         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6166     } else
6167 #endif
6168         ri->name_list_idx = 0;
6169
6170     if (RExC_recurse_count) {
6171         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6172             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6173             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6174         }
6175     }
6176     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6177     /* assume we don't need to swap parens around before we match */
6178
6179     DEBUG_DUMP_r({
6180         PerlIO_printf(Perl_debug_log,"Final program:\n");
6181         regdump(r);
6182     });
6183 #ifdef RE_TRACK_PATTERN_OFFSETS
6184     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6185         const U32 len = ri->u.offsets[0];
6186         U32 i;
6187         GET_RE_DEBUG_FLAGS_DECL;
6188         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6189         for (i = 1; i <= len; i++) {
6190             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6191                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6192                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6193             }
6194         PerlIO_printf(Perl_debug_log, "\n");
6195     });
6196 #endif
6197     return rx;
6198 }
6199
6200
6201 SV*
6202 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6203                     const U32 flags)
6204 {
6205     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6206
6207     PERL_UNUSED_ARG(value);
6208
6209     if (flags & RXapif_FETCH) {
6210         return reg_named_buff_fetch(rx, key, flags);
6211     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6212         Perl_croak_no_modify(aTHX);
6213         return NULL;
6214     } else if (flags & RXapif_EXISTS) {
6215         return reg_named_buff_exists(rx, key, flags)
6216             ? &PL_sv_yes
6217             : &PL_sv_no;
6218     } else if (flags & RXapif_REGNAMES) {
6219         return reg_named_buff_all(rx, flags);
6220     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6221         return reg_named_buff_scalar(rx, flags);
6222     } else {
6223         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6224         return NULL;
6225     }
6226 }
6227
6228 SV*
6229 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6230                          const U32 flags)
6231 {
6232     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6233     PERL_UNUSED_ARG(lastkey);
6234
6235     if (flags & RXapif_FIRSTKEY)
6236         return reg_named_buff_firstkey(rx, flags);
6237     else if (flags & RXapif_NEXTKEY)
6238         return reg_named_buff_nextkey(rx, flags);
6239     else {
6240         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6241         return NULL;
6242     }
6243 }
6244
6245 SV*
6246 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6247                           const U32 flags)
6248 {
6249     AV *retarray = NULL;
6250     SV *ret;
6251     struct regexp *const rx = (struct regexp *)SvANY(r);
6252
6253     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6254
6255     if (flags & RXapif_ALL)
6256         retarray=newAV();
6257
6258     if (rx && RXp_PAREN_NAMES(rx)) {
6259         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6260         if (he_str) {
6261             IV i;
6262             SV* sv_dat=HeVAL(he_str);
6263             I32 *nums=(I32*)SvPVX(sv_dat);
6264             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6265                 if ((I32)(rx->nparens) >= nums[i]
6266                     && rx->offs[nums[i]].start != -1
6267                     && rx->offs[nums[i]].end != -1)
6268                 {
6269                     ret = newSVpvs("");
6270                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6271                     if (!retarray)
6272                         return ret;
6273                 } else {
6274                     if (retarray)
6275                         ret = newSVsv(&PL_sv_undef);
6276                 }
6277                 if (retarray)
6278                     av_push(retarray, ret);
6279             }
6280             if (retarray)
6281                 return newRV_noinc(MUTABLE_SV(retarray));
6282         }
6283     }
6284     return NULL;
6285 }
6286
6287 bool
6288 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6289                            const U32 flags)
6290 {
6291     struct regexp *const rx = (struct regexp *)SvANY(r);
6292
6293     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6294
6295     if (rx && RXp_PAREN_NAMES(rx)) {
6296         if (flags & RXapif_ALL) {
6297             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6298         } else {
6299             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6300             if (sv) {
6301                 SvREFCNT_dec(sv);
6302                 return TRUE;
6303             } else {
6304                 return FALSE;
6305             }
6306         }
6307     } else {
6308         return FALSE;
6309     }
6310 }
6311
6312 SV*
6313 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6314 {
6315     struct regexp *const rx = (struct regexp *)SvANY(r);
6316
6317     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6318
6319     if ( rx && RXp_PAREN_NAMES(rx) ) {
6320         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6321
6322         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6323     } else {
6324         return FALSE;
6325     }
6326 }
6327
6328 SV*
6329 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6330 {
6331     struct regexp *const rx = (struct regexp *)SvANY(r);
6332     GET_RE_DEBUG_FLAGS_DECL;
6333
6334     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6335
6336     if (rx && RXp_PAREN_NAMES(rx)) {
6337         HV *hv = RXp_PAREN_NAMES(rx);
6338         HE *temphe;
6339         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6340             IV i;
6341             IV parno = 0;
6342             SV* sv_dat = HeVAL(temphe);
6343             I32 *nums = (I32*)SvPVX(sv_dat);
6344             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6345                 if ((I32)(rx->lastparen) >= nums[i] &&
6346                     rx->offs[nums[i]].start != -1 &&
6347                     rx->offs[nums[i]].end != -1)
6348                 {
6349                     parno = nums[i];
6350                     break;
6351                 }
6352             }
6353             if (parno || flags & RXapif_ALL) {
6354                 return newSVhek(HeKEY_hek(temphe));
6355             }
6356         }
6357     }
6358     return NULL;
6359 }
6360
6361 SV*
6362 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6363 {
6364     SV *ret;
6365     AV *av;
6366     I32 length;
6367     struct regexp *const rx = (struct regexp *)SvANY(r);
6368
6369     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6370
6371     if (rx && RXp_PAREN_NAMES(rx)) {
6372         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6373             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6374         } else if (flags & RXapif_ONE) {
6375             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6376             av = MUTABLE_AV(SvRV(ret));
6377             length = av_len(av);
6378             SvREFCNT_dec(ret);
6379             return newSViv(length + 1);
6380         } else {
6381             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6382             return NULL;
6383         }
6384     }
6385     return &PL_sv_undef;
6386 }
6387
6388 SV*
6389 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6390 {
6391     struct regexp *const rx = (struct regexp *)SvANY(r);
6392     AV *av = newAV();
6393
6394     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6395
6396     if (rx && RXp_PAREN_NAMES(rx)) {
6397         HV *hv= RXp_PAREN_NAMES(rx);
6398         HE *temphe;
6399         (void)hv_iterinit(hv);
6400         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6401             IV i;
6402             IV parno = 0;
6403             SV* sv_dat = HeVAL(temphe);
6404             I32 *nums = (I32*)SvPVX(sv_dat);
6405             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6406                 if ((I32)(rx->lastparen) >= nums[i] &&
6407                     rx->offs[nums[i]].start != -1 &&
6408                     rx->offs[nums[i]].end != -1)
6409                 {
6410                     parno = nums[i];
6411                     break;
6412                 }
6413             }
6414             if (parno || flags & RXapif_ALL) {
6415                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6416             }
6417         }
6418     }
6419
6420     return newRV_noinc(MUTABLE_SV(av));
6421 }
6422
6423 void
6424 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6425                              SV * const sv)
6426 {
6427     struct regexp *const rx = (struct regexp *)SvANY(r);
6428     char *s = NULL;
6429     I32 i = 0;
6430     I32 s1, t1;
6431
6432     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6433
6434     if (!rx->subbeg) {
6435         sv_setsv(sv,&PL_sv_undef);
6436         return;
6437     }
6438     else
6439     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6440         /* $` */
6441         i = rx->offs[0].start;
6442         s = rx->subbeg;
6443     }
6444     else
6445     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6446         /* $' */
6447         s = rx->subbeg + rx->offs[0].end;
6448         i = rx->sublen - rx->offs[0].end;
6449     }
6450     else
6451     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6452         (s1 = rx->offs[paren].start) != -1 &&
6453         (t1 = rx->offs[paren].end) != -1)
6454     {
6455         /* $& $1 ... */
6456         i = t1 - s1;
6457         s = rx->subbeg + s1;
6458     } else {
6459         sv_setsv(sv,&PL_sv_undef);
6460         return;
6461     }
6462     assert(rx->sublen >= (s - rx->subbeg) + i );
6463     if (i >= 0) {
6464         const int oldtainted = PL_tainted;
6465         TAINT_NOT;
6466         sv_setpvn(sv, s, i);
6467         PL_tainted = oldtainted;
6468         if ( (rx->extflags & RXf_CANY_SEEN)
6469             ? (RXp_MATCH_UTF8(rx)
6470                         && (!i || is_utf8_string((U8*)s, i)))
6471             : (RXp_MATCH_UTF8(rx)) )
6472         {
6473             SvUTF8_on(sv);
6474         }
6475         else
6476             SvUTF8_off(sv);
6477         if (PL_tainting) {
6478             if (RXp_MATCH_TAINTED(rx)) {
6479                 if (SvTYPE(sv) >= SVt_PVMG) {
6480                     MAGIC* const mg = SvMAGIC(sv);
6481                     MAGIC* mgt;
6482                     PL_tainted = 1;
6483                     SvMAGIC_set(sv, mg->mg_moremagic);
6484                     SvTAINT(sv);
6485                     if ((mgt = SvMAGIC(sv))) {
6486                         mg->mg_moremagic = mgt;
6487                         SvMAGIC_set(sv, mg);
6488                     }
6489                 } else {
6490                     PL_tainted = 1;
6491                     SvTAINT(sv);
6492                 }
6493             } else
6494                 SvTAINTED_off(sv);
6495         }
6496     } else {
6497         sv_setsv(sv,&PL_sv_undef);
6498         return;
6499     }
6500 }
6501
6502 void
6503 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6504                                                          SV const * const value)
6505 {
6506     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6507
6508     PERL_UNUSED_ARG(rx);
6509     PERL_UNUSED_ARG(paren);
6510     PERL_UNUSED_ARG(value);
6511
6512     if (!PL_localizing)
6513         Perl_croak_no_modify(aTHX);
6514 }
6515
6516 I32
6517 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6518                               const I32 paren)
6519 {
6520     struct regexp *const rx = (struct regexp *)SvANY(r);
6521     I32 i;
6522     I32 s1, t1;
6523
6524     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6525
6526     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6527         switch (paren) {
6528       /* $` / ${^PREMATCH} */
6529       case RX_BUFF_IDX_PREMATCH:
6530         if (rx->offs[0].start != -1) {
6531                         i = rx->offs[0].start;
6532                         if (i > 0) {
6533                                 s1 = 0;
6534                                 t1 = i;
6535                                 goto getlen;
6536                         }
6537             }
6538         return 0;
6539       /* $' / ${^POSTMATCH} */
6540       case RX_BUFF_IDX_POSTMATCH:
6541             if (rx->offs[0].end != -1) {
6542                         i = rx->sublen - rx->offs[0].end;
6543                         if (i > 0) {
6544                                 s1 = rx->offs[0].end;
6545                                 t1 = rx->sublen;
6546                                 goto getlen;
6547                         }
6548             }
6549         return 0;
6550       /* $& / ${^MATCH}, $1, $2, ... */
6551       default:
6552             if (paren <= (I32)rx->nparens &&
6553             (s1 = rx->offs[paren].start) != -1 &&
6554             (t1 = rx->offs[paren].end) != -1)
6555             {
6556             i = t1 - s1;
6557             goto getlen;
6558         } else {
6559             if (ckWARN(WARN_UNINITIALIZED))
6560                 report_uninit((const SV *)sv);
6561             return 0;
6562         }
6563     }
6564   getlen:
6565     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6566         const char * const s = rx->subbeg + s1;
6567         const U8 *ep;
6568         STRLEN el;
6569
6570         i = t1 - s1;
6571         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6572                         i = el;
6573     }
6574     return i;
6575 }
6576
6577 SV*
6578 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6579 {
6580     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6581         PERL_UNUSED_ARG(rx);
6582         if (0)
6583             return NULL;
6584         else
6585             return newSVpvs("Regexp");
6586 }
6587
6588 /* Scans the name of a named buffer from the pattern.
6589  * If flags is REG_RSN_RETURN_NULL returns null.
6590  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6591  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6592  * to the parsed name as looked up in the RExC_paren_names hash.
6593  * If there is an error throws a vFAIL().. type exception.
6594  */
6595
6596 #define REG_RSN_RETURN_NULL    0
6597 #define REG_RSN_RETURN_NAME    1
6598 #define REG_RSN_RETURN_DATA    2
6599
6600 STATIC SV*
6601 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6602 {
6603     char *name_start = RExC_parse;
6604
6605     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6606
6607     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6608          /* skip IDFIRST by using do...while */
6609         if (UTF)
6610             do {
6611                 RExC_parse += UTF8SKIP(RExC_parse);
6612             } while (isALNUM_utf8((U8*)RExC_parse));
6613         else
6614             do {
6615                 RExC_parse++;
6616             } while (isALNUM(*RExC_parse));
6617     }
6618
6619     if ( flags ) {
6620         SV* sv_name
6621             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6622                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6623         if ( flags == REG_RSN_RETURN_NAME)
6624             return sv_name;
6625         else if (flags==REG_RSN_RETURN_DATA) {
6626             HE *he_str = NULL;
6627             SV *sv_dat = NULL;
6628             if ( ! sv_name )      /* should not happen*/
6629                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6630             if (RExC_paren_names)
6631                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6632             if ( he_str )
6633                 sv_dat = HeVAL(he_str);
6634             if ( ! sv_dat )
6635                 vFAIL("Reference to nonexistent named group");
6636             return sv_dat;
6637         }
6638         else {
6639             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6640                        (unsigned long) flags);
6641         }
6642         /* NOT REACHED */
6643     }
6644     return NULL;
6645 }
6646
6647 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6648     int rem=(int)(RExC_end - RExC_parse);                       \
6649     int cut;                                                    \
6650     int num;                                                    \
6651     int iscut=0;                                                \
6652     if (rem>10) {                                               \
6653         rem=10;                                                 \
6654         iscut=1;                                                \
6655     }                                                           \
6656     cut=10-rem;                                                 \
6657     if (RExC_lastparse!=RExC_parse)                             \
6658         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6659             rem, RExC_parse,                                    \
6660             cut + 4,                                            \
6661             iscut ? "..." : "<"                                 \
6662         );                                                      \
6663     else                                                        \
6664         PerlIO_printf(Perl_debug_log,"%16s","");                \
6665                                                                 \
6666     if (SIZE_ONLY)                                              \
6667        num = RExC_size + 1;                                     \
6668     else                                                        \
6669        num=REG_NODE_NUM(RExC_emit);                             \
6670     if (RExC_lastnum!=num)                                      \
6671        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6672     else                                                        \
6673        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6674     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6675         (int)((depth*2)), "",                                   \
6676         (funcname)                                              \
6677     );                                                          \
6678     RExC_lastnum=num;                                           \
6679     RExC_lastparse=RExC_parse;                                  \
6680 })
6681
6682
6683
6684 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6685     DEBUG_PARSE_MSG((funcname));                            \
6686     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6687 })
6688 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6689     DEBUG_PARSE_MSG((funcname));                            \
6690     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6691 })
6692
6693 /* This section of code defines the inversion list object and its methods.  The
6694  * interfaces are highly subject to change, so as much as possible is static to
6695  * this file.  An inversion list is here implemented as a malloc'd C UV array
6696  * with some added info that is placed as UVs at the beginning in a header
6697  * portion.  An inversion list for Unicode is an array of code points, sorted
6698  * by ordinal number.  The zeroth element is the first code point in the list.
6699  * The 1th element is the first element beyond that not in the list.  In other
6700  * words, the first range is
6701  *  invlist[0]..(invlist[1]-1)
6702  * The other ranges follow.  Thus every element whose index is divisible by two
6703  * marks the beginning of a range that is in the list, and every element not
6704  * divisible by two marks the beginning of a range not in the list.  A single
6705  * element inversion list that contains the single code point N generally
6706  * consists of two elements
6707  *  invlist[0] == N
6708  *  invlist[1] == N+1
6709  * (The exception is when N is the highest representable value on the
6710  * machine, in which case the list containing just it would be a single
6711  * element, itself.  By extension, if the last range in the list extends to
6712  * infinity, then the first element of that range will be in the inversion list
6713  * at a position that is divisible by two, and is the final element in the
6714  * list.)
6715  * Taking the complement (inverting) an inversion list is quite simple, if the
6716  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6717  * This implementation reserves an element at the beginning of each inversion list
6718  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6719  * beginning of the list is either that element if 0, or the next one if 1.
6720  *
6721  * More about inversion lists can be found in "Unicode Demystified"
6722  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6723  * More will be coming when functionality is added later.
6724  *
6725  * The inversion list data structure is currently implemented as an SV pointing
6726  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6727  * array of UV whose memory management is automatically handled by the existing
6728  * facilities for SV's.
6729  *
6730  * Some of the methods should always be private to the implementation, and some
6731  * should eventually be made public */
6732
6733 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6734 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6735
6736 /* This is a combination of a version and data structure type, so that one
6737  * being passed in can be validated to be an inversion list of the correct
6738  * vintage.  When the structure of the header is changed, a new random number
6739  * in the range 2**31-1 should be generated and the new() method changed to
6740  * insert that at this location.  Then, if an auxiliary program doesn't change
6741  * correspondingly, it will be discovered immediately */
6742 #define INVLIST_VERSION_ID_OFFSET 2
6743 #define INVLIST_VERSION_ID 1064334010
6744
6745 /* For safety, when adding new elements, remember to #undef them at the end of
6746  * the inversion list code section */
6747
6748 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
6749 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6750  * contains the code point U+00000, and begins here.  If 1, the inversion list
6751  * doesn't contain U+0000, and it begins at the next UV in the array.
6752  * Inverting an inversion list consists of adding or removing the 0 at the
6753  * beginning of it.  By reserving a space for that 0, inversion can be made
6754  * very fast */
6755
6756 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6757
6758 /* Internally things are UVs */
6759 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6760 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6761
6762 #define INVLIST_INITIAL_LEN 10
6763
6764 PERL_STATIC_INLINE UV*
6765 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6766 {
6767     /* Returns a pointer to the first element in the inversion list's array.
6768      * This is called upon initialization of an inversion list.  Where the
6769      * array begins depends on whether the list has the code point U+0000
6770      * in it or not.  The other parameter tells it whether the code that
6771      * follows this call is about to put a 0 in the inversion list or not.
6772      * The first element is either the element with 0, if 0, or the next one,
6773      * if 1 */
6774
6775     UV* zero = get_invlist_zero_addr(invlist);
6776
6777     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6778
6779     /* Must be empty */
6780     assert(! *get_invlist_len_addr(invlist));
6781
6782     /* 1^1 = 0; 1^0 = 1 */
6783     *zero = 1 ^ will_have_0;
6784     return zero + *zero;
6785 }
6786
6787 PERL_STATIC_INLINE UV*
6788 S_invlist_array(pTHX_ SV* const invlist)
6789 {
6790     /* Returns the pointer to the inversion list's array.  Every time the
6791      * length changes, this needs to be called in case malloc or realloc moved
6792      * it */
6793
6794     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6795
6796     /* Must not be empty.  If these fail, you probably didn't check for <len>
6797      * being non-zero before trying to get the array */
6798     assert(*get_invlist_len_addr(invlist));
6799     assert(*get_invlist_zero_addr(invlist) == 0
6800            || *get_invlist_zero_addr(invlist) == 1);
6801
6802     /* The array begins either at the element reserved for zero if the
6803      * list contains 0 (that element will be set to 0), or otherwise the next
6804      * element (in which case the reserved element will be set to 1). */
6805     return (UV *) (get_invlist_zero_addr(invlist)
6806                    + *get_invlist_zero_addr(invlist));
6807 }
6808
6809 PERL_STATIC_INLINE UV*
6810 S_get_invlist_len_addr(pTHX_ SV* invlist)
6811 {
6812     /* Return the address of the UV that contains the current number
6813      * of used elements in the inversion list */
6814
6815     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6816
6817     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6818 }
6819
6820 PERL_STATIC_INLINE UV
6821 S_invlist_len(pTHX_ SV* const invlist)
6822 {
6823     /* Returns the current number of elements stored in the inversion list's
6824      * array */
6825
6826     PERL_ARGS_ASSERT_INVLIST_LEN;
6827
6828     return *get_invlist_len_addr(invlist);
6829 }
6830
6831 PERL_STATIC_INLINE void
6832 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6833 {
6834     /* Sets the current number of elements stored in the inversion list */
6835
6836     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6837
6838     *get_invlist_len_addr(invlist) = len;
6839
6840     assert(len <= SvLEN(invlist));
6841
6842     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6843     /* If the list contains U+0000, that element is part of the header,
6844      * and should not be counted as part of the array.  It will contain
6845      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6846      * subtract:
6847      *  SvCUR_set(invlist,
6848      *            TO_INTERNAL_SIZE(len
6849      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6850      * But, this is only valid if len is not 0.  The consequences of not doing
6851      * this is that the memory allocation code may think that 1 more UV is
6852      * being used than actually is, and so might do an unnecessary grow.  That
6853      * seems worth not bothering to make this the precise amount.
6854      *
6855      * Note that when inverting, SvCUR shouldn't change */
6856 }
6857
6858 PERL_STATIC_INLINE UV
6859 S_invlist_max(pTHX_ SV* const invlist)
6860 {
6861     /* Returns the maximum number of elements storable in the inversion list's
6862      * array, without having to realloc() */
6863
6864     PERL_ARGS_ASSERT_INVLIST_MAX;
6865
6866     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6867 }
6868
6869 PERL_STATIC_INLINE UV*
6870 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6871 {
6872     /* Return the address of the UV that is reserved to hold 0 if the inversion
6873      * list contains 0.  This has to be the last element of the heading, as the
6874      * list proper starts with either it if 0, or the next element if not.
6875      * (But we force it to contain either 0 or 1) */
6876
6877     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6878
6879     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6880 }
6881
6882 #ifndef PERL_IN_XSUB_RE
6883 SV*
6884 Perl__new_invlist(pTHX_ IV initial_size)
6885 {
6886
6887     /* Return a pointer to a newly constructed inversion list, with enough
6888      * space to store 'initial_size' elements.  If that number is negative, a
6889      * system default is used instead */
6890
6891     SV* new_list;
6892
6893     if (initial_size < 0) {
6894         initial_size = INVLIST_INITIAL_LEN;
6895     }
6896
6897     /* Allocate the initial space */
6898     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6899     invlist_set_len(new_list, 0);
6900
6901     /* Force iterinit() to be used to get iteration to work */
6902     *get_invlist_iter_addr(new_list) = UV_MAX;
6903
6904     /* This should force a segfault if a method doesn't initialize this
6905      * properly */
6906     *get_invlist_zero_addr(new_list) = UV_MAX;
6907
6908     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6909 #if HEADER_LENGTH != 4
6910 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6911 #endif
6912
6913     return new_list;
6914 }
6915 #endif
6916
6917 STATIC SV*
6918 S__new_invlist_C_array(pTHX_ UV* list)
6919 {
6920     /* Return a pointer to a newly constructed inversion list, initialized to
6921      * point to <list>, which has to be in the exact correct inversion list
6922      * form, including internal fields.  Thus this is a dangerous routine that
6923      * should not be used in the wrong hands */
6924
6925     SV* invlist = newSV_type(SVt_PV);
6926
6927     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6928
6929     SvPV_set(invlist, (char *) list);
6930     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6931                                shouldn't touch it */
6932     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6933
6934     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6935         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6936     }
6937
6938     return invlist;
6939 }
6940
6941 STATIC void
6942 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6943 {
6944     /* Grow the maximum size of an inversion list */
6945
6946     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6947
6948     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6949 }
6950
6951 PERL_STATIC_INLINE void
6952 S_invlist_trim(pTHX_ SV* const invlist)
6953 {
6954     PERL_ARGS_ASSERT_INVLIST_TRIM;
6955
6956     /* Change the length of the inversion list to how many entries it currently
6957      * has */
6958
6959     SvPV_shrink_to_cur((SV *) invlist);
6960 }
6961
6962 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6963  * etc */
6964 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6965 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6966
6967 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6968
6969 STATIC void
6970 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6971 {
6972    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6973     * the end of the inversion list.  The range must be above any existing
6974     * ones. */
6975
6976     UV* array;
6977     UV max = invlist_max(invlist);
6978     UV len = invlist_len(invlist);
6979
6980     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6981
6982     if (len == 0) { /* Empty lists must be initialized */
6983         array = _invlist_array_init(invlist, start == 0);
6984     }
6985     else {
6986         /* Here, the existing list is non-empty. The current max entry in the
6987          * list is generally the first value not in the set, except when the
6988          * set extends to the end of permissible values, in which case it is
6989          * the first entry in that final set, and so this call is an attempt to
6990          * append out-of-order */
6991
6992         UV final_element = len - 1;
6993         array = invlist_array(invlist);
6994         if (array[final_element] > start
6995             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6996         {
6997             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6998                        array[final_element], start,
6999                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7000         }
7001
7002         /* Here, it is a legal append.  If the new range begins with the first
7003          * value not in the set, it is extending the set, so the new first
7004          * value not in the set is one greater than the newly extended range.
7005          * */
7006         if (array[final_element] == start) {
7007             if (end != UV_MAX) {
7008                 array[final_element] = end + 1;
7009             }
7010             else {
7011                 /* But if the end is the maximum representable on the machine,
7012                  * just let the range that this would extend to have no end */
7013                 invlist_set_len(invlist, len - 1);
7014             }
7015             return;
7016         }
7017     }
7018
7019     /* Here the new range doesn't extend any existing set.  Add it */
7020
7021     len += 2;   /* Includes an element each for the start and end of range */
7022
7023     /* If overflows the existing space, extend, which may cause the array to be
7024      * moved */
7025     if (max < len) {
7026         invlist_extend(invlist, len);
7027         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7028                                            failure in invlist_array() */
7029         array = invlist_array(invlist);
7030     }
7031     else {
7032         invlist_set_len(invlist, len);
7033     }
7034
7035     /* The next item on the list starts the range, the one after that is
7036      * one past the new range.  */
7037     array[len - 2] = start;
7038     if (end != UV_MAX) {
7039         array[len - 1] = end + 1;
7040     }
7041     else {
7042         /* But if the end is the maximum representable on the machine, just let
7043          * the range have no end */
7044         invlist_set_len(invlist, len - 1);
7045     }
7046 }
7047
7048 #ifndef PERL_IN_XSUB_RE
7049
7050 STATIC IV
7051 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7052 {
7053     /* Searches the inversion list for the entry that contains the input code
7054      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7055      * return value is the index into the list's array of the range that
7056      * contains <cp> */
7057
7058     IV low = 0;
7059     IV high = invlist_len(invlist);
7060     const UV * const array = invlist_array(invlist);
7061
7062     PERL_ARGS_ASSERT_INVLIST_SEARCH;
7063
7064     /* If list is empty or the code point is before the first element, return
7065      * failure. */
7066     if (high == 0 || cp < array[0]) {
7067         return -1;
7068     }
7069
7070     /* Binary search.  What we are looking for is <i> such that
7071      *  array[i] <= cp < array[i+1]
7072      * The loop below converges on the i+1. */
7073     while (low < high) {
7074         IV mid = (low + high) / 2;
7075         if (array[mid] <= cp) {
7076             low = mid + 1;
7077
7078             /* We could do this extra test to exit the loop early.
7079             if (cp < array[low]) {
7080                 return mid;
7081             }
7082             */
7083         }
7084         else { /* cp < array[mid] */
7085             high = mid;
7086         }
7087     }
7088
7089     return high - 1;
7090 }
7091
7092 void
7093 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7094 {
7095     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7096      * but is used when the swash has an inversion list.  This makes this much
7097      * faster, as it uses a binary search instead of a linear one.  This is
7098      * intimately tied to that function, and perhaps should be in utf8.c,
7099      * except it is intimately tied to inversion lists as well.  It assumes
7100      * that <swatch> is all 0's on input */
7101
7102     UV current = start;
7103     const IV len = invlist_len(invlist);
7104     IV i;
7105     const UV * array;
7106
7107     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7108
7109     if (len == 0) { /* Empty inversion list */
7110         return;
7111     }
7112
7113     array = invlist_array(invlist);
7114
7115     /* Find which element it is */
7116     i = invlist_search(invlist, start);
7117
7118     /* We populate from <start> to <end> */
7119     while (current < end) {
7120         UV upper;
7121
7122         /* The inversion list gives the results for every possible code point
7123          * after the first one in the list.  Only those ranges whose index is
7124          * even are ones that the inversion list matches.  For the odd ones,
7125          * and if the initial code point is not in the list, we have to skip
7126          * forward to the next element */
7127         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7128             i++;
7129             if (i >= len) { /* Finished if beyond the end of the array */
7130                 return;
7131             }
7132             current = array[i];
7133             if (current >= end) {   /* Finished if beyond the end of what we
7134                                        are populating */
7135                 return;
7136             }
7137         }
7138         assert(current >= start);
7139
7140         /* The current range ends one below the next one, except don't go past
7141          * <end> */
7142         i++;
7143         upper = (i < len && array[i] < end) ? array[i] : end;
7144
7145         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7146          * for each code point in it */
7147         for (; current < upper; current++) {
7148             const STRLEN offset = (STRLEN)(current - start);
7149             swatch[offset >> 3] |= 1 << (offset & 7);
7150         }
7151
7152         /* Quit if at the end of the list */
7153         if (i >= len) {
7154
7155             /* But first, have to deal with the highest possible code point on
7156              * the platform.  The previous code assumes that <end> is one
7157              * beyond where we want to populate, but that is impossible at the
7158              * platform's infinity, so have to handle it specially */
7159             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7160             {
7161                 const STRLEN offset = (STRLEN)(end - start);
7162                 swatch[offset >> 3] |= 1 << (offset & 7);
7163             }
7164             return;
7165         }
7166
7167         /* Advance to the next range, which will be for code points not in the
7168          * inversion list */
7169         current = array[i];
7170     }
7171
7172     return;
7173 }
7174
7175
7176 void
7177 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7178 {
7179     /* Take the union of two inversion lists and point <output> to it.  *output
7180      * should be defined upon input, and if it points to one of the two lists,
7181      * the reference count to that list will be decremented.  The first list,
7182      * <a>, may be NULL, in which case a copy of the second list is returned.
7183      * If <complement_b> is TRUE, the union is taken of the complement
7184      * (inversion) of <b> instead of b itself.
7185      *
7186      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7187      * Richard Gillam, published by Addison-Wesley, and explained at some
7188      * length there.  The preface says to incorporate its examples into your
7189      * code at your own risk.
7190      *
7191      * The algorithm is like a merge sort.
7192      *
7193      * XXX A potential performance improvement is to keep track as we go along
7194      * if only one of the inputs contributes to the result, meaning the other
7195      * is a subset of that one.  In that case, we can skip the final copy and
7196      * return the larger of the input lists, but then outside code might need
7197      * to keep track of whether to free the input list or not */
7198
7199     UV* array_a;    /* a's array */
7200     UV* array_b;
7201     UV len_a;       /* length of a's array */
7202     UV len_b;
7203
7204     SV* u;                      /* the resulting union */
7205     UV* array_u;
7206     UV len_u;
7207
7208     UV i_a = 0;             /* current index into a's array */
7209     UV i_b = 0;
7210     UV i_u = 0;
7211
7212     /* running count, as explained in the algorithm source book; items are
7213      * stopped accumulating and are output when the count changes to/from 0.
7214      * The count is incremented when we start a range that's in the set, and
7215      * decremented when we start a range that's not in the set.  So its range
7216      * is 0 to 2.  Only when the count is zero is something not in the set.
7217      */
7218     UV count = 0;
7219
7220     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7221     assert(a != b);
7222
7223     /* If either one is empty, the union is the other one */
7224     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7225         if (*output == a) {
7226             if (a != NULL) {
7227                 SvREFCNT_dec(a);
7228             }
7229         }
7230         if (*output != b) {
7231             *output = invlist_clone(b);
7232             if (complement_b) {
7233                 _invlist_invert(*output);
7234             }
7235         } /* else *output already = b; */
7236         return;
7237     }
7238     else if ((len_b = invlist_len(b)) == 0) {
7239         if (*output == b) {
7240             SvREFCNT_dec(b);
7241         }
7242
7243         /* The complement of an empty list is a list that has everything in it,
7244          * so the union with <a> includes everything too */
7245         if (complement_b) {
7246             if (a == *output) {
7247                 SvREFCNT_dec(a);
7248             }
7249             *output = _new_invlist(1);
7250             _append_range_to_invlist(*output, 0, UV_MAX);
7251         }
7252         else if (*output != a) {
7253             *output = invlist_clone(a);
7254         }
7255         /* else *output already = a; */
7256         return;
7257     }
7258
7259     /* Here both lists exist and are non-empty */
7260     array_a = invlist_array(a);
7261     array_b = invlist_array(b);
7262
7263     /* If are to take the union of 'a' with the complement of b, set it
7264      * up so are looking at b's complement. */
7265     if (complement_b) {
7266
7267         /* To complement, we invert: if the first element is 0, remove it.  To
7268          * do this, we just pretend the array starts one later, and clear the
7269          * flag as we don't have to do anything else later */
7270         if (array_b[0] == 0) {
7271             array_b++;
7272             len_b--;
7273             complement_b = FALSE;
7274         }
7275         else {
7276
7277             /* But if the first element is not zero, we unshift a 0 before the
7278              * array.  The data structure reserves a space for that 0 (which
7279              * should be a '1' right now), so physical shifting is unneeded,
7280              * but temporarily change that element to 0.  Before exiting the
7281              * routine, we must restore the element to '1' */
7282             array_b--;
7283             len_b++;
7284             array_b[0] = 0;
7285         }
7286     }
7287
7288     /* Size the union for the worst case: that the sets are completely
7289      * disjoint */
7290     u = _new_invlist(len_a + len_b);
7291
7292     /* Will contain U+0000 if either component does */
7293     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7294                                       || (len_b > 0 && array_b[0] == 0));
7295
7296     /* Go through each list item by item, stopping when exhausted one of
7297      * them */
7298     while (i_a < len_a && i_b < len_b) {
7299         UV cp;      /* The element to potentially add to the union's array */
7300         bool cp_in_set;   /* is it in the the input list's set or not */
7301
7302         /* We need to take one or the other of the two inputs for the union.
7303          * Since we are merging two sorted lists, we take the smaller of the
7304          * next items.  In case of a tie, we take the one that is in its set
7305          * first.  If we took one not in the set first, it would decrement the
7306          * count, possibly to 0 which would cause it to be output as ending the
7307          * range, and the next time through we would take the same number, and
7308          * output it again as beginning the next range.  By doing it the
7309          * opposite way, there is no possibility that the count will be
7310          * momentarily decremented to 0, and thus the two adjoining ranges will
7311          * be seamlessly merged.  (In a tie and both are in the set or both not
7312          * in the set, it doesn't matter which we take first.) */
7313         if (array_a[i_a] < array_b[i_b]
7314             || (array_a[i_a] == array_b[i_b]
7315                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7316         {
7317             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7318             cp= array_a[i_a++];
7319         }
7320         else {
7321             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7322             cp= array_b[i_b++];
7323         }
7324
7325         /* Here, have chosen which of the two inputs to look at.  Only output
7326          * if the running count changes to/from 0, which marks the
7327          * beginning/end of a range in that's in the set */
7328         if (cp_in_set) {
7329             if (count == 0) {
7330                 array_u[i_u++] = cp;
7331             }
7332             count++;
7333         }
7334         else {
7335             count--;
7336             if (count == 0) {
7337                 array_u[i_u++] = cp;
7338             }
7339         }
7340     }
7341
7342     /* Here, we are finished going through at least one of the lists, which
7343      * means there is something remaining in at most one.  We check if the list
7344      * that hasn't been exhausted is positioned such that we are in the middle
7345      * of a range in its set or not.  (i_a and i_b point to the element beyond
7346      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7347      * is potentially more to output.
7348      * There are four cases:
7349      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7350      *     in the union is entirely from the non-exhausted set.
7351      *  2) Both were in their sets, count is 2.  Nothing further should
7352      *     be output, as everything that remains will be in the exhausted
7353      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7354      *     that
7355      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7356      *     Nothing further should be output because the union includes
7357      *     everything from the exhausted set.  Not decrementing ensures that.
7358      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7359      *     decrementing to 0 insures that we look at the remainder of the
7360      *     non-exhausted set */
7361     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7362         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7363     {
7364         count--;
7365     }
7366
7367     /* The final length is what we've output so far, plus what else is about to
7368      * be output.  (If 'count' is non-zero, then the input list we exhausted
7369      * has everything remaining up to the machine's limit in its set, and hence
7370      * in the union, so there will be no further output. */
7371     len_u = i_u;
7372     if (count == 0) {
7373         /* At most one of the subexpressions will be non-zero */
7374         len_u += (len_a - i_a) + (len_b - i_b);
7375     }
7376
7377     /* Set result to final length, which can change the pointer to array_u, so
7378      * re-find it */
7379     if (len_u != invlist_len(u)) {
7380         invlist_set_len(u, len_u);
7381         invlist_trim(u);
7382         array_u = invlist_array(u);
7383     }
7384
7385     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7386      * the other) ended with everything above it not in its set.  That means
7387      * that the remaining part of the union is precisely the same as the
7388      * non-exhausted list, so can just copy it unchanged.  (If both list were
7389      * exhausted at the same time, then the operations below will be both 0.)
7390      */
7391     if (count == 0) {
7392         IV copy_count; /* At most one will have a non-zero copy count */
7393         if ((copy_count = len_a - i_a) > 0) {
7394             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7395         }
7396         else if ((copy_count = len_b - i_b) > 0) {
7397             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7398         }
7399     }
7400
7401     /*  We may be removing a reference to one of the inputs */
7402     if (a == *output || b == *output) {
7403         SvREFCNT_dec(*output);
7404     }
7405
7406     /* If we've changed b, restore it */
7407     if (complement_b) {
7408         array_b[0] = 1;
7409     }
7410
7411     *output = u;
7412     return;
7413 }
7414
7415 void
7416 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7417 {
7418     /* Take the intersection of two inversion lists and point <i> to it.  *i
7419      * should be defined upon input, and if it points to one of the two lists,
7420      * the reference count to that list will be decremented.
7421      * If <complement_b> is TRUE, the result will be the intersection of <a>
7422      * and the complement (or inversion) of <b> instead of <b> directly.
7423      *
7424      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7425      * Richard Gillam, published by Addison-Wesley, and explained at some
7426      * length there.  The preface says to incorporate its examples into your
7427      * code at your own risk.  In fact, it had bugs
7428      *
7429      * The algorithm is like a merge sort, and is essentially the same as the
7430      * union above
7431      */
7432
7433     UV* array_a;                /* a's array */
7434     UV* array_b;
7435     UV len_a;   /* length of a's array */
7436     UV len_b;
7437
7438     SV* r;                   /* the resulting intersection */
7439     UV* array_r;
7440     UV len_r;
7441
7442     UV i_a = 0;             /* current index into a's array */
7443     UV i_b = 0;
7444     UV i_r = 0;
7445
7446     /* running count, as explained in the algorithm source book; items are
7447      * stopped accumulating and are output when the count changes to/from 2.
7448      * The count is incremented when we start a range that's in the set, and
7449      * decremented when we start a range that's not in the set.  So its range
7450      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7451      */
7452     UV count = 0;
7453
7454     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7455     assert(a != b);
7456
7457     /* Special case if either one is empty */
7458     len_a = invlist_len(a);
7459     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7460
7461         if (len_a != 0 && complement_b) {
7462
7463             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7464              * be empty.  Here, also we are using 'b's complement, which hence
7465              * must be every possible code point.  Thus the intersection is
7466              * simply 'a'. */
7467             if (*i != a) {
7468                 *i = invlist_clone(a);
7469
7470                 if (*i == b) {
7471                     SvREFCNT_dec(b);
7472                 }
7473             }
7474             /* else *i is already 'a' */
7475             return;
7476         }
7477
7478         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7479          * intersection must be empty */
7480         if (*i == a) {
7481             SvREFCNT_dec(a);
7482         }
7483         else if (*i == b) {
7484             SvREFCNT_dec(b);
7485         }
7486         *i = _new_invlist(0);
7487         return;
7488     }
7489
7490     /* Here both lists exist and are non-empty */
7491     array_a = invlist_array(a);
7492     array_b = invlist_array(b);
7493
7494     /* If are to take the intersection of 'a' with the complement of b, set it
7495      * up so are looking at b's complement. */
7496     if (complement_b) {
7497
7498         /* To complement, we invert: if the first element is 0, remove it.  To
7499          * do this, we just pretend the array starts one later, and clear the
7500          * flag as we don't have to do anything else later */
7501         if (array_b[0] == 0) {
7502             array_b++;
7503             len_b--;
7504             complement_b = FALSE;
7505         }
7506         else {
7507
7508             /* But if the first element is not zero, we unshift a 0 before the
7509              * array.  The data structure reserves a space for that 0 (which
7510              * should be a '1' right now), so physical shifting is unneeded,
7511              * but temporarily change that element to 0.  Before exiting the
7512              * routine, we must restore the element to '1' */
7513             array_b--;
7514             len_b++;
7515             array_b[0] = 0;
7516         }
7517     }
7518
7519     /* Size the intersection for the worst case: that the intersection ends up
7520      * fragmenting everything to be completely disjoint */
7521     r= _new_invlist(len_a + len_b);
7522
7523     /* Will contain U+0000 iff both components do */
7524     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7525                                      && len_b > 0 && array_b[0] == 0);
7526
7527     /* Go through each list item by item, stopping when exhausted one of
7528      * them */
7529     while (i_a < len_a && i_b < len_b) {
7530         UV cp;      /* The element to potentially add to the intersection's
7531                        array */
7532         bool cp_in_set; /* Is it in the input list's set or not */
7533
7534         /* We need to take one or the other of the two inputs for the
7535          * intersection.  Since we are merging two sorted lists, we take the
7536          * smaller of the next items.  In case of a tie, we take the one that
7537          * is not in its set first (a difference from the union algorithm).  If
7538          * we took one in the set first, it would increment the count, possibly
7539          * to 2 which would cause it to be output as starting a range in the
7540          * intersection, and the next time through we would take that same
7541          * number, and output it again as ending the set.  By doing it the
7542          * opposite of this, there is no possibility that the count will be
7543          * momentarily incremented to 2.  (In a tie and both are in the set or
7544          * both not in the set, it doesn't matter which we take first.) */
7545         if (array_a[i_a] < array_b[i_b]
7546             || (array_a[i_a] == array_b[i_b]
7547                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7548         {
7549             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7550             cp= array_a[i_a++];
7551         }
7552         else {
7553             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7554             cp= array_b[i_b++];
7555         }
7556
7557         /* Here, have chosen which of the two inputs to look at.  Only output
7558          * if the running count changes to/from 2, which marks the
7559          * beginning/end of a range that's in the intersection */
7560         if (cp_in_set) {
7561             count++;
7562             if (count == 2) {
7563                 array_r[i_r++] = cp;
7564             }
7565         }
7566         else {
7567             if (count == 2) {
7568                 array_r[i_r++] = cp;
7569             }
7570             count--;
7571         }
7572     }
7573
7574     /* Here, we are finished going through at least one of the lists, which
7575      * means there is something remaining in at most one.  We check if the list
7576      * that has been exhausted is positioned such that we are in the middle
7577      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7578      * the ones we care about.)  There are four cases:
7579      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7580      *     nothing left in the intersection.
7581      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7582      *     above 2.  What should be output is exactly that which is in the
7583      *     non-exhausted set, as everything it has is also in the intersection
7584      *     set, and everything it doesn't have can't be in the intersection
7585      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7586      *     gets incremented to 2.  Like the previous case, the intersection is
7587      *     everything that remains in the non-exhausted set.
7588      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7589      *     remains 1.  And the intersection has nothing more. */
7590     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7591         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7592     {
7593         count++;
7594     }
7595
7596     /* The final length is what we've output so far plus what else is in the
7597      * intersection.  At most one of the subexpressions below will be non-zero */
7598     len_r = i_r;
7599     if (count >= 2) {
7600         len_r += (len_a - i_a) + (len_b - i_b);
7601     }
7602
7603     /* Set result to final length, which can change the pointer to array_r, so
7604      * re-find it */
7605     if (len_r != invlist_len(r)) {
7606         invlist_set_len(r, len_r);
7607         invlist_trim(r);
7608         array_r = invlist_array(r);
7609     }
7610
7611     /* Finish outputting any remaining */
7612     if (count >= 2) { /* At most one will have a non-zero copy count */
7613         IV copy_count;
7614         if ((copy_count = len_a - i_a) > 0) {
7615             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7616         }
7617         else if ((copy_count = len_b - i_b) > 0) {
7618             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7619         }
7620     }
7621
7622     /*  We may be removing a reference to one of the inputs */
7623     if (a == *i || b == *i) {
7624         SvREFCNT_dec(*i);
7625     }
7626
7627     /* If we've changed b, restore it */
7628     if (complement_b) {
7629         array_b[0] = 1;
7630     }
7631
7632     *i = r;
7633     return;
7634 }
7635
7636 SV*
7637 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7638 {
7639     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7640      * set.  A pointer to the inversion list is returned.  This may actually be
7641      * a new list, in which case the passed in one has been destroyed.  The
7642      * passed in inversion list can be NULL, in which case a new one is created
7643      * with just the one range in it */
7644
7645     SV* range_invlist;
7646     UV len;
7647
7648     if (invlist == NULL) {
7649         invlist = _new_invlist(2);
7650         len = 0;
7651     }
7652     else {
7653         len = invlist_len(invlist);
7654     }
7655
7656     /* If comes after the final entry, can just append it to the end */
7657     if (len == 0
7658         || start >= invlist_array(invlist)
7659                                     [invlist_len(invlist) - 1])
7660     {
7661         _append_range_to_invlist(invlist, start, end);
7662         return invlist;
7663     }
7664
7665     /* Here, can't just append things, create and return a new inversion list
7666      * which is the union of this range and the existing inversion list */
7667     range_invlist = _new_invlist(2);
7668     _append_range_to_invlist(range_invlist, start, end);
7669
7670     _invlist_union(invlist, range_invlist, &invlist);
7671
7672     /* The temporary can be freed */
7673     SvREFCNT_dec(range_invlist);
7674
7675     return invlist;
7676 }
7677
7678 #endif
7679
7680 PERL_STATIC_INLINE SV*
7681 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7682     return _add_range_to_invlist(invlist, cp, cp);
7683 }
7684
7685 #ifndef PERL_IN_XSUB_RE
7686 void
7687 Perl__invlist_invert(pTHX_ SV* const invlist)
7688 {
7689     /* Complement the input inversion list.  This adds a 0 if the list didn't
7690      * have a zero; removes it otherwise.  As described above, the data
7691      * structure is set up so that this is very efficient */
7692
7693     UV* len_pos = get_invlist_len_addr(invlist);
7694
7695     PERL_ARGS_ASSERT__INVLIST_INVERT;
7696
7697     /* The inverse of matching nothing is matching everything */
7698     if (*len_pos == 0) {
7699         _append_range_to_invlist(invlist, 0, UV_MAX);
7700         return;
7701     }
7702
7703     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7704      * zero element was a 0, so it is being removed, so the length decrements
7705      * by 1; and vice-versa.  SvCUR is unaffected */
7706     if (*get_invlist_zero_addr(invlist) ^= 1) {
7707         (*len_pos)--;
7708     }
7709     else {
7710         (*len_pos)++;
7711     }
7712 }
7713
7714 void
7715 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7716 {
7717     /* Complement the input inversion list (which must be a Unicode property,
7718      * all of which don't match above the Unicode maximum code point.)  And
7719      * Perl has chosen to not have the inversion match above that either.  This
7720      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7721      */
7722
7723     UV len;
7724     UV* array;
7725
7726     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7727
7728     _invlist_invert(invlist);
7729
7730     len = invlist_len(invlist);
7731
7732     if (len != 0) { /* If empty do nothing */
7733         array = invlist_array(invlist);
7734         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7735             /* Add 0x110000.  First, grow if necessary */
7736             len++;
7737             if (invlist_max(invlist) < len) {
7738                 invlist_extend(invlist, len);
7739                 array = invlist_array(invlist);
7740             }
7741             invlist_set_len(invlist, len);
7742             array[len - 1] = PERL_UNICODE_MAX + 1;
7743         }
7744         else {  /* Remove the 0x110000 */
7745             invlist_set_len(invlist, len - 1);
7746         }
7747     }
7748
7749     return;
7750 }
7751 #endif
7752
7753 PERL_STATIC_INLINE SV*
7754 S_invlist_clone(pTHX_ SV* const invlist)
7755 {
7756
7757     /* Return a new inversion list that is a copy of the input one, which is
7758      * unchanged */
7759
7760     /* Need to allocate extra space to accommodate Perl's addition of a
7761      * trailing NUL to SvPV's, since it thinks they are always strings */
7762     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7763     STRLEN length = SvCUR(invlist);
7764
7765     PERL_ARGS_ASSERT_INVLIST_CLONE;
7766
7767     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7768     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7769
7770     return new_invlist;
7771 }
7772
7773 PERL_STATIC_INLINE UV*
7774 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7775 {
7776     /* Return the address of the UV that contains the current iteration
7777      * position */
7778
7779     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7780
7781     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7782 }
7783
7784 PERL_STATIC_INLINE UV*
7785 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7786 {
7787     /* Return the address of the UV that contains the version id. */
7788
7789     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7790
7791     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7792 }
7793
7794 PERL_STATIC_INLINE void
7795 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7796 {
7797     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7798
7799     *get_invlist_iter_addr(invlist) = 0;
7800 }
7801
7802 STATIC bool
7803 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7804 {
7805     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7806      * This call sets in <*start> and <*end>, the next range in <invlist>.
7807      * Returns <TRUE> if successful and the next call will return the next
7808      * range; <FALSE> if was already at the end of the list.  If the latter,
7809      * <*start> and <*end> are unchanged, and the next call to this function
7810      * will start over at the beginning of the list */
7811
7812     UV* pos = get_invlist_iter_addr(invlist);
7813     UV len = invlist_len(invlist);
7814     UV *array;
7815
7816     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7817
7818     if (*pos >= len) {
7819         *pos = UV_MAX;  /* Force iternit() to be required next time */
7820         return FALSE;
7821     }
7822
7823     array = invlist_array(invlist);
7824
7825     *start = array[(*pos)++];
7826
7827     if (*pos >= len) {
7828         *end = UV_MAX;
7829     }
7830     else {
7831         *end = array[(*pos)++] - 1;
7832     }
7833
7834     return TRUE;
7835 }
7836
7837 #ifndef PERL_IN_XSUB_RE
7838 SV *
7839 Perl__invlist_contents(pTHX_ SV* const invlist)
7840 {
7841     /* Get the contents of an inversion list into a string SV so that they can
7842      * be printed out.  It uses the format traditionally done for debug tracing
7843      */
7844
7845     UV start, end;
7846     SV* output = newSVpvs("\n");
7847
7848     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7849
7850     invlist_iterinit(invlist);
7851     while (invlist_iternext(invlist, &start, &end)) {
7852         if (end == UV_MAX) {
7853             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7854         }
7855         else if (end != start) {
7856             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7857                     start,       end);
7858         }
7859         else {
7860             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7861         }
7862     }
7863
7864     return output;
7865 }
7866 #endif
7867
7868 #if 0
7869 void
7870 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7871 {
7872     /* Dumps out the ranges in an inversion list.  The string 'header'
7873      * if present is output on a line before the first range */
7874
7875     UV start, end;
7876
7877     if (header && strlen(header)) {
7878         PerlIO_printf(Perl_debug_log, "%s\n", header);
7879     }
7880     invlist_iterinit(invlist);
7881     while (invlist_iternext(invlist, &start, &end)) {
7882         if (end == UV_MAX) {
7883             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7884         }
7885         else {
7886             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7887         }
7888     }
7889 }
7890 #endif
7891
7892 #undef HEADER_LENGTH
7893 #undef INVLIST_INITIAL_LENGTH
7894 #undef TO_INTERNAL_SIZE
7895 #undef FROM_INTERNAL_SIZE
7896 #undef INVLIST_LEN_OFFSET
7897 #undef INVLIST_ZERO_OFFSET
7898 #undef INVLIST_ITER_OFFSET
7899 #undef INVLIST_VERSION_ID
7900
7901 /* End of inversion list object */
7902
7903 /*
7904  - reg - regular expression, i.e. main body or parenthesized thing
7905  *
7906  * Caller must absorb opening parenthesis.
7907  *
7908  * Combining parenthesis handling with the base level of regular expression
7909  * is a trifle forced, but the need to tie the tails of the branches to what
7910  * follows makes it hard to avoid.
7911  */
7912 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7913 #ifdef DEBUGGING
7914 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7915 #else
7916 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7917 #endif
7918
7919 STATIC regnode *
7920 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7921     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7922 {
7923     dVAR;
7924     register regnode *ret;              /* Will be the head of the group. */
7925     register regnode *br;
7926     register regnode *lastbr;
7927     register regnode *ender = NULL;
7928     register I32 parno = 0;
7929     I32 flags;
7930     U32 oregflags = RExC_flags;
7931     bool have_branch = 0;
7932     bool is_open = 0;
7933     I32 freeze_paren = 0;
7934     I32 after_freeze = 0;
7935
7936     /* for (?g), (?gc), and (?o) warnings; warning
7937        about (?c) will warn about (?g) -- japhy    */
7938
7939 #define WASTED_O  0x01
7940 #define WASTED_G  0x02
7941 #define WASTED_C  0x04
7942 #define WASTED_GC (0x02|0x04)
7943     I32 wastedflags = 0x00;
7944
7945     char * parse_start = RExC_parse; /* MJD */
7946     char * const oregcomp_parse = RExC_parse;
7947
7948     GET_RE_DEBUG_FLAGS_DECL;
7949
7950     PERL_ARGS_ASSERT_REG;
7951     DEBUG_PARSE("reg ");
7952
7953     *flagp = 0;                         /* Tentatively. */
7954
7955
7956     /* Make an OPEN node, if parenthesized. */
7957     if (paren) {
7958         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7959             char *start_verb = RExC_parse;
7960             STRLEN verb_len = 0;
7961             char *start_arg = NULL;
7962             unsigned char op = 0;
7963             int argok = 1;
7964             int internal_argval = 0; /* internal_argval is only useful if !argok */
7965             while ( *RExC_parse && *RExC_parse != ')' ) {
7966                 if ( *RExC_parse == ':' ) {
7967                     start_arg = RExC_parse + 1;
7968                     break;
7969                 }
7970                 RExC_parse++;
7971             }
7972             ++start_verb;
7973             verb_len = RExC_parse - start_verb;
7974             if ( start_arg ) {
7975                 RExC_parse++;
7976                 while ( *RExC_parse && *RExC_parse != ')' )
7977                     RExC_parse++;
7978                 if ( *RExC_parse != ')' )
7979                     vFAIL("Unterminated verb pattern argument");
7980                 if ( RExC_parse == start_arg )
7981                     start_arg = NULL;
7982             } else {
7983                 if ( *RExC_parse != ')' )
7984                     vFAIL("Unterminated verb pattern");
7985             }
7986
7987             switch ( *start_verb ) {
7988             case 'A':  /* (*ACCEPT) */
7989                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7990                     op = ACCEPT;
7991                     internal_argval = RExC_nestroot;
7992                 }
7993                 break;
7994             case 'C':  /* (*COMMIT) */
7995                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7996                     op = COMMIT;
7997                 break;
7998             case 'F':  /* (*FAIL) */
7999                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8000                     op = OPFAIL;
8001                     argok = 0;
8002                 }
8003                 break;
8004             case ':':  /* (*:NAME) */
8005             case 'M':  /* (*MARK:NAME) */
8006                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8007                     op = MARKPOINT;
8008                     argok = -1;
8009                 }
8010                 break;
8011             case 'P':  /* (*PRUNE) */
8012                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8013                     op = PRUNE;
8014                 break;
8015             case 'S':   /* (*SKIP) */
8016                 if ( memEQs(start_verb,verb_len,"SKIP") )
8017                     op = SKIP;
8018                 break;
8019             case 'T':  /* (*THEN) */
8020                 /* [19:06] <TimToady> :: is then */
8021                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8022                     op = CUTGROUP;
8023                     RExC_seen |= REG_SEEN_CUTGROUP;
8024                 }
8025                 break;
8026             }
8027             if ( ! op ) {
8028                 RExC_parse++;
8029                 vFAIL3("Unknown verb pattern '%.*s'",
8030                     verb_len, start_verb);
8031             }
8032             if ( argok ) {
8033                 if ( start_arg && internal_argval ) {
8034                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8035                         verb_len, start_verb);
8036                 } else if ( argok < 0 && !start_arg ) {
8037                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8038                         verb_len, start_verb);
8039                 } else {
8040                     ret = reganode(pRExC_state, op, internal_argval);
8041                     if ( ! internal_argval && ! SIZE_ONLY ) {
8042                         if (start_arg) {
8043                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8044                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8045                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8046                             ret->flags = 0;
8047                         } else {
8048                             ret->flags = 1;
8049                         }
8050                     }
8051                 }
8052                 if (!internal_argval)
8053                     RExC_seen |= REG_SEEN_VERBARG;
8054             } else if ( start_arg ) {
8055                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8056                         verb_len, start_verb);
8057             } else {
8058                 ret = reg_node(pRExC_state, op);
8059             }
8060             nextchar(pRExC_state);
8061             return ret;
8062         } else
8063         if (*RExC_parse == '?') { /* (?...) */
8064             bool is_logical = 0;
8065             const char * const seqstart = RExC_parse;
8066             bool has_use_defaults = FALSE;
8067
8068             RExC_parse++;
8069             paren = *RExC_parse++;
8070             ret = NULL;                 /* For look-ahead/behind. */
8071             switch (paren) {
8072
8073             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8074                 paren = *RExC_parse++;
8075                 if ( paren == '<')         /* (?P<...>) named capture */
8076                     goto named_capture;
8077                 else if (paren == '>') {   /* (?P>name) named recursion */
8078                     goto named_recursion;
8079                 }
8080                 else if (paren == '=') {   /* (?P=...)  named backref */
8081                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8082                        you change this make sure you change that */
8083                     char* name_start = RExC_parse;
8084                     U32 num = 0;
8085                     SV *sv_dat = reg_scan_name(pRExC_state,
8086                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8087                     if (RExC_parse == name_start || *RExC_parse != ')')
8088                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8089
8090                     if (!SIZE_ONLY) {
8091                         num = add_data( pRExC_state, 1, "S" );
8092                         RExC_rxi->data->data[num]=(void*)sv_dat;
8093                         SvREFCNT_inc_simple_void(sv_dat);
8094                     }
8095                     RExC_sawback = 1;
8096                     ret = reganode(pRExC_state,
8097                                    ((! FOLD)
8098                                      ? NREF
8099                                      : (MORE_ASCII_RESTRICTED)
8100                                        ? NREFFA
8101                                        : (AT_LEAST_UNI_SEMANTICS)
8102                                          ? NREFFU
8103                                          : (LOC)
8104                                            ? NREFFL
8105                                            : NREFF),
8106                                     num);
8107                     *flagp |= HASWIDTH;
8108
8109                     Set_Node_Offset(ret, parse_start+1);
8110                     Set_Node_Cur_Length(ret); /* MJD */
8111
8112                     nextchar(pRExC_state);
8113                     return ret;
8114                 }
8115                 RExC_parse++;
8116                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8117                 /*NOTREACHED*/
8118             case '<':           /* (?<...) */
8119                 if (*RExC_parse == '!')
8120                     paren = ',';
8121                 else if (*RExC_parse != '=')
8122               named_capture:
8123                 {               /* (?<...>) */
8124                     char *name_start;
8125                     SV *svname;
8126                     paren= '>';
8127             case '\'':          /* (?'...') */
8128                     name_start= RExC_parse;
8129                     svname = reg_scan_name(pRExC_state,
8130                         SIZE_ONLY ?  /* reverse test from the others */
8131                         REG_RSN_RETURN_NAME :
8132                         REG_RSN_RETURN_NULL);
8133                     if (RExC_parse == name_start) {
8134                         RExC_parse++;
8135                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8136                         /*NOTREACHED*/
8137                     }
8138                     if (*RExC_parse != paren)
8139                         vFAIL2("Sequence (?%c... not terminated",
8140                             paren=='>' ? '<' : paren);
8141                     if (SIZE_ONLY) {
8142                         HE *he_str;
8143                         SV *sv_dat = NULL;
8144                         if (!svname) /* shouldn't happen */
8145                             Perl_croak(aTHX_
8146                                 "panic: reg_scan_name returned NULL");
8147                         if (!RExC_paren_names) {
8148                             RExC_paren_names= newHV();
8149                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8150 #ifdef DEBUGGING
8151                             RExC_paren_name_list= newAV();
8152                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8153 #endif
8154                         }
8155                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8156                         if ( he_str )
8157                             sv_dat = HeVAL(he_str);
8158                         if ( ! sv_dat ) {
8159                             /* croak baby croak */
8160                             Perl_croak(aTHX_
8161                                 "panic: paren_name hash element allocation failed");
8162                         } else if ( SvPOK(sv_dat) ) {
8163                             /* (?|...) can mean we have dupes so scan to check
8164                                its already been stored. Maybe a flag indicating
8165                                we are inside such a construct would be useful,
8166                                but the arrays are likely to be quite small, so
8167                                for now we punt -- dmq */
8168                             IV count = SvIV(sv_dat);
8169                             I32 *pv = (I32*)SvPVX(sv_dat);
8170                             IV i;
8171                             for ( i = 0 ; i < count ; i++ ) {
8172                                 if ( pv[i] == RExC_npar ) {
8173                                     count = 0;
8174                                     break;
8175                                 }
8176                             }
8177                             if ( count ) {
8178                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8179                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8180                                 pv[count] = RExC_npar;
8181                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8182                             }
8183                         } else {
8184                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8185                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8186                             SvIOK_on(sv_dat);
8187                             SvIV_set(sv_dat, 1);
8188                         }
8189 #ifdef DEBUGGING
8190                         /* Yes this does cause a memory leak in debugging Perls */
8191                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8192                             SvREFCNT_dec(svname);
8193 #endif
8194
8195                         /*sv_dump(sv_dat);*/
8196                     }
8197                     nextchar(pRExC_state);
8198                     paren = 1;
8199                     goto capturing_parens;
8200                 }
8201                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8202                 RExC_in_lookbehind++;
8203                 RExC_parse++;
8204             case '=':           /* (?=...) */
8205                 RExC_seen_zerolen++;
8206                 break;
8207             case '!':           /* (?!...) */
8208                 RExC_seen_zerolen++;
8209                 if (*RExC_parse == ')') {
8210                     ret=reg_node(pRExC_state, OPFAIL);
8211                     nextchar(pRExC_state);
8212                     return ret;
8213                 }
8214                 break;
8215             case '|':           /* (?|...) */
8216                 /* branch reset, behave like a (?:...) except that
8217                    buffers in alternations share the same numbers */
8218                 paren = ':';
8219                 after_freeze = freeze_paren = RExC_npar;
8220                 break;
8221             case ':':           /* (?:...) */
8222             case '>':           /* (?>...) */
8223                 break;
8224             case '$':           /* (?$...) */
8225             case '@':           /* (?@...) */
8226                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8227                 break;
8228             case '#':           /* (?#...) */
8229                 while (*RExC_parse && *RExC_parse != ')')
8230                     RExC_parse++;
8231                 if (*RExC_parse != ')')
8232                     FAIL("Sequence (?#... not terminated");
8233                 nextchar(pRExC_state);
8234                 *flagp = TRYAGAIN;
8235                 return NULL;
8236             case '0' :           /* (?0) */
8237             case 'R' :           /* (?R) */
8238                 if (*RExC_parse != ')')
8239                     FAIL("Sequence (?R) not terminated");
8240                 ret = reg_node(pRExC_state, GOSTART);
8241                 *flagp |= POSTPONED;
8242                 nextchar(pRExC_state);
8243                 return ret;
8244                 /*notreached*/
8245             { /* named and numeric backreferences */
8246                 I32 num;
8247             case '&':            /* (?&NAME) */
8248                 parse_start = RExC_parse - 1;
8249               named_recursion:
8250                 {
8251                     SV *sv_dat = reg_scan_name(pRExC_state,
8252                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8253                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8254                 }
8255                 goto gen_recurse_regop;
8256                 /* NOT REACHED */
8257             case '+':
8258                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8259                     RExC_parse++;
8260                     vFAIL("Illegal pattern");
8261                 }
8262                 goto parse_recursion;
8263                 /* NOT REACHED*/
8264             case '-': /* (?-1) */
8265                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8266                     RExC_parse--; /* rewind to let it be handled later */
8267                     goto parse_flags;
8268                 }
8269                 /*FALLTHROUGH */
8270             case '1': case '2': case '3': case '4': /* (?1) */
8271             case '5': case '6': case '7': case '8': case '9':
8272                 RExC_parse--;
8273               parse_recursion:
8274                 num = atoi(RExC_parse);
8275                 parse_start = RExC_parse - 1; /* MJD */
8276                 if (*RExC_parse == '-')
8277                     RExC_parse++;
8278                 while (isDIGIT(*RExC_parse))
8279                         RExC_parse++;
8280                 if (*RExC_parse!=')')
8281                     vFAIL("Expecting close bracket");
8282
8283               gen_recurse_regop:
8284                 if ( paren == '-' ) {
8285                     /*
8286                     Diagram of capture buffer numbering.
8287                     Top line is the normal capture buffer numbers
8288                     Bottom line is the negative indexing as from
8289                     the X (the (?-2))
8290
8291                     +   1 2    3 4 5 X          6 7
8292                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8293                     -   5 4    3 2 1 X          x x
8294
8295                     */
8296                     num = RExC_npar + num;
8297                     if (num < 1)  {
8298                         RExC_parse++;
8299                         vFAIL("Reference to nonexistent group");
8300                     }
8301                 } else if ( paren == '+' ) {
8302                     num = RExC_npar + num - 1;
8303                 }
8304
8305                 ret = reganode(pRExC_state, GOSUB, num);
8306                 if (!SIZE_ONLY) {
8307                     if (num > (I32)RExC_rx->nparens) {
8308                         RExC_parse++;
8309                         vFAIL("Reference to nonexistent group");
8310                     }
8311                     ARG2L_SET( ret, RExC_recurse_count++);
8312                     RExC_emit++;
8313                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8314                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8315                 } else {
8316                     RExC_size++;
8317                 }
8318                 RExC_seen |= REG_SEEN_RECURSE;
8319                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8320                 Set_Node_Offset(ret, parse_start); /* MJD */
8321
8322                 *flagp |= POSTPONED;
8323                 nextchar(pRExC_state);
8324                 return ret;
8325             } /* named and numeric backreferences */
8326             /* NOT REACHED */
8327
8328             case '?':           /* (??...) */
8329                 is_logical = 1;
8330                 if (*RExC_parse != '{') {
8331                     RExC_parse++;
8332                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8333                     /*NOTREACHED*/
8334                 }
8335                 *flagp |= POSTPONED;
8336                 paren = *RExC_parse++;
8337                 /* FALL THROUGH */
8338             case '{':           /* (?{...}) */
8339             {
8340                 I32 count = 1;
8341                 U32 n = 0;
8342                 char c;
8343                 char *s = RExC_parse;
8344
8345                 RExC_seen_zerolen++;
8346                 RExC_seen |= REG_SEEN_EVAL;
8347
8348                 if (   pRExC_state->num_code_blocks
8349                     && pRExC_state->code_index < pRExC_state->num_code_blocks
8350                     && pRExC_state->code_blocks[pRExC_state->code_index].start
8351                         == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8352                             - RExC_start)
8353                 ) {
8354                     /* this is a pre-compiled literal (?{}) */
8355                     struct reg_code_block *cb =
8356                         &pRExC_state->code_blocks[pRExC_state->code_index];
8357                     RExC_parse = RExC_start + cb->end;
8358                     if (SIZE_ONLY)
8359                         RExC_seen_evals++;
8360                     else {
8361                         OP *o = cb->block;
8362                         if (cb->src_regex) {
8363                             n = add_data(pRExC_state, 2, "rl");
8364                             RExC_rxi->data->data[n] =
8365                                 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8366                         RExC_rxi->data->data[n+1] = (void*)o->op_next;
8367                         }
8368                         else {
8369                             n = add_data(pRExC_state, 1,
8370                                    (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8371                             RExC_rxi->data->data[n] = (void*)o->op_next;
8372                         }
8373                     }
8374                     pRExC_state->code_index++;
8375                 }
8376                 else {
8377                     while (count && (c = *RExC_parse)) {
8378                         if (c == '\\') {
8379                             if (RExC_parse[1])
8380                                 RExC_parse++;
8381                         }
8382                         else if (c == '{')
8383                             count++;
8384                         else if (c == '}')
8385                             count--;
8386                         RExC_parse++;
8387                     }
8388                     if (*RExC_parse != ')') {
8389                         RExC_parse = s;
8390                         vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
8391                     }
8392                     if (!SIZE_ONLY) {
8393                         PAD *pad;
8394                         OP_4tree *sop, *rop;
8395                         SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
8396
8397                         ENTER;
8398                         Perl_save_re_context(aTHX);
8399                         rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
8400                         sop->op_private |= OPpREFCOUNTED;
8401                         /* re_dup will OpREFCNT_inc */
8402                         OpREFCNT_set(sop, 1);
8403                         LEAVE;
8404
8405                         n = add_data(pRExC_state, 3, "nop");
8406                         RExC_rxi->data->data[n] = (void*)rop;
8407                         RExC_rxi->data->data[n+1] = (void*)sop;
8408                         RExC_rxi->data->data[n+2] = (void*)pad;
8409                         SvREFCNT_dec(sv);
8410                     }
8411                     else {                                              /* First pass */
8412                         if (PL_reginterp_cnt < ++RExC_seen_evals
8413                             && IN_PERL_RUNTIME)
8414                             /* No compiled RE interpolated, has runtime
8415                                components ===> unsafe.  */
8416                             FAIL("Eval-group not allowed at runtime, use re 'eval'");
8417                         if (PL_tainting && PL_tainted)
8418                             FAIL("Eval-group in insecure regular expression");
8419     #if PERL_VERSION > 8
8420                         if (IN_PERL_COMPILETIME)
8421                             PL_cv_has_eval = 1;
8422     #endif
8423                     }
8424                 }
8425                 nextchar(pRExC_state);
8426
8427                 if (is_logical) {
8428                     ret = reg_node(pRExC_state, LOGICAL);
8429                     if (!SIZE_ONLY)
8430                         ret->flags = 2;
8431                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
8432                     /* deal with the length of this later - MJD */
8433                     return ret;
8434                 }
8435                 ret = reganode(pRExC_state, EVAL, n);
8436                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8437                 Set_Node_Offset(ret, parse_start);
8438                 return ret;
8439             }
8440             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8441             {
8442                 int is_define= 0;
8443                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8444                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8445                         || RExC_parse[1] == '<'
8446                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8447                         I32 flag;
8448
8449                         ret = reg_node(pRExC_state, LOGICAL);
8450                         if (!SIZE_ONLY)
8451                             ret->flags = 1;
8452                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8453                         goto insert_if;
8454                     }
8455                 }
8456                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8457                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8458                 {
8459                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8460                     char *name_start= RExC_parse++;
8461                     U32 num = 0;
8462                     SV *sv_dat=reg_scan_name(pRExC_state,
8463                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8464                     if (RExC_parse == name_start || *RExC_parse != ch)
8465                         vFAIL2("Sequence (?(%c... not terminated",
8466                             (ch == '>' ? '<' : ch));
8467                     RExC_parse++;
8468                     if (!SIZE_ONLY) {
8469                         num = add_data( pRExC_state, 1, "S" );
8470                         RExC_rxi->data->data[num]=(void*)sv_dat;
8471                         SvREFCNT_inc_simple_void(sv_dat);
8472                     }
8473                     ret = reganode(pRExC_state,NGROUPP,num);
8474                     goto insert_if_check_paren;
8475                 }
8476                 else if (RExC_parse[0] == 'D' &&
8477                          RExC_parse[1] == 'E' &&
8478                          RExC_parse[2] == 'F' &&
8479                          RExC_parse[3] == 'I' &&
8480                          RExC_parse[4] == 'N' &&
8481                          RExC_parse[5] == 'E')
8482                 {
8483                     ret = reganode(pRExC_state,DEFINEP,0);
8484                     RExC_parse +=6 ;
8485                     is_define = 1;
8486                     goto insert_if_check_paren;
8487                 }
8488                 else if (RExC_parse[0] == 'R') {
8489                     RExC_parse++;
8490                     parno = 0;
8491                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8492                         parno = atoi(RExC_parse++);
8493                         while (isDIGIT(*RExC_parse))
8494                             RExC_parse++;
8495                     } else if (RExC_parse[0] == '&') {
8496                         SV *sv_dat;
8497                         RExC_parse++;
8498                         sv_dat = reg_scan_name(pRExC_state,
8499                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8500                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8501                     }
8502                     ret = reganode(pRExC_state,INSUBP,parno);
8503                     goto insert_if_check_paren;
8504                 }
8505                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8506                     /* (?(1)...) */
8507                     char c;
8508                     parno = atoi(RExC_parse++);
8509
8510                     while (isDIGIT(*RExC_parse))
8511                         RExC_parse++;
8512                     ret = reganode(pRExC_state, GROUPP, parno);
8513
8514                  insert_if_check_paren:
8515                     if ((c = *nextchar(pRExC_state)) != ')')
8516                         vFAIL("Switch condition not recognized");
8517                   insert_if:
8518                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8519                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8520                     if (br == NULL)
8521                         br = reganode(pRExC_state, LONGJMP, 0);
8522                     else
8523                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8524                     c = *nextchar(pRExC_state);
8525                     if (flags&HASWIDTH)
8526                         *flagp |= HASWIDTH;
8527                     if (c == '|') {
8528                         if (is_define)
8529                             vFAIL("(?(DEFINE)....) does not allow branches");
8530                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8531                         regbranch(pRExC_state, &flags, 1,depth+1);
8532                         REGTAIL(pRExC_state, ret, lastbr);
8533                         if (flags&HASWIDTH)
8534                             *flagp |= HASWIDTH;
8535                         c = *nextchar(pRExC_state);
8536                     }
8537                     else
8538                         lastbr = NULL;
8539                     if (c != ')')
8540                         vFAIL("Switch (?(condition)... contains too many branches");
8541                     ender = reg_node(pRExC_state, TAIL);
8542                     REGTAIL(pRExC_state, br, ender);
8543                     if (lastbr) {
8544                         REGTAIL(pRExC_state, lastbr, ender);
8545                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8546                     }
8547                     else
8548                         REGTAIL(pRExC_state, ret, ender);
8549                     RExC_size++; /* XXX WHY do we need this?!!
8550                                     For large programs it seems to be required
8551                                     but I can't figure out why. -- dmq*/
8552                     return ret;
8553                 }
8554                 else {
8555                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8556                 }
8557             }
8558             case 0:
8559                 RExC_parse--; /* for vFAIL to print correctly */
8560                 vFAIL("Sequence (? incomplete");
8561                 break;
8562             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8563                                        that follow */
8564                 has_use_defaults = TRUE;
8565                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8566                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8567                                                 ? REGEX_UNICODE_CHARSET
8568                                                 : REGEX_DEPENDS_CHARSET);
8569                 goto parse_flags;
8570             default:
8571                 --RExC_parse;
8572                 parse_flags:      /* (?i) */
8573             {
8574                 U32 posflags = 0, negflags = 0;
8575                 U32 *flagsp = &posflags;
8576                 char has_charset_modifier = '\0';
8577                 regex_charset cs = get_regex_charset(RExC_flags);
8578                 if (cs == REGEX_DEPENDS_CHARSET
8579                     && (RExC_utf8 || RExC_uni_semantics))
8580                 {
8581                     cs = REGEX_UNICODE_CHARSET;
8582                 }
8583
8584                 while (*RExC_parse) {
8585                     /* && strchr("iogcmsx", *RExC_parse) */
8586                     /* (?g), (?gc) and (?o) are useless here
8587                        and must be globally applied -- japhy */
8588                     switch (*RExC_parse) {
8589                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8590                     case LOCALE_PAT_MOD:
8591                         if (has_charset_modifier) {
8592                             goto excess_modifier;
8593                         }
8594                         else if (flagsp == &negflags) {
8595                             goto neg_modifier;
8596                         }
8597                         cs = REGEX_LOCALE_CHARSET;
8598                         has_charset_modifier = LOCALE_PAT_MOD;
8599                         RExC_contains_locale = 1;
8600                         break;
8601                     case UNICODE_PAT_MOD:
8602                         if (has_charset_modifier) {
8603                             goto excess_modifier;
8604                         }
8605                         else if (flagsp == &negflags) {
8606                             goto neg_modifier;
8607                         }
8608                         cs = REGEX_UNICODE_CHARSET;
8609                         has_charset_modifier = UNICODE_PAT_MOD;
8610                         break;
8611                     case ASCII_RESTRICT_PAT_MOD:
8612                         if (flagsp == &negflags) {
8613                             goto neg_modifier;
8614                         }
8615                         if (has_charset_modifier) {
8616                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8617                                 goto excess_modifier;
8618                             }
8619                             /* Doubled modifier implies more restricted */
8620                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8621                         }
8622                         else {
8623                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8624                         }
8625                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8626                         break;
8627                     case DEPENDS_PAT_MOD:
8628                         if (has_use_defaults) {
8629                             goto fail_modifiers;
8630                         }
8631                         else if (flagsp == &negflags) {
8632                             goto neg_modifier;
8633                         }
8634                         else if (has_charset_modifier) {
8635                             goto excess_modifier;
8636                         }
8637
8638                         /* The dual charset means unicode semantics if the
8639                          * pattern (or target, not known until runtime) are
8640                          * utf8, or something in the pattern indicates unicode
8641                          * semantics */
8642                         cs = (RExC_utf8 || RExC_uni_semantics)
8643                              ? REGEX_UNICODE_CHARSET
8644                              : REGEX_DEPENDS_CHARSET;
8645                         has_charset_modifier = DEPENDS_PAT_MOD;
8646                         break;
8647                     excess_modifier:
8648                         RExC_parse++;
8649                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8650                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8651                         }
8652                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8653                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8654                         }
8655                         else {
8656                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8657                         }
8658                         /*NOTREACHED*/
8659                     neg_modifier:
8660                         RExC_parse++;
8661                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8662                         /*NOTREACHED*/
8663                     case ONCE_PAT_MOD: /* 'o' */
8664                     case GLOBAL_PAT_MOD: /* 'g' */
8665                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8666                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8667                             if (! (wastedflags & wflagbit) ) {
8668                                 wastedflags |= wflagbit;
8669                                 vWARN5(
8670                                     RExC_parse + 1,
8671                                     "Useless (%s%c) - %suse /%c modifier",
8672                                     flagsp == &negflags ? "?-" : "?",
8673                                     *RExC_parse,
8674                                     flagsp == &negflags ? "don't " : "",
8675                                     *RExC_parse
8676                                 );
8677                             }
8678                         }
8679                         break;
8680
8681                     case CONTINUE_PAT_MOD: /* 'c' */
8682                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8683                             if (! (wastedflags & WASTED_C) ) {
8684                                 wastedflags |= WASTED_GC;
8685                                 vWARN3(
8686                                     RExC_parse + 1,
8687                                     "Useless (%sc) - %suse /gc modifier",
8688                                     flagsp == &negflags ? "?-" : "?",
8689                                     flagsp == &negflags ? "don't " : ""
8690                                 );
8691                             }
8692                         }
8693                         break;
8694                     case KEEPCOPY_PAT_MOD: /* 'p' */
8695                         if (flagsp == &negflags) {
8696                             if (SIZE_ONLY)
8697                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8698                         } else {
8699                             *flagsp |= RXf_PMf_KEEPCOPY;
8700                         }
8701                         break;
8702                     case '-':
8703                         /* A flag is a default iff it is following a minus, so
8704                          * if there is a minus, it means will be trying to
8705                          * re-specify a default which is an error */
8706                         if (has_use_defaults || flagsp == &negflags) {
8707             fail_modifiers:
8708                             RExC_parse++;
8709                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8710                             /*NOTREACHED*/
8711                         }
8712                         flagsp = &negflags;
8713                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8714                         break;
8715                     case ':':
8716                         paren = ':';
8717                         /*FALLTHROUGH*/
8718                     case ')':
8719                         RExC_flags |= posflags;
8720                         RExC_flags &= ~negflags;
8721                         set_regex_charset(&RExC_flags, cs);
8722                         if (paren != ':') {
8723                             oregflags |= posflags;
8724                             oregflags &= ~negflags;
8725                             set_regex_charset(&oregflags, cs);
8726                         }
8727                         nextchar(pRExC_state);
8728                         if (paren != ':') {
8729                             *flagp = TRYAGAIN;
8730                             return NULL;
8731                         } else {
8732                             ret = NULL;
8733                             goto parse_rest;
8734                         }
8735                         /*NOTREACHED*/
8736                     default:
8737                         RExC_parse++;
8738                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8739                         /*NOTREACHED*/
8740                     }
8741                     ++RExC_parse;
8742                 }
8743             }} /* one for the default block, one for the switch */
8744         }
8745         else {                  /* (...) */
8746           capturing_parens:
8747             parno = RExC_npar;
8748             RExC_npar++;
8749
8750             ret = reganode(pRExC_state, OPEN, parno);
8751             if (!SIZE_ONLY ){
8752                 if (!RExC_nestroot)
8753                     RExC_nestroot = parno;
8754                 if (RExC_seen & REG_SEEN_RECURSE
8755                     && !RExC_open_parens[parno-1])
8756                 {
8757                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8758                         "Setting open paren #%"IVdf" to %d\n",
8759                         (IV)parno, REG_NODE_NUM(ret)));
8760                     RExC_open_parens[parno-1]= ret;
8761                 }
8762             }
8763             Set_Node_Length(ret, 1); /* MJD */
8764             Set_Node_Offset(ret, RExC_parse); /* MJD */
8765             is_open = 1;
8766         }
8767     }
8768     else                        /* ! paren */
8769         ret = NULL;
8770
8771    parse_rest:
8772     /* Pick up the branches, linking them together. */
8773     parse_start = RExC_parse;   /* MJD */
8774     br = regbranch(pRExC_state, &flags, 1,depth+1);
8775
8776     /*     branch_len = (paren != 0); */
8777
8778     if (br == NULL)
8779         return(NULL);
8780     if (*RExC_parse == '|') {
8781         if (!SIZE_ONLY && RExC_extralen) {
8782             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8783         }
8784         else {                  /* MJD */
8785             reginsert(pRExC_state, BRANCH, br, depth+1);
8786             Set_Node_Length(br, paren != 0);
8787             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8788         }
8789         have_branch = 1;
8790         if (SIZE_ONLY)
8791             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8792     }
8793     else if (paren == ':') {
8794         *flagp |= flags&SIMPLE;
8795     }
8796     if (is_open) {                              /* Starts with OPEN. */
8797         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8798     }
8799     else if (paren != '?')              /* Not Conditional */
8800         ret = br;
8801     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8802     lastbr = br;
8803     while (*RExC_parse == '|') {
8804         if (!SIZE_ONLY && RExC_extralen) {
8805             ender = reganode(pRExC_state, LONGJMP,0);
8806             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8807         }
8808         if (SIZE_ONLY)
8809             RExC_extralen += 2;         /* Account for LONGJMP. */
8810         nextchar(pRExC_state);
8811         if (freeze_paren) {
8812             if (RExC_npar > after_freeze)
8813                 after_freeze = RExC_npar;
8814             RExC_npar = freeze_paren;
8815         }
8816         br = regbranch(pRExC_state, &flags, 0, depth+1);
8817
8818         if (br == NULL)
8819             return(NULL);
8820         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8821         lastbr = br;
8822         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8823     }
8824
8825     if (have_branch || paren != ':') {
8826         /* Make a closing node, and hook it on the end. */
8827         switch (paren) {
8828         case ':':
8829             ender = reg_node(pRExC_state, TAIL);
8830             break;
8831         case 1:
8832             ender = reganode(pRExC_state, CLOSE, parno);
8833             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8834                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8835                         "Setting close paren #%"IVdf" to %d\n",
8836                         (IV)parno, REG_NODE_NUM(ender)));
8837                 RExC_close_parens[parno-1]= ender;
8838                 if (RExC_nestroot == parno)
8839                     RExC_nestroot = 0;
8840             }
8841             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8842             Set_Node_Length(ender,1); /* MJD */
8843             break;
8844         case '<':
8845         case ',':
8846         case '=':
8847         case '!':
8848             *flagp &= ~HASWIDTH;
8849             /* FALL THROUGH */
8850         case '>':
8851             ender = reg_node(pRExC_state, SUCCEED);
8852             break;
8853         case 0:
8854             ender = reg_node(pRExC_state, END);
8855             if (!SIZE_ONLY) {
8856                 assert(!RExC_opend); /* there can only be one! */
8857                 RExC_opend = ender;
8858             }
8859             break;
8860         }
8861         DEBUG_PARSE_r(if (!SIZE_ONLY) {
8862             SV * const mysv_val1=sv_newmortal();
8863             SV * const mysv_val2=sv_newmortal();
8864             DEBUG_PARSE_MSG("lsbr");
8865             regprop(RExC_rx, mysv_val1, lastbr);
8866             regprop(RExC_rx, mysv_val2, ender);
8867             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8868                           SvPV_nolen_const(mysv_val1),
8869                           (IV)REG_NODE_NUM(lastbr),
8870                           SvPV_nolen_const(mysv_val2),
8871                           (IV)REG_NODE_NUM(ender),
8872                           (IV)(ender - lastbr)
8873             );
8874         });
8875         REGTAIL(pRExC_state, lastbr, ender);
8876
8877         if (have_branch && !SIZE_ONLY) {
8878             char is_nothing= 1;
8879             if (depth==1)
8880                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8881
8882             /* Hook the tails of the branches to the closing node. */
8883             for (br = ret; br; br = regnext(br)) {
8884                 const U8 op = PL_regkind[OP(br)];
8885                 if (op == BRANCH) {
8886                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8887                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
8888                         is_nothing= 0;
8889                 }
8890                 else if (op == BRANCHJ) {
8891                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8892                     /* for now we always disable this optimisation * /
8893                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
8894                     */
8895                         is_nothing= 0;
8896                 }
8897             }
8898             if (is_nothing) {
8899                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
8900                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8901                     SV * const mysv_val1=sv_newmortal();
8902                     SV * const mysv_val2=sv_newmortal();
8903                     DEBUG_PARSE_MSG("NADA");
8904                     regprop(RExC_rx, mysv_val1, ret);
8905                     regprop(RExC_rx, mysv_val2, ender);
8906                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8907                                   SvPV_nolen_const(mysv_val1),
8908                                   (IV)REG_NODE_NUM(ret),
8909                                   SvPV_nolen_const(mysv_val2),
8910                                   (IV)REG_NODE_NUM(ender),
8911                                   (IV)(ender - ret)
8912                     );
8913                 });
8914                 OP(br)= NOTHING;
8915                 if (OP(ender) == TAIL) {
8916                     NEXT_OFF(br)= 0;
8917                     RExC_emit= br + 1;
8918                 } else {
8919                     regnode *opt;
8920                     for ( opt= br + 1; opt < ender ; opt++ )
8921                         OP(opt)= OPTIMIZED;
8922                     NEXT_OFF(br)= ender - br;
8923                 }
8924             }
8925         }
8926     }
8927
8928     {
8929         const char *p;
8930         static const char parens[] = "=!<,>";
8931
8932         if (paren && (p = strchr(parens, paren))) {
8933             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8934             int flag = (p - parens) > 1;
8935
8936             if (paren == '>')
8937                 node = SUSPEND, flag = 0;
8938             reginsert(pRExC_state, node,ret, depth+1);
8939             Set_Node_Cur_Length(ret);
8940             Set_Node_Offset(ret, parse_start + 1);
8941             ret->flags = flag;
8942             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8943         }
8944     }
8945
8946     /* Check for proper termination. */
8947     if (paren) {
8948         RExC_flags = oregflags;
8949         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8950             RExC_parse = oregcomp_parse;
8951             vFAIL("Unmatched (");
8952         }
8953     }
8954     else if (!paren && RExC_parse < RExC_end) {
8955         if (*RExC_parse == ')') {
8956             RExC_parse++;
8957             vFAIL("Unmatched )");
8958         }
8959         else
8960             FAIL("Junk on end of regexp");      /* "Can't happen". */
8961         /* NOTREACHED */
8962     }
8963
8964     if (RExC_in_lookbehind) {
8965         RExC_in_lookbehind--;
8966     }
8967     if (after_freeze > RExC_npar)
8968         RExC_npar = after_freeze;
8969     return(ret);
8970 }
8971
8972 /*
8973  - regbranch - one alternative of an | operator
8974  *
8975  * Implements the concatenation operator.
8976  */
8977 STATIC regnode *
8978 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8979 {
8980     dVAR;
8981     register regnode *ret;
8982     register regnode *chain = NULL;
8983     register regnode *latest;
8984     I32 flags = 0, c = 0;
8985     GET_RE_DEBUG_FLAGS_DECL;
8986
8987     PERL_ARGS_ASSERT_REGBRANCH;
8988
8989     DEBUG_PARSE("brnc");
8990
8991     if (first)
8992         ret = NULL;
8993     else {
8994         if (!SIZE_ONLY && RExC_extralen)
8995             ret = reganode(pRExC_state, BRANCHJ,0);
8996         else {
8997             ret = reg_node(pRExC_state, BRANCH);
8998             Set_Node_Length(ret, 1);
8999         }
9000     }
9001
9002     if (!first && SIZE_ONLY)
9003         RExC_extralen += 1;                     /* BRANCHJ */
9004
9005     *flagp = WORST;                     /* Tentatively. */
9006
9007     RExC_parse--;
9008     nextchar(pRExC_state);
9009     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9010         flags &= ~TRYAGAIN;
9011         latest = regpiece(pRExC_state, &flags,depth+1);
9012         if (latest == NULL) {
9013             if (flags & TRYAGAIN)
9014                 continue;
9015             return(NULL);
9016         }
9017         else if (ret == NULL)
9018             ret = latest;
9019         *flagp |= flags&(HASWIDTH|POSTPONED);
9020         if (chain == NULL)      /* First piece. */
9021             *flagp |= flags&SPSTART;
9022         else {
9023             RExC_naughty++;
9024             REGTAIL(pRExC_state, chain, latest);
9025         }
9026         chain = latest;
9027         c++;
9028     }
9029     if (chain == NULL) {        /* Loop ran zero times. */
9030         chain = reg_node(pRExC_state, NOTHING);
9031         if (ret == NULL)
9032             ret = chain;
9033     }
9034     if (c == 1) {
9035         *flagp |= flags&SIMPLE;
9036     }
9037
9038     return ret;
9039 }
9040
9041 /*
9042  - regpiece - something followed by possible [*+?]
9043  *
9044  * Note that the branching code sequences used for ? and the general cases
9045  * of * and + are somewhat optimized:  they use the same NOTHING node as
9046  * both the endmarker for their branch list and the body of the last branch.
9047  * It might seem that this node could be dispensed with entirely, but the
9048  * endmarker role is not redundant.
9049  */
9050 STATIC regnode *
9051 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9052 {
9053     dVAR;
9054     register regnode *ret;
9055     register char op;
9056     register char *next;
9057     I32 flags;
9058     const char * const origparse = RExC_parse;
9059     I32 min;
9060     I32 max = REG_INFTY;
9061 #ifdef RE_TRACK_PATTERN_OFFSETS
9062     char *parse_start;
9063 #endif
9064     const char *maxpos = NULL;
9065     GET_RE_DEBUG_FLAGS_DECL;
9066
9067     PERL_ARGS_ASSERT_REGPIECE;
9068
9069     DEBUG_PARSE("piec");
9070
9071     ret = regatom(pRExC_state, &flags,depth+1);
9072     if (ret == NULL) {
9073         if (flags & TRYAGAIN)
9074             *flagp |= TRYAGAIN;
9075         return(NULL);
9076     }
9077
9078     op = *RExC_parse;
9079
9080     if (op == '{' && regcurly(RExC_parse)) {
9081         maxpos = NULL;
9082 #ifdef RE_TRACK_PATTERN_OFFSETS
9083         parse_start = RExC_parse; /* MJD */
9084 #endif
9085         next = RExC_parse + 1;
9086         while (isDIGIT(*next) || *next == ',') {
9087             if (*next == ',') {
9088                 if (maxpos)
9089                     break;
9090                 else
9091                     maxpos = next;
9092             }
9093             next++;
9094         }
9095         if (*next == '}') {             /* got one */
9096             if (!maxpos)
9097                 maxpos = next;
9098             RExC_parse++;
9099             min = atoi(RExC_parse);
9100             if (*maxpos == ',')
9101                 maxpos++;
9102             else
9103                 maxpos = RExC_parse;
9104             max = atoi(maxpos);
9105             if (!max && *maxpos != '0')
9106                 max = REG_INFTY;                /* meaning "infinity" */
9107             else if (max >= REG_INFTY)
9108                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9109             RExC_parse = next;
9110             nextchar(pRExC_state);
9111
9112         do_curly:
9113             if ((flags&SIMPLE)) {
9114                 RExC_naughty += 2 + RExC_naughty / 2;
9115                 reginsert(pRExC_state, CURLY, ret, depth+1);
9116                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9117                 Set_Node_Cur_Length(ret);
9118             }
9119             else {
9120                 regnode * const w = reg_node(pRExC_state, WHILEM);
9121
9122                 w->flags = 0;
9123                 REGTAIL(pRExC_state, ret, w);
9124                 if (!SIZE_ONLY && RExC_extralen) {
9125                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9126                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9127                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9128                 }
9129                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9130                                 /* MJD hk */
9131                 Set_Node_Offset(ret, parse_start+1);
9132                 Set_Node_Length(ret,
9133                                 op == '{' ? (RExC_parse - parse_start) : 1);
9134
9135                 if (!SIZE_ONLY && RExC_extralen)
9136                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9137                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9138                 if (SIZE_ONLY)
9139                     RExC_whilem_seen++, RExC_extralen += 3;
9140                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9141             }
9142             ret->flags = 0;
9143
9144             if (min > 0)
9145                 *flagp = WORST;
9146             if (max > 0)
9147                 *flagp |= HASWIDTH;
9148             if (max < min)
9149                 vFAIL("Can't do {n,m} with n > m");
9150             if (!SIZE_ONLY) {
9151                 ARG1_SET(ret, (U16)min);
9152                 ARG2_SET(ret, (U16)max);
9153             }
9154
9155             goto nest_check;
9156         }
9157     }
9158
9159     if (!ISMULT1(op)) {
9160         *flagp = flags;
9161         return(ret);
9162     }
9163
9164 #if 0                           /* Now runtime fix should be reliable. */
9165
9166     /* if this is reinstated, don't forget to put this back into perldiag:
9167
9168             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9169
9170            (F) The part of the regexp subject to either the * or + quantifier
9171            could match an empty string. The {#} shows in the regular
9172            expression about where the problem was discovered.
9173
9174     */
9175
9176     if (!(flags&HASWIDTH) && op != '?')
9177       vFAIL("Regexp *+ operand could be empty");
9178 #endif
9179
9180 #ifdef RE_TRACK_PATTERN_OFFSETS
9181     parse_start = RExC_parse;
9182 #endif
9183     nextchar(pRExC_state);
9184
9185     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9186
9187     if (op == '*' && (flags&SIMPLE)) {
9188         reginsert(pRExC_state, STAR, ret, depth+1);
9189         ret->flags = 0;
9190         RExC_naughty += 4;
9191     }
9192     else if (op == '*') {
9193         min = 0;
9194         goto do_curly;
9195     }
9196     else if (op == '+' && (flags&SIMPLE)) {
9197         reginsert(pRExC_state, PLUS, ret, depth+1);
9198         ret->flags = 0;
9199         RExC_naughty += 3;
9200     }
9201     else if (op == '+') {
9202         min = 1;
9203         goto do_curly;
9204     }
9205     else if (op == '?') {
9206         min = 0; max = 1;
9207         goto do_curly;
9208     }
9209   nest_check:
9210     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9211         ckWARN3reg(RExC_parse,
9212                    "%.*s matches null string many times",
9213                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9214                    origparse);
9215     }
9216
9217     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9218         nextchar(pRExC_state);
9219         reginsert(pRExC_state, MINMOD, ret, depth+1);
9220         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9221     }
9222 #ifndef REG_ALLOW_MINMOD_SUSPEND
9223     else
9224 #endif
9225     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9226         regnode *ender;
9227         nextchar(pRExC_state);
9228         ender = reg_node(pRExC_state, SUCCEED);
9229         REGTAIL(pRExC_state, ret, ender);
9230         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9231         ret->flags = 0;
9232         ender = reg_node(pRExC_state, TAIL);
9233         REGTAIL(pRExC_state, ret, ender);
9234         /*ret= ender;*/
9235     }
9236
9237     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9238         RExC_parse++;
9239         vFAIL("Nested quantifiers");
9240     }
9241
9242     return(ret);
9243 }
9244
9245
9246 /* reg_namedseq(pRExC_state,UVp, UV depth)
9247
9248    This is expected to be called by a parser routine that has
9249    recognized '\N' and needs to handle the rest. RExC_parse is
9250    expected to point at the first char following the N at the time
9251    of the call.
9252
9253    The \N may be inside (indicated by valuep not being NULL) or outside a
9254    character class.
9255
9256    \N may begin either a named sequence, or if outside a character class, mean
9257    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9258    attempted to decide which, and in the case of a named sequence converted it
9259    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9260    where c1... are the characters in the sequence.  For single-quoted regexes,
9261    the tokenizer passes the \N sequence through unchanged; this code will not
9262    attempt to determine this nor expand those.  The net effect is that if the
9263    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9264    signals that this \N occurrence means to match a non-newline.
9265
9266    Only the \N{U+...} form should occur in a character class, for the same
9267    reason that '.' inside a character class means to just match a period: it
9268    just doesn't make sense.
9269
9270    If valuep is non-null then it is assumed that we are parsing inside
9271    of a charclass definition and the first codepoint in the resolved
9272    string is returned via *valuep and the routine will return NULL.
9273    In this mode if a multichar string is returned from the charnames
9274    handler, a warning will be issued, and only the first char in the
9275    sequence will be examined. If the string returned is zero length
9276    then the value of *valuep is undefined and NON-NULL will
9277    be returned to indicate failure. (This will NOT be a valid pointer
9278    to a regnode.)
9279
9280    If valuep is null then it is assumed that we are parsing normal text and a
9281    new EXACT node is inserted into the program containing the resolved string,
9282    and a pointer to the new node is returned.  But if the string is zero length
9283    a NOTHING node is emitted instead.
9284
9285    On success RExC_parse is set to the char following the endbrace.
9286    Parsing failures will generate a fatal error via vFAIL(...)
9287  */
9288 STATIC regnode *
9289 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9290 {
9291     char * endbrace;    /* '}' following the name */
9292     regnode *ret = NULL;
9293     char* p;
9294
9295     GET_RE_DEBUG_FLAGS_DECL;
9296
9297     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9298
9299     GET_RE_DEBUG_FLAGS;
9300
9301     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9302      * modifier.  The other meaning does not */
9303     p = (RExC_flags & RXf_PMf_EXTENDED)
9304         ? regwhite( pRExC_state, RExC_parse )
9305         : RExC_parse;
9306
9307     /* Disambiguate between \N meaning a named character versus \N meaning
9308      * [^\n].  The former is assumed when it can't be the latter. */
9309     if (*p != '{' || regcurly(p)) {
9310         RExC_parse = p;
9311         if (valuep) {
9312             /* no bare \N in a charclass */
9313             vFAIL("\\N in a character class must be a named character: \\N{...}");
9314         }
9315         nextchar(pRExC_state);
9316         ret = reg_node(pRExC_state, REG_ANY);
9317         *flagp |= HASWIDTH|SIMPLE;
9318         RExC_naughty++;
9319         RExC_parse--;
9320         Set_Node_Length(ret, 1); /* MJD */
9321         return ret;
9322     }
9323
9324     /* Here, we have decided it should be a named sequence */
9325
9326     /* The test above made sure that the next real character is a '{', but
9327      * under the /x modifier, it could be separated by space (or a comment and
9328      * \n) and this is not allowed (for consistency with \x{...} and the
9329      * tokenizer handling of \N{NAME}). */
9330     if (*RExC_parse != '{') {
9331         vFAIL("Missing braces on \\N{}");
9332     }
9333
9334     RExC_parse++;       /* Skip past the '{' */
9335
9336     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9337         || ! (endbrace == RExC_parse            /* nothing between the {} */
9338               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9339                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9340     {
9341         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9342         vFAIL("\\N{NAME} must be resolved by the lexer");
9343     }
9344
9345     if (endbrace == RExC_parse) {   /* empty: \N{} */
9346         if (! valuep) {
9347             RExC_parse = endbrace + 1;
9348             return reg_node(pRExC_state,NOTHING);
9349         }
9350
9351         if (SIZE_ONLY) {
9352             ckWARNreg(RExC_parse,
9353                     "Ignoring zero length \\N{} in character class"
9354             );
9355             RExC_parse = endbrace + 1;
9356         }
9357         *valuep = 0;
9358         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9359     }
9360
9361     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
9362     RExC_parse += 2;    /* Skip past the 'U+' */
9363
9364     if (valuep) {   /* In a bracketed char class */
9365         /* We only pay attention to the first char of
9366         multichar strings being returned. I kinda wonder
9367         if this makes sense as it does change the behaviour
9368         from earlier versions, OTOH that behaviour was broken
9369         as well. XXX Solution is to recharacterize as
9370         [rest-of-class]|multi1|multi2... */
9371
9372         STRLEN length_of_hex;
9373         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9374             | PERL_SCAN_DISALLOW_PREFIX
9375             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9376
9377         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9378         if (endchar < endbrace) {
9379             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9380         }
9381
9382         length_of_hex = (STRLEN)(endchar - RExC_parse);
9383         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9384
9385         /* The tokenizer should have guaranteed validity, but it's possible to
9386          * bypass it by using single quoting, so check */
9387         if (length_of_hex == 0
9388             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9389         {
9390             RExC_parse += length_of_hex;        /* Includes all the valid */
9391             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9392                             ? UTF8SKIP(RExC_parse)
9393                             : 1;
9394             /* Guard against malformed utf8 */
9395             if (RExC_parse >= endchar) RExC_parse = endchar;
9396             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9397         }
9398
9399         RExC_parse = endbrace + 1;
9400         if (endchar == endbrace) return NULL;
9401
9402         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
9403     }
9404     else {      /* Not a char class */
9405
9406         /* What is done here is to convert this to a sub-pattern of the form
9407          * (?:\x{char1}\x{char2}...)
9408          * and then call reg recursively.  That way, it retains its atomicness,
9409          * while not having to worry about special handling that some code
9410          * points may have.  toke.c has converted the original Unicode values
9411          * to native, so that we can just pass on the hex values unchanged.  We
9412          * do have to set a flag to keep recoding from happening in the
9413          * recursion */
9414
9415         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9416         STRLEN len;
9417         char *endchar;      /* Points to '.' or '}' ending cur char in the input
9418                                stream */
9419         char *orig_end = RExC_end;
9420
9421         while (RExC_parse < endbrace) {
9422
9423             /* Code points are separated by dots.  If none, there is only one
9424              * code point, and is terminated by the brace */
9425             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9426
9427             /* Convert to notation the rest of the code understands */
9428             sv_catpv(substitute_parse, "\\x{");
9429             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9430             sv_catpv(substitute_parse, "}");
9431
9432             /* Point to the beginning of the next character in the sequence. */
9433             RExC_parse = endchar + 1;
9434         }
9435         sv_catpv(substitute_parse, ")");
9436
9437         RExC_parse = SvPV(substitute_parse, len);
9438
9439         /* Don't allow empty number */
9440         if (len < 8) {
9441             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9442         }
9443         RExC_end = RExC_parse + len;
9444
9445         /* The values are Unicode, and therefore not subject to recoding */
9446         RExC_override_recoding = 1;
9447
9448         ret = reg(pRExC_state, 1, flagp, depth+1);
9449
9450         RExC_parse = endbrace;
9451         RExC_end = orig_end;
9452         RExC_override_recoding = 0;
9453
9454         nextchar(pRExC_state);
9455     }
9456
9457     return ret;
9458 }
9459
9460
9461 /*
9462  * reg_recode
9463  *
9464  * It returns the code point in utf8 for the value in *encp.
9465  *    value: a code value in the source encoding
9466  *    encp:  a pointer to an Encode object
9467  *
9468  * If the result from Encode is not a single character,
9469  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9470  */
9471 STATIC UV
9472 S_reg_recode(pTHX_ const char value, SV **encp)
9473 {
9474     STRLEN numlen = 1;
9475     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9476     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9477     const STRLEN newlen = SvCUR(sv);
9478     UV uv = UNICODE_REPLACEMENT;
9479
9480     PERL_ARGS_ASSERT_REG_RECODE;
9481
9482     if (newlen)
9483         uv = SvUTF8(sv)
9484              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9485              : *(U8*)s;
9486
9487     if (!newlen || numlen != newlen) {
9488         uv = UNICODE_REPLACEMENT;
9489         *encp = NULL;
9490     }
9491     return uv;
9492 }
9493
9494
9495 /*
9496  - regatom - the lowest level
9497
9498    Try to identify anything special at the start of the pattern. If there
9499    is, then handle it as required. This may involve generating a single regop,
9500    such as for an assertion; or it may involve recursing, such as to
9501    handle a () structure.
9502
9503    If the string doesn't start with something special then we gobble up
9504    as much literal text as we can.
9505
9506    Once we have been able to handle whatever type of thing started the
9507    sequence, we return.
9508
9509    Note: we have to be careful with escapes, as they can be both literal
9510    and special, and in the case of \10 and friends can either, depending
9511    on context. Specifically there are two separate switches for handling
9512    escape sequences, with the one for handling literal escapes requiring
9513    a dummy entry for all of the special escapes that are actually handled
9514    by the other.
9515 */
9516
9517 STATIC regnode *
9518 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9519 {
9520     dVAR;
9521     register regnode *ret = NULL;
9522     I32 flags;
9523     char *parse_start = RExC_parse;
9524     U8 op;
9525     GET_RE_DEBUG_FLAGS_DECL;
9526     DEBUG_PARSE("atom");
9527     *flagp = WORST;             /* Tentatively. */
9528
9529     PERL_ARGS_ASSERT_REGATOM;
9530
9531 tryagain:
9532     switch ((U8)*RExC_parse) {
9533     case '^':
9534         RExC_seen_zerolen++;
9535         nextchar(pRExC_state);
9536         if (RExC_flags & RXf_PMf_MULTILINE)
9537             ret = reg_node(pRExC_state, MBOL);
9538         else if (RExC_flags & RXf_PMf_SINGLELINE)
9539             ret = reg_node(pRExC_state, SBOL);
9540         else
9541             ret = reg_node(pRExC_state, BOL);
9542         Set_Node_Length(ret, 1); /* MJD */
9543         break;
9544     case '$':
9545         nextchar(pRExC_state);
9546         if (*RExC_parse)
9547             RExC_seen_zerolen++;
9548         if (RExC_flags & RXf_PMf_MULTILINE)
9549             ret = reg_node(pRExC_state, MEOL);
9550         else if (RExC_flags & RXf_PMf_SINGLELINE)
9551             ret = reg_node(pRExC_state, SEOL);
9552         else
9553             ret = reg_node(pRExC_state, EOL);
9554         Set_Node_Length(ret, 1); /* MJD */
9555         break;
9556     case '.':
9557         nextchar(pRExC_state);
9558         if (RExC_flags & RXf_PMf_SINGLELINE)
9559             ret = reg_node(pRExC_state, SANY);
9560         else
9561             ret = reg_node(pRExC_state, REG_ANY);
9562         *flagp |= HASWIDTH|SIMPLE;
9563         RExC_naughty++;
9564         Set_Node_Length(ret, 1); /* MJD */
9565         break;
9566     case '[':
9567     {
9568         char * const oregcomp_parse = ++RExC_parse;
9569         ret = regclass(pRExC_state,depth+1);
9570         if (*RExC_parse != ']') {
9571             RExC_parse = oregcomp_parse;
9572             vFAIL("Unmatched [");
9573         }
9574         nextchar(pRExC_state);
9575         *flagp |= HASWIDTH|SIMPLE;
9576         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9577         break;
9578     }
9579     case '(':
9580         nextchar(pRExC_state);
9581         ret = reg(pRExC_state, 1, &flags,depth+1);
9582         if (ret == NULL) {
9583                 if (flags & TRYAGAIN) {
9584                     if (RExC_parse == RExC_end) {
9585                          /* Make parent create an empty node if needed. */
9586                         *flagp |= TRYAGAIN;
9587                         return(NULL);
9588                     }
9589                     goto tryagain;
9590                 }
9591                 return(NULL);
9592         }
9593         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9594         break;
9595     case '|':
9596     case ')':
9597         if (flags & TRYAGAIN) {
9598             *flagp |= TRYAGAIN;
9599             return NULL;
9600         }
9601         vFAIL("Internal urp");
9602                                 /* Supposed to be caught earlier. */
9603         break;
9604     case '?':
9605     case '+':
9606     case '*':
9607         RExC_parse++;
9608         vFAIL("Quantifier follows nothing");
9609         break;
9610     case '\\':
9611         /* Special Escapes
9612
9613            This switch handles escape sequences that resolve to some kind
9614            of special regop and not to literal text. Escape sequnces that
9615            resolve to literal text are handled below in the switch marked
9616            "Literal Escapes".
9617
9618            Every entry in this switch *must* have a corresponding entry
9619            in the literal escape switch. However, the opposite is not
9620            required, as the default for this switch is to jump to the
9621            literal text handling code.
9622         */
9623         switch ((U8)*++RExC_parse) {
9624         /* Special Escapes */
9625         case 'A':
9626             RExC_seen_zerolen++;
9627             ret = reg_node(pRExC_state, SBOL);
9628             *flagp |= SIMPLE;
9629             goto finish_meta_pat;
9630         case 'G':
9631             ret = reg_node(pRExC_state, GPOS);
9632             RExC_seen |= REG_SEEN_GPOS;
9633             *flagp |= SIMPLE;
9634             goto finish_meta_pat;
9635         case 'K':
9636             RExC_seen_zerolen++;
9637             ret = reg_node(pRExC_state, KEEPS);
9638             *flagp |= SIMPLE;
9639             /* XXX:dmq : disabling in-place substitution seems to
9640              * be necessary here to avoid cases of memory corruption, as
9641              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9642              */
9643             RExC_seen |= REG_SEEN_LOOKBEHIND;
9644             goto finish_meta_pat;
9645         case 'Z':
9646             ret = reg_node(pRExC_state, SEOL);
9647             *flagp |= SIMPLE;
9648             RExC_seen_zerolen++;                /* Do not optimize RE away */
9649             goto finish_meta_pat;
9650         case 'z':
9651             ret = reg_node(pRExC_state, EOS);
9652             *flagp |= SIMPLE;
9653             RExC_seen_zerolen++;                /* Do not optimize RE away */
9654             goto finish_meta_pat;
9655         case 'C':
9656             ret = reg_node(pRExC_state, CANY);
9657             RExC_seen |= REG_SEEN_CANY;
9658             *flagp |= HASWIDTH|SIMPLE;
9659             goto finish_meta_pat;
9660         case 'X':
9661             ret = reg_node(pRExC_state, CLUMP);
9662             *flagp |= HASWIDTH;
9663             goto finish_meta_pat;
9664         case 'w':
9665             switch (get_regex_charset(RExC_flags)) {
9666                 case REGEX_LOCALE_CHARSET:
9667                     op = ALNUML;
9668                     break;
9669                 case REGEX_UNICODE_CHARSET:
9670                     op = ALNUMU;
9671                     break;
9672                 case REGEX_ASCII_RESTRICTED_CHARSET:
9673                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9674                     op = ALNUMA;
9675                     break;
9676                 case REGEX_DEPENDS_CHARSET:
9677                     op = ALNUM;
9678                     break;
9679                 default:
9680                     goto bad_charset;
9681             }
9682             ret = reg_node(pRExC_state, op);
9683             *flagp |= HASWIDTH|SIMPLE;
9684             goto finish_meta_pat;
9685         case 'W':
9686             switch (get_regex_charset(RExC_flags)) {
9687                 case REGEX_LOCALE_CHARSET:
9688                     op = NALNUML;
9689                     break;
9690                 case REGEX_UNICODE_CHARSET:
9691                     op = NALNUMU;
9692                     break;
9693                 case REGEX_ASCII_RESTRICTED_CHARSET:
9694                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9695                     op = NALNUMA;
9696                     break;
9697                 case REGEX_DEPENDS_CHARSET:
9698                     op = NALNUM;
9699                     break;
9700                 default:
9701                     goto bad_charset;
9702             }
9703             ret = reg_node(pRExC_state, op);
9704             *flagp |= HASWIDTH|SIMPLE;
9705             goto finish_meta_pat;
9706         case 'b':
9707             RExC_seen_zerolen++;
9708             RExC_seen |= REG_SEEN_LOOKBEHIND;
9709             switch (get_regex_charset(RExC_flags)) {
9710                 case REGEX_LOCALE_CHARSET:
9711                     op = BOUNDL;
9712                     break;
9713                 case REGEX_UNICODE_CHARSET:
9714                     op = BOUNDU;
9715                     break;
9716                 case REGEX_ASCII_RESTRICTED_CHARSET:
9717                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9718                     op = BOUNDA;
9719                     break;
9720                 case REGEX_DEPENDS_CHARSET:
9721                     op = BOUND;
9722                     break;
9723                 default:
9724                     goto bad_charset;
9725             }
9726             ret = reg_node(pRExC_state, op);
9727             FLAGS(ret) = get_regex_charset(RExC_flags);
9728             *flagp |= SIMPLE;
9729             goto finish_meta_pat;
9730         case 'B':
9731             RExC_seen_zerolen++;
9732             RExC_seen |= REG_SEEN_LOOKBEHIND;
9733             switch (get_regex_charset(RExC_flags)) {
9734                 case REGEX_LOCALE_CHARSET:
9735                     op = NBOUNDL;
9736                     break;
9737                 case REGEX_UNICODE_CHARSET:
9738                     op = NBOUNDU;
9739                     break;
9740                 case REGEX_ASCII_RESTRICTED_CHARSET:
9741                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9742                     op = NBOUNDA;
9743                     break;
9744                 case REGEX_DEPENDS_CHARSET:
9745                     op = NBOUND;
9746                     break;
9747                 default:
9748                     goto bad_charset;
9749             }
9750             ret = reg_node(pRExC_state, op);
9751             FLAGS(ret) = get_regex_charset(RExC_flags);
9752             *flagp |= SIMPLE;
9753             goto finish_meta_pat;
9754         case 's':
9755             switch (get_regex_charset(RExC_flags)) {
9756                 case REGEX_LOCALE_CHARSET:
9757                     op = SPACEL;
9758                     break;
9759                 case REGEX_UNICODE_CHARSET:
9760                     op = SPACEU;
9761                     break;
9762                 case REGEX_ASCII_RESTRICTED_CHARSET:
9763                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9764                     op = SPACEA;
9765                     break;
9766                 case REGEX_DEPENDS_CHARSET:
9767                     op = SPACE;
9768                     break;
9769                 default:
9770                     goto bad_charset;
9771             }
9772             ret = reg_node(pRExC_state, op);
9773             *flagp |= HASWIDTH|SIMPLE;
9774             goto finish_meta_pat;
9775         case 'S':
9776             switch (get_regex_charset(RExC_flags)) {
9777                 case REGEX_LOCALE_CHARSET:
9778                     op = NSPACEL;
9779                     break;
9780                 case REGEX_UNICODE_CHARSET:
9781                     op = NSPACEU;
9782                     break;
9783                 case REGEX_ASCII_RESTRICTED_CHARSET:
9784                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9785                     op = NSPACEA;
9786                     break;
9787                 case REGEX_DEPENDS_CHARSET:
9788                     op = NSPACE;
9789                     break;
9790                 default:
9791                     goto bad_charset;
9792             }
9793             ret = reg_node(pRExC_state, op);
9794             *flagp |= HASWIDTH|SIMPLE;
9795             goto finish_meta_pat;
9796         case 'd':
9797             switch (get_regex_charset(RExC_flags)) {
9798                 case REGEX_LOCALE_CHARSET:
9799                     op = DIGITL;
9800                     break;
9801                 case REGEX_ASCII_RESTRICTED_CHARSET:
9802                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9803                     op = DIGITA;
9804                     break;
9805                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9806                 case REGEX_UNICODE_CHARSET:
9807                     op = DIGIT;
9808                     break;
9809                 default:
9810                     goto bad_charset;
9811             }
9812             ret = reg_node(pRExC_state, op);
9813             *flagp |= HASWIDTH|SIMPLE;
9814             goto finish_meta_pat;
9815         case 'D':
9816             switch (get_regex_charset(RExC_flags)) {
9817                 case REGEX_LOCALE_CHARSET:
9818                     op = NDIGITL;
9819                     break;
9820                 case REGEX_ASCII_RESTRICTED_CHARSET:
9821                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9822                     op = NDIGITA;
9823                     break;
9824                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9825                 case REGEX_UNICODE_CHARSET:
9826                     op = NDIGIT;
9827                     break;
9828                 default:
9829                     goto bad_charset;
9830             }
9831             ret = reg_node(pRExC_state, op);
9832             *flagp |= HASWIDTH|SIMPLE;
9833             goto finish_meta_pat;
9834         case 'R':
9835             ret = reg_node(pRExC_state, LNBREAK);
9836             *flagp |= HASWIDTH|SIMPLE;
9837             goto finish_meta_pat;
9838         case 'h':
9839             ret = reg_node(pRExC_state, HORIZWS);
9840             *flagp |= HASWIDTH|SIMPLE;
9841             goto finish_meta_pat;
9842         case 'H':
9843             ret = reg_node(pRExC_state, NHORIZWS);
9844             *flagp |= HASWIDTH|SIMPLE;
9845             goto finish_meta_pat;
9846         case 'v':
9847             ret = reg_node(pRExC_state, VERTWS);
9848             *flagp |= HASWIDTH|SIMPLE;
9849             goto finish_meta_pat;
9850         case 'V':
9851             ret = reg_node(pRExC_state, NVERTWS);
9852             *flagp |= HASWIDTH|SIMPLE;
9853          finish_meta_pat:
9854             nextchar(pRExC_state);
9855             Set_Node_Length(ret, 2); /* MJD */
9856             break;
9857         case 'p':
9858         case 'P':
9859             {
9860                 char* const oldregxend = RExC_end;
9861 #ifdef DEBUGGING
9862                 char* parse_start = RExC_parse - 2;
9863 #endif
9864
9865                 if (RExC_parse[1] == '{') {
9866                   /* a lovely hack--pretend we saw [\pX] instead */
9867                     RExC_end = strchr(RExC_parse, '}');
9868                     if (!RExC_end) {
9869                         const U8 c = (U8)*RExC_parse;
9870                         RExC_parse += 2;
9871                         RExC_end = oldregxend;
9872                         vFAIL2("Missing right brace on \\%c{}", c);
9873                     }
9874                     RExC_end++;
9875                 }
9876                 else {
9877                     RExC_end = RExC_parse + 2;
9878                     if (RExC_end > oldregxend)
9879                         RExC_end = oldregxend;
9880                 }
9881                 RExC_parse--;
9882
9883                 ret = regclass(pRExC_state,depth+1);
9884
9885                 RExC_end = oldregxend;
9886                 RExC_parse--;
9887
9888                 Set_Node_Offset(ret, parse_start + 2);
9889                 Set_Node_Cur_Length(ret);
9890                 nextchar(pRExC_state);
9891                 *flagp |= HASWIDTH|SIMPLE;
9892             }
9893             break;
9894         case 'N':
9895             /* Handle \N and \N{NAME} here and not below because it can be
9896             multicharacter. join_exact() will join them up later on.
9897             Also this makes sure that things like /\N{BLAH}+/ and
9898             \N{BLAH} being multi char Just Happen. dmq*/
9899             ++RExC_parse;
9900             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9901             break;
9902         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9903         parse_named_seq:
9904         {
9905             char ch= RExC_parse[1];
9906             if (ch != '<' && ch != '\'' && ch != '{') {
9907                 RExC_parse++;
9908                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9909             } else {
9910                 /* this pretty much dupes the code for (?P=...) in reg(), if
9911                    you change this make sure you change that */
9912                 char* name_start = (RExC_parse += 2);
9913                 U32 num = 0;
9914                 SV *sv_dat = reg_scan_name(pRExC_state,
9915                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9916                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9917                 if (RExC_parse == name_start || *RExC_parse != ch)
9918                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9919
9920                 if (!SIZE_ONLY) {
9921                     num = add_data( pRExC_state, 1, "S" );
9922                     RExC_rxi->data->data[num]=(void*)sv_dat;
9923                     SvREFCNT_inc_simple_void(sv_dat);
9924                 }
9925
9926                 RExC_sawback = 1;
9927                 ret = reganode(pRExC_state,
9928                                ((! FOLD)
9929                                  ? NREF
9930                                  : (MORE_ASCII_RESTRICTED)
9931                                    ? NREFFA
9932                                    : (AT_LEAST_UNI_SEMANTICS)
9933                                      ? NREFFU
9934                                      : (LOC)
9935                                        ? NREFFL
9936                                        : NREFF),
9937                                 num);
9938                 *flagp |= HASWIDTH;
9939
9940                 /* override incorrect value set in reganode MJD */
9941                 Set_Node_Offset(ret, parse_start+1);
9942                 Set_Node_Cur_Length(ret); /* MJD */
9943                 nextchar(pRExC_state);
9944
9945             }
9946             break;
9947         }
9948         case 'g':
9949         case '1': case '2': case '3': case '4':
9950         case '5': case '6': case '7': case '8': case '9':
9951             {
9952                 I32 num;
9953                 bool isg = *RExC_parse == 'g';
9954                 bool isrel = 0;
9955                 bool hasbrace = 0;
9956                 if (isg) {
9957                     RExC_parse++;
9958                     if (*RExC_parse == '{') {
9959                         RExC_parse++;
9960                         hasbrace = 1;
9961                     }
9962                     if (*RExC_parse == '-') {
9963                         RExC_parse++;
9964                         isrel = 1;
9965                     }
9966                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9967                         if (isrel) RExC_parse--;
9968                         RExC_parse -= 2;
9969                         goto parse_named_seq;
9970                 }   }
9971                 num = atoi(RExC_parse);
9972                 if (isg && num == 0)
9973                     vFAIL("Reference to invalid group 0");
9974                 if (isrel) {
9975                     num = RExC_npar - num;
9976                     if (num < 1)
9977                         vFAIL("Reference to nonexistent or unclosed group");
9978                 }
9979                 if (!isg && num > 9 && num >= RExC_npar)
9980                     goto defchar;
9981                 else {
9982                     char * const parse_start = RExC_parse - 1; /* MJD */
9983                     while (isDIGIT(*RExC_parse))
9984                         RExC_parse++;
9985                     if (parse_start == RExC_parse - 1)
9986                         vFAIL("Unterminated \\g... pattern");
9987                     if (hasbrace) {
9988                         if (*RExC_parse != '}')
9989                             vFAIL("Unterminated \\g{...} pattern");
9990                         RExC_parse++;
9991                     }
9992                     if (!SIZE_ONLY) {
9993                         if (num > (I32)RExC_rx->nparens)
9994                             vFAIL("Reference to nonexistent group");
9995                     }
9996                     RExC_sawback = 1;
9997                     ret = reganode(pRExC_state,
9998                                    ((! FOLD)
9999                                      ? REF
10000                                      : (MORE_ASCII_RESTRICTED)
10001                                        ? REFFA
10002                                        : (AT_LEAST_UNI_SEMANTICS)
10003                                          ? REFFU
10004                                          : (LOC)
10005                                            ? REFFL
10006                                            : REFF),
10007                                     num);
10008                     *flagp |= HASWIDTH;
10009
10010                     /* override incorrect value set in reganode MJD */
10011                     Set_Node_Offset(ret, parse_start+1);
10012                     Set_Node_Cur_Length(ret); /* MJD */
10013                     RExC_parse--;
10014                     nextchar(pRExC_state);
10015                 }
10016             }
10017             break;
10018         case '\0':
10019             if (RExC_parse >= RExC_end)
10020                 FAIL("Trailing \\");
10021             /* FALL THROUGH */
10022         default:
10023             /* Do not generate "unrecognized" warnings here, we fall
10024                back into the quick-grab loop below */
10025             parse_start--;
10026             goto defchar;
10027         }
10028         break;
10029
10030     case '#':
10031         if (RExC_flags & RXf_PMf_EXTENDED) {
10032             if ( reg_skipcomment( pRExC_state ) )
10033                 goto tryagain;
10034         }
10035         /* FALL THROUGH */
10036
10037     default:
10038
10039             parse_start = RExC_parse - 1;
10040
10041             RExC_parse++;
10042
10043         defchar: {
10044             register STRLEN len;
10045             register UV ender;
10046             register char *p;
10047             char *s;
10048             STRLEN foldlen;
10049             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10050             U8 node_type;
10051
10052             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
10053              * it is folded to 'ss' even if not utf8 */
10054             bool is_exactfu_sharp_s;
10055
10056             ender = 0;
10057             node_type = ((! FOLD) ? EXACT
10058                         : (LOC)
10059                           ? EXACTFL
10060                           : (MORE_ASCII_RESTRICTED)
10061                             ? EXACTFA
10062                             : (AT_LEAST_UNI_SEMANTICS)
10063                               ? EXACTFU
10064                               : EXACTF);
10065             ret = reg_node(pRExC_state, node_type);
10066             s = STRING(ret);
10067
10068             /* XXX The node can hold up to 255 bytes, yet this only goes to
10069              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10070              * 255 allows us to not have to worry about overflow due to
10071              * converting to utf8 and fold expansion, but that value is
10072              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10073              * split up by this limit into a single one using the real max of
10074              * 255.  Even at 127, this breaks under rare circumstances.  If
10075              * folding, we do not want to split a node at a character that is a
10076              * non-final in a multi-char fold, as an input string could just
10077              * happen to want to match across the node boundary.  The join
10078              * would solve that problem if the join actually happens.  But a
10079              * series of more than two nodes in a row each of 127 would cause
10080              * the first join to succeed to get to 254, but then there wouldn't
10081              * be room for the next one, which could at be one of those split
10082              * multi-char folds.  I don't know of any fool-proof solution.  One
10083              * could back off to end with only a code point that isn't such a
10084              * non-final, but it is possible for there not to be any in the
10085              * entire node. */
10086             for (len = 0, p = RExC_parse - 1;
10087                  len < 127 && p < RExC_end;
10088                  len++)
10089             {
10090                 char * const oldp = p;
10091
10092                 if (RExC_flags & RXf_PMf_EXTENDED)
10093                     p = regwhite( pRExC_state, p );
10094                 switch ((U8)*p) {
10095                 case '^':
10096                 case '$':
10097                 case '.':
10098                 case '[':
10099                 case '(':
10100                 case ')':
10101                 case '|':
10102                     goto loopdone;
10103                 case '\\':
10104                     /* Literal Escapes Switch
10105
10106                        This switch is meant to handle escape sequences that
10107                        resolve to a literal character.
10108
10109                        Every escape sequence that represents something
10110                        else, like an assertion or a char class, is handled
10111                        in the switch marked 'Special Escapes' above in this
10112                        routine, but also has an entry here as anything that
10113                        isn't explicitly mentioned here will be treated as
10114                        an unescaped equivalent literal.
10115                     */
10116
10117                     switch ((U8)*++p) {
10118                     /* These are all the special escapes. */
10119                     case 'A':             /* Start assertion */
10120                     case 'b': case 'B':   /* Word-boundary assertion*/
10121                     case 'C':             /* Single char !DANGEROUS! */
10122                     case 'd': case 'D':   /* digit class */
10123                     case 'g': case 'G':   /* generic-backref, pos assertion */
10124                     case 'h': case 'H':   /* HORIZWS */
10125                     case 'k': case 'K':   /* named backref, keep marker */
10126                     case 'N':             /* named char sequence */
10127                     case 'p': case 'P':   /* Unicode property */
10128                               case 'R':   /* LNBREAK */
10129                     case 's': case 'S':   /* space class */
10130                     case 'v': case 'V':   /* VERTWS */
10131                     case 'w': case 'W':   /* word class */
10132                     case 'X':             /* eXtended Unicode "combining character sequence" */
10133                     case 'z': case 'Z':   /* End of line/string assertion */
10134                         --p;
10135                         goto loopdone;
10136
10137                     /* Anything after here is an escape that resolves to a
10138                        literal. (Except digits, which may or may not)
10139                      */
10140                     case 'n':
10141                         ender = '\n';
10142                         p++;
10143                         break;
10144                     case 'r':
10145                         ender = '\r';
10146                         p++;
10147                         break;
10148                     case 't':
10149                         ender = '\t';
10150                         p++;
10151                         break;
10152                     case 'f':
10153                         ender = '\f';
10154                         p++;
10155                         break;
10156                     case 'e':
10157                           ender = ASCII_TO_NATIVE('\033');
10158                         p++;
10159                         break;
10160                     case 'a':
10161                           ender = ASCII_TO_NATIVE('\007');
10162                         p++;
10163                         break;
10164                     case 'o':
10165                         {
10166                             STRLEN brace_len = len;
10167                             UV result;
10168                             const char* error_msg;
10169
10170                             bool valid = grok_bslash_o(p,
10171                                                        &result,
10172                                                        &brace_len,
10173                                                        &error_msg,
10174                                                        1);
10175                             p += brace_len;
10176                             if (! valid) {
10177                                 RExC_parse = p; /* going to die anyway; point
10178                                                    to exact spot of failure */
10179                                 vFAIL(error_msg);
10180                             }
10181                             else
10182                             {
10183                                 ender = result;
10184                             }
10185                             if (PL_encoding && ender < 0x100) {
10186                                 goto recode_encoding;
10187                             }
10188                             if (ender > 0xff) {
10189                                 REQUIRE_UTF8;
10190                             }
10191                             break;
10192                         }
10193                     case 'x':
10194                         if (*++p == '{') {
10195                             char* const e = strchr(p, '}');
10196
10197                             if (!e) {
10198                                 RExC_parse = p + 1;
10199                                 vFAIL("Missing right brace on \\x{}");
10200                             }
10201                             else {
10202                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10203                                     | PERL_SCAN_DISALLOW_PREFIX;
10204                                 STRLEN numlen = e - p - 1;
10205                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
10206                                 if (ender > 0xff)
10207                                     REQUIRE_UTF8;
10208                                 p = e + 1;
10209                             }
10210                         }
10211                         else {
10212                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10213                             STRLEN numlen = 2;
10214                             ender = grok_hex(p, &numlen, &flags, NULL);
10215                             p += numlen;
10216                         }
10217                         if (PL_encoding && ender < 0x100)
10218                             goto recode_encoding;
10219                         break;
10220                     case 'c':
10221                         p++;
10222                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10223                         break;
10224                     case '0': case '1': case '2': case '3':case '4':
10225                     case '5': case '6': case '7': case '8':case '9':
10226                         if (*p == '0' ||
10227                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10228                         {
10229                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10230                             STRLEN numlen = 3;
10231                             ender = grok_oct(p, &numlen, &flags, NULL);
10232                             if (ender > 0xff) {
10233                                 REQUIRE_UTF8;
10234                             }
10235                             p += numlen;
10236                         }
10237                         else {
10238                             --p;
10239                             goto loopdone;
10240                         }
10241                         if (PL_encoding && ender < 0x100)
10242                             goto recode_encoding;
10243                         break;
10244                     recode_encoding:
10245                         if (! RExC_override_recoding) {
10246                             SV* enc = PL_encoding;
10247                             ender = reg_recode((const char)(U8)ender, &enc);
10248                             if (!enc && SIZE_ONLY)
10249                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10250                             REQUIRE_UTF8;
10251                         }
10252                         break;
10253                     case '\0':
10254                         if (p >= RExC_end)
10255                             FAIL("Trailing \\");
10256                         /* FALL THROUGH */
10257                     default:
10258                         if (!SIZE_ONLY&& isALPHA(*p)) {
10259                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10260                         }
10261                         goto normal_default;
10262                     }
10263                     break;
10264                 case '{':
10265                     /* Currently we don't warn when the lbrace is at the start
10266                      * of a construct.  This catches it in the middle of a
10267                      * literal string, or when its the first thing after
10268                      * something like "\b" */
10269                     if (! SIZE_ONLY
10270                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10271                     {
10272                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10273                     }
10274                     /*FALLTHROUGH*/
10275                 default:
10276                   normal_default:
10277                     if (UTF8_IS_START(*p) && UTF) {
10278                         STRLEN numlen;
10279                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10280                                                &numlen, UTF8_ALLOW_DEFAULT);
10281                         p += numlen;
10282                     }
10283                     else
10284                         ender = (U8) *p++;
10285                     break;
10286                 } /* End of switch on the literal */
10287
10288                 is_exactfu_sharp_s = (node_type == EXACTFU
10289                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
10290                 if ( RExC_flags & RXf_PMf_EXTENDED)
10291                     p = regwhite( pRExC_state, p );
10292                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10293                     /* Prime the casefolded buffer.  Locale rules, which apply
10294                      * only to code points < 256, aren't known until execution,
10295                      * so for them, just output the original character using
10296                      * utf8.  If we start to fold non-UTF patterns, be sure to
10297                      * update join_exact() */
10298                     if (LOC && ender < 256) {
10299                         if (UNI_IS_INVARIANT(ender)) {
10300                             *tmpbuf = (U8) ender;
10301                             foldlen = 1;
10302                         } else {
10303                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10304                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10305                             foldlen = 2;
10306                         }
10307                     }
10308                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
10309                                                  */
10310                         ender = toLOWER(ender);
10311                         *tmpbuf = (U8) ender;
10312                         foldlen = 1;
10313                     }
10314                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10315
10316                         /* Locale and /aa require more selectivity about the
10317                          * fold, so are handled below.  Otherwise, here, just
10318                          * use the fold */
10319                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10320                     }
10321                     else {
10322                         /* Under locale rules or /aa we are not to mix,
10323                          * respectively, ords < 256 or ASCII with non-.  So
10324                          * reject folds that mix them, using only the
10325                          * non-folded code point.  So do the fold to a
10326                          * temporary, and inspect each character in it. */
10327                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10328                         U8* s = trialbuf;
10329                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10330                         U8* e = s + foldlen;
10331                         bool fold_ok = TRUE;
10332
10333                         while (s < e) {
10334                             if (isASCII(*s)
10335                                 || (LOC && (UTF8_IS_INVARIANT(*s)
10336                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
10337                             {
10338                                 fold_ok = FALSE;
10339                                 break;
10340                             }
10341                             s += UTF8SKIP(s);
10342                         }
10343                         if (fold_ok) {
10344                             Copy(trialbuf, tmpbuf, foldlen, U8);
10345                             ender = tmpender;
10346                         }
10347                         else {
10348                             uvuni_to_utf8(tmpbuf, ender);
10349                             foldlen = UNISKIP(ender);
10350                         }
10351                     }
10352                 }
10353                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10354                     if (len)
10355                         p = oldp;
10356                     else if (UTF || is_exactfu_sharp_s) {
10357                          if (FOLD) {
10358                               /* Emit all the Unicode characters. */
10359                               STRLEN numlen;
10360                               for (foldbuf = tmpbuf;
10361                                    foldlen;
10362                                    foldlen -= numlen) {
10363
10364                                    /* tmpbuf has been constructed by us, so we
10365                                     * know it is valid utf8 */
10366                                    ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10367                                    if (numlen > 0) {
10368                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
10369                                         s       += unilen;
10370                                         len     += unilen;
10371                                         /* In EBCDIC the numlen
10372                                          * and unilen can differ. */
10373                                         foldbuf += numlen;
10374                                         if (numlen >= foldlen)
10375                                              break;
10376                                    }
10377                                    else
10378                                         break; /* "Can't happen." */
10379                               }
10380                          }
10381                          else {
10382                               const STRLEN unilen = reguni(pRExC_state, ender, s);
10383                               if (unilen > 0) {
10384                                    s   += unilen;
10385                                    len += unilen;
10386                               }
10387                          }
10388                     }
10389                     else {
10390                         len++;
10391                         REGC((char)ender, s++);
10392                     }
10393                     break;
10394                 }
10395                 if (UTF || is_exactfu_sharp_s) {
10396                      if (FOLD) {
10397                           /* Emit all the Unicode characters. */
10398                           STRLEN numlen;
10399                           for (foldbuf = tmpbuf;
10400                                foldlen;
10401                                foldlen -= numlen) {
10402                                ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10403                                if (numlen > 0) {
10404                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10405                                     len     += unilen;
10406                                     s       += unilen;
10407                                     /* In EBCDIC the numlen
10408                                      * and unilen can differ. */
10409                                     foldbuf += numlen;
10410                                     if (numlen >= foldlen)
10411                                          break;
10412                                }
10413                                else
10414                                     break;
10415                           }
10416                      }
10417                      else {
10418                           const STRLEN unilen = reguni(pRExC_state, ender, s);
10419                           if (unilen > 0) {
10420                                s   += unilen;
10421                                len += unilen;
10422                           }
10423                      }
10424                      len--;
10425                 }
10426                 else {
10427                     REGC((char)ender, s++);
10428                 }
10429             }
10430         loopdone:   /* Jumped to when encounters something that shouldn't be in
10431                        the node */
10432             RExC_parse = p - 1;
10433             Set_Node_Cur_Length(ret); /* MJD */
10434             nextchar(pRExC_state);
10435             {
10436                 /* len is STRLEN which is unsigned, need to copy to signed */
10437                 IV iv = len;
10438                 if (iv < 0)
10439                     vFAIL("Internal disaster");
10440             }
10441             if (len > 0)
10442                 *flagp |= HASWIDTH;
10443             if (len == 1 && UNI_IS_INVARIANT(ender))
10444                 *flagp |= SIMPLE;
10445
10446             if (SIZE_ONLY)
10447                 RExC_size += STR_SZ(len);
10448             else {
10449                 STR_LEN(ret) = len;
10450                 RExC_emit += STR_SZ(len);
10451             }
10452         }
10453         break;
10454     }
10455
10456     return(ret);
10457
10458 /* Jumped to when an unrecognized character set is encountered */
10459 bad_charset:
10460     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10461     return(NULL);
10462 }
10463
10464 STATIC char *
10465 S_regwhite( RExC_state_t *pRExC_state, char *p )
10466 {
10467     const char *e = RExC_end;
10468
10469     PERL_ARGS_ASSERT_REGWHITE;
10470
10471     while (p < e) {
10472         if (isSPACE(*p))
10473             ++p;
10474         else if (*p == '#') {
10475             bool ended = 0;
10476             do {
10477                 if (*p++ == '\n') {
10478                     ended = 1;
10479                     break;
10480                 }
10481             } while (p < e);
10482             if (!ended)
10483                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10484         }
10485         else
10486             break;
10487     }
10488     return p;
10489 }
10490
10491 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10492    Character classes ([:foo:]) can also be negated ([:^foo:]).
10493    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10494    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10495    but trigger failures because they are currently unimplemented. */
10496
10497 #define POSIXCC_DONE(c)   ((c) == ':')
10498 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10499 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10500
10501 STATIC I32
10502 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10503 {
10504     dVAR;
10505     I32 namedclass = OOB_NAMEDCLASS;
10506
10507     PERL_ARGS_ASSERT_REGPPOSIXCC;
10508
10509     if (value == '[' && RExC_parse + 1 < RExC_end &&
10510         /* I smell either [: or [= or [. -- POSIX has been here, right? */
10511         POSIXCC(UCHARAT(RExC_parse))) {
10512         const char c = UCHARAT(RExC_parse);
10513         char* const s = RExC_parse++;
10514
10515         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10516             RExC_parse++;
10517         if (RExC_parse == RExC_end)
10518             /* Grandfather lone [:, [=, [. */
10519             RExC_parse = s;
10520         else {
10521             const char* const t = RExC_parse++; /* skip over the c */
10522             assert(*t == c);
10523
10524             if (UCHARAT(RExC_parse) == ']') {
10525                 const char *posixcc = s + 1;
10526                 RExC_parse++; /* skip over the ending ] */
10527
10528                 if (*s == ':') {
10529                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10530                     const I32 skip = t - posixcc;
10531
10532                     /* Initially switch on the length of the name.  */
10533                     switch (skip) {
10534                     case 4:
10535                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10536                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10537                         break;
10538                     case 5:
10539                         /* Names all of length 5.  */
10540                         /* alnum alpha ascii blank cntrl digit graph lower
10541                            print punct space upper  */
10542                         /* Offset 4 gives the best switch position.  */
10543                         switch (posixcc[4]) {
10544                         case 'a':
10545                             if (memEQ(posixcc, "alph", 4)) /* alpha */
10546                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10547                             break;
10548                         case 'e':
10549                             if (memEQ(posixcc, "spac", 4)) /* space */
10550                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10551                             break;
10552                         case 'h':
10553                             if (memEQ(posixcc, "grap", 4)) /* graph */
10554                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10555                             break;
10556                         case 'i':
10557                             if (memEQ(posixcc, "asci", 4)) /* ascii */
10558                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10559                             break;
10560                         case 'k':
10561                             if (memEQ(posixcc, "blan", 4)) /* blank */
10562                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10563                             break;
10564                         case 'l':
10565                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10566                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10567                             break;
10568                         case 'm':
10569                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
10570                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10571                             break;
10572                         case 'r':
10573                             if (memEQ(posixcc, "lowe", 4)) /* lower */
10574                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10575                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
10576                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10577                             break;
10578                         case 't':
10579                             if (memEQ(posixcc, "digi", 4)) /* digit */
10580                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10581                             else if (memEQ(posixcc, "prin", 4)) /* print */
10582                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10583                             else if (memEQ(posixcc, "punc", 4)) /* punct */
10584                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10585                             break;
10586                         }
10587                         break;
10588                     case 6:
10589                         if (memEQ(posixcc, "xdigit", 6))
10590                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10591                         break;
10592                     }
10593
10594                     if (namedclass == OOB_NAMEDCLASS)
10595                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10596                                       t - s - 1, s + 1);
10597                     assert (posixcc[skip] == ':');
10598                     assert (posixcc[skip+1] == ']');
10599                 } else if (!SIZE_ONLY) {
10600                     /* [[=foo=]] and [[.foo.]] are still future. */
10601
10602                     /* adjust RExC_parse so the warning shows after
10603                        the class closes */
10604                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10605                         RExC_parse++;
10606                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10607                 }
10608             } else {
10609                 /* Maternal grandfather:
10610                  * "[:" ending in ":" but not in ":]" */
10611                 RExC_parse = s;
10612             }
10613         }
10614     }
10615
10616     return namedclass;
10617 }
10618
10619 STATIC void
10620 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10621 {
10622     dVAR;
10623
10624     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10625
10626     if (POSIXCC(UCHARAT(RExC_parse))) {
10627         const char *s = RExC_parse;
10628         const char  c = *s++;
10629
10630         while (isALNUM(*s))
10631             s++;
10632         if (*s && c == *s && s[1] == ']') {
10633             ckWARN3reg(s+2,
10634                        "POSIX syntax [%c %c] belongs inside character classes",
10635                        c, c);
10636
10637             /* [[=foo=]] and [[.foo.]] are still future. */
10638             if (POSIXCC_NOTYET(c)) {
10639                 /* adjust RExC_parse so the error shows after
10640                    the class closes */
10641                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10642                     NOOP;
10643                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10644             }
10645         }
10646     }
10647 }
10648
10649 /* Generate the code to add a full posix character <class> to the bracketed
10650  * character class given by <node>.  (<node> is needed only under locale rules)
10651  * destlist     is the inversion list for non-locale rules that this class is
10652  *              to be added to
10653  * sourcelist   is the ASCII-range inversion list to add under /a rules
10654  * Xsourcelist  is the full Unicode range list to use otherwise. */
10655 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10656     if (LOC) {                                                             \
10657         SV* scratch_list = NULL;                                           \
10658                                                                            \
10659         /* Set this class in the node for runtime matching */              \
10660         ANYOF_CLASS_SET(node, class);                                      \
10661                                                                            \
10662         /* For above Latin1 code points, we use the full Unicode range */  \
10663         _invlist_intersection(PL_AboveLatin1,                              \
10664                               Xsourcelist,                                 \
10665                               &scratch_list);                              \
10666         /* And set the output to it, adding instead if there already is an \
10667          * output.  Checking if <destlist> is NULL first saves an extra    \
10668          * clone.  Its reference count will be decremented at the next     \
10669          * union, etc, or if this is the only instance, at the end of the  \
10670          * routine */                                                      \
10671         if (! destlist) {                                                  \
10672             destlist = scratch_list;                                       \
10673         }                                                                  \
10674         else {                                                             \
10675             _invlist_union(destlist, scratch_list, &destlist);             \
10676             SvREFCNT_dec(scratch_list);                                    \
10677         }                                                                  \
10678     }                                                                      \
10679     else {                                                                 \
10680         /* For non-locale, just add it to any existing list */             \
10681         _invlist_union(destlist,                                           \
10682                        (AT_LEAST_ASCII_RESTRICTED)                         \
10683                            ? sourcelist                                    \
10684                            : Xsourcelist,                                  \
10685                        &destlist);                                         \
10686     }
10687
10688 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10689  */
10690 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10691     if (LOC) {                                                             \
10692         SV* scratch_list = NULL;                                           \
10693         ANYOF_CLASS_SET(node, class);                                      \
10694         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10695         if (! destlist) {                                                  \
10696             destlist = scratch_list;                                       \
10697         }                                                                  \
10698         else {                                                             \
10699             _invlist_union(destlist, scratch_list, &destlist);             \
10700             SvREFCNT_dec(scratch_list);                                    \
10701         }                                                                  \
10702     }                                                                      \
10703     else {                                                                 \
10704         _invlist_union_complement_2nd(destlist,                            \
10705                                     (AT_LEAST_ASCII_RESTRICTED)            \
10706                                         ? sourcelist                       \
10707                                         : Xsourcelist,                     \
10708                                     &destlist);                            \
10709         /* Under /d, everything in the upper half of the Latin1 range      \
10710          * matches this complement */                                      \
10711         if (DEPENDS_SEMANTICS) {                                           \
10712             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10713         }                                                                  \
10714     }
10715
10716 /* Generate the code to add a posix character <class> to the bracketed
10717  * character class given by <node>.  (<node> is needed only under locale rules)
10718  * destlist       is the inversion list for non-locale rules that this class is
10719  *                to be added to
10720  * sourcelist     is the ASCII-range inversion list to add under /a rules
10721  * l1_sourcelist  is the Latin1 range list to use otherwise.
10722  * Xpropertyname  is the name to add to <run_time_list> of the property to
10723  *                specify the code points above Latin1 that will have to be
10724  *                determined at run-time
10725  * run_time_list  is a SV* that contains text names of properties that are to
10726  *                be computed at run time.  This concatenates <Xpropertyname>
10727  *                to it, apppropriately
10728  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10729  * time */
10730 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10731                               l1_sourcelist, Xpropertyname, run_time_list) \
10732         /* First, resolve whether to use the ASCII-only list or the L1     \
10733          * list */                                                         \
10734         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
10735                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10736                 Xpropertyname, run_time_list)
10737
10738 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10739                 Xpropertyname, run_time_list)                              \
10740     /* If not /a matching, there are going to be code points we will have  \
10741      * to defer to runtime to look-up */                                   \
10742     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10743         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10744     }                                                                      \
10745     if (LOC) {                                                             \
10746         ANYOF_CLASS_SET(node, class);                                      \
10747     }                                                                      \
10748     else {                                                                 \
10749         _invlist_union(destlist, sourcelist, &destlist);                   \
10750     }
10751
10752 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10753  * this and DO_N_POSIX */
10754 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
10755                               l1_sourcelist, Xpropertyname, run_time_list) \
10756     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
10757         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
10758     }                                                                      \
10759     else {                                                                 \
10760         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10761         if (LOC) {                                                         \
10762             ANYOF_CLASS_SET(node, namedclass);                             \
10763         }                                                                  \
10764         else {                                                             \
10765             SV* scratch_list = NULL;                                       \
10766             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
10767             if (! destlist) {                                              \
10768                 destlist = scratch_list;                                   \
10769             }                                                              \
10770             else {                                                         \
10771                 _invlist_union(destlist, scratch_list, &destlist);         \
10772                 SvREFCNT_dec(scratch_list);                                \
10773             }                                                              \
10774             if (DEPENDS_SEMANTICS) {                                       \
10775                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
10776             }                                                              \
10777         }                                                                  \
10778     }
10779
10780 STATIC U8
10781 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10782 {
10783
10784     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10785      * Locale folding is done at run-time, so this function should not be
10786      * called for nodes that are for locales.
10787      *
10788      * This function sets the bit corresponding to the fold of the input
10789      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10790      * 'F' is 'f'.
10791      *
10792      * It also knows about the characters that are in the bitmap that have
10793      * folds that are matchable only outside it, and sets the appropriate lists
10794      * and flags.
10795      *
10796      * It returns the number of bits that actually changed from 0 to 1 */
10797
10798     U8 stored = 0;
10799     U8 fold;
10800
10801     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10802
10803     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10804                                     : PL_fold[value];
10805
10806     /* It assumes the bit for 'value' has already been set */
10807     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10808         ANYOF_BITMAP_SET(node, fold);
10809         stored++;
10810     }
10811     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10812         /* Certain Latin1 characters have matches outside the bitmap.  To get
10813          * here, 'value' is one of those characters.   None of these matches is
10814          * valid for ASCII characters under /aa, which have been excluded by
10815          * the 'if' above.  The matches fall into three categories:
10816          * 1) They are singly folded-to or -from an above 255 character, as
10817          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10818          *    WITH DIAERESIS;
10819          * 2) They are part of a multi-char fold with another character in the
10820          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10821          * 3) They are part of a multi-char fold with a character not in the
10822          *    bitmap, such as various ligatures.
10823          * We aren't dealing fully with multi-char folds, except we do deal
10824          * with the pattern containing a character that has a multi-char fold
10825          * (not so much the inverse).
10826          * For types 1) and 3), the matches only happen when the target string
10827          * is utf8; that's not true for 2), and we set a flag for it.
10828          *
10829          * The code below adds to the passed in inversion list the single fold
10830          * closures for 'value'.  The values are hard-coded here so that an
10831          * innocent-looking character class, like /[ks]/i won't have to go out
10832          * to disk to find the possible matches.  XXX It would be better to
10833          * generate these via regen, in case a new version of the Unicode
10834          * standard adds new mappings, though that is not really likely. */
10835         switch (value) {
10836             case 'k':
10837             case 'K':
10838                 /* KELVIN SIGN */
10839                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10840                 break;
10841             case 's':
10842             case 'S':
10843                 /* LATIN SMALL LETTER LONG S */
10844                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10845                 break;
10846             case MICRO_SIGN:
10847                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10848                                                  GREEK_SMALL_LETTER_MU);
10849                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10850                                                  GREEK_CAPITAL_LETTER_MU);
10851                 break;
10852             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10853             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10854                 /* ANGSTROM SIGN */
10855                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10856                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10857                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10858                                                      PL_fold_latin1[value]);
10859                 }
10860                 break;
10861             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10862                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10863                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10864                 break;
10865             case LATIN_SMALL_LETTER_SHARP_S:
10866                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10867                                         LATIN_CAPITAL_LETTER_SHARP_S);
10868
10869                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10870                 if (! MORE_ASCII_RESTRICTED) {
10871                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10872
10873                     /* And under /u or /a, it can match even if the target is
10874                      * not utf8 */
10875                     if (AT_LEAST_UNI_SEMANTICS) {
10876                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10877                     }
10878                 }
10879                 break;
10880             case 'F': case 'f':
10881             case 'I': case 'i':
10882             case 'L': case 'l':
10883             case 'T': case 't':
10884             case 'A': case 'a':
10885             case 'H': case 'h':
10886             case 'J': case 'j':
10887             case 'N': case 'n':
10888             case 'W': case 'w':
10889             case 'Y': case 'y':
10890                 /* These all are targets of multi-character folds from code
10891                  * points that require UTF8 to express, so they can't match
10892                  * unless the target string is in UTF-8, so no action here is
10893                  * necessary, as regexec.c properly handles the general case
10894                  * for UTF-8 matching */
10895                 break;
10896             default:
10897                 /* Use deprecated warning to increase the chances of this
10898                  * being output */
10899                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10900                 break;
10901         }
10902     }
10903     else if (DEPENDS_SEMANTICS
10904             && ! isASCII(value)
10905             && PL_fold_latin1[value] != value)
10906     {
10907            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10908             * folds only when the target string is in UTF-8.  We add the fold
10909             * here to the list of things to match outside the bitmap, which
10910             * won't be looked at unless it is UTF8 (or else if something else
10911             * says to look even if not utf8, but those things better not happen
10912             * under DEPENDS semantics. */
10913         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10914     }
10915
10916     return stored;
10917 }
10918
10919
10920 PERL_STATIC_INLINE U8
10921 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10922 {
10923     /* This inline function sets a bit in the bitmap if not already set, and if
10924      * appropriate, its fold, returning the number of bits that actually
10925      * changed from 0 to 1 */
10926
10927     U8 stored;
10928
10929     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10930
10931     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10932         return 0;
10933     }
10934
10935     ANYOF_BITMAP_SET(node, value);
10936     stored = 1;
10937
10938     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10939         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10940     }
10941
10942     return stored;
10943 }
10944
10945 STATIC void
10946 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10947 {
10948     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10949      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10950      * the multi-character folds of characters in the node */
10951     SV *sv;
10952
10953     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10954
10955     if (! *alternate_ptr) {
10956         *alternate_ptr = newAV();
10957     }
10958     sv = newSVpvn_utf8((char*)string, len, TRUE);
10959     av_push(*alternate_ptr, sv);
10960     return;
10961 }
10962
10963 /*
10964    parse a class specification and produce either an ANYOF node that
10965    matches the pattern or perhaps will be optimized into an EXACTish node
10966    instead. The node contains a bit map for the first 256 characters, with the
10967    corresponding bit set if that character is in the list.  For characters
10968    above 255, a range list is used */
10969
10970 STATIC regnode *
10971 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10972 {
10973     dVAR;
10974     register UV nextvalue;
10975     register IV prevvalue = OOB_UNICODE;
10976     register IV range = 0;
10977     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10978     register regnode *ret;
10979     STRLEN numlen;
10980     IV namedclass;
10981     char *rangebegin = NULL;
10982     bool need_class = 0;
10983     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10984     SV *listsv = NULL;
10985     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10986                                       than just initialized.  */
10987     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10988     UV element_count = 0;   /* Number of distinct elements in the class.
10989                                Optimizations may be possible if this is tiny */
10990     UV n;
10991
10992     /* Unicode properties are stored in a swash; this holds the current one
10993      * being parsed.  If this swash is the only above-latin1 component of the
10994      * character class, an optimization is to pass it directly on to the
10995      * execution engine.  Otherwise, it is set to NULL to indicate that there
10996      * are other things in the class that have to be dealt with at execution
10997      * time */
10998     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10999
11000     /* Set if a component of this character class is user-defined; just passed
11001      * on to the engine */
11002     UV has_user_defined_property = 0;
11003
11004     /* code points this node matches that can't be stored in the bitmap */
11005     SV* nonbitmap = NULL;
11006
11007     /* The items that are to match that aren't stored in the bitmap, but are a
11008      * result of things that are stored there.  This is the fold closure of
11009      * such a character, either because it has DEPENDS semantics and shouldn't
11010      * be matched unless the target string is utf8, or is a code point that is
11011      * too large for the bit map, as for example, the fold of the MICRO SIGN is
11012      * above 255.  This all is solely for performance reasons.  By having this
11013      * code know the outside-the-bitmap folds that the bitmapped characters are
11014      * involved with, we don't have to go out to disk to find the list of
11015      * matches, unless the character class includes code points that aren't
11016      * storable in the bit map.  That means that a character class with an 's'
11017      * in it, for example, doesn't need to go out to disk to find everything
11018      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
11019      * empty unless there is something whose fold we don't know about, and will
11020      * have to go out to the disk to find. */
11021     SV* l1_fold_invlist = NULL;
11022
11023     /* List of multi-character folds that are matched by this node */
11024     AV* unicode_alternate  = NULL;
11025 #ifdef EBCDIC
11026     UV literal_endpoint = 0;
11027 #endif
11028     UV stored = 0;  /* how many chars stored in the bitmap */
11029
11030     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11031         case we need to change the emitted regop to an EXACT. */
11032     const char * orig_parse = RExC_parse;
11033     GET_RE_DEBUG_FLAGS_DECL;
11034
11035     PERL_ARGS_ASSERT_REGCLASS;
11036 #ifndef DEBUGGING
11037     PERL_UNUSED_ARG(depth);
11038 #endif
11039
11040     DEBUG_PARSE("clas");
11041
11042     /* Assume we are going to generate an ANYOF node. */
11043     ret = reganode(pRExC_state, ANYOF, 0);
11044
11045
11046     if (!SIZE_ONLY) {
11047         ANYOF_FLAGS(ret) = 0;
11048     }
11049
11050     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11051         RExC_naughty++;
11052         RExC_parse++;
11053         if (!SIZE_ONLY)
11054             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11055
11056         /* We have decided to not allow multi-char folds in inverted character
11057          * classes, due to the confusion that can happen, especially with
11058          * classes that are designed for a non-Unicode world:  You have the
11059          * peculiar case that:
11060             "s s" =~ /^[^\xDF]+$/i => Y
11061             "ss"  =~ /^[^\xDF]+$/i => N
11062          *
11063          * See [perl #89750] */
11064         allow_full_fold = FALSE;
11065     }
11066
11067     if (SIZE_ONLY) {
11068         RExC_size += ANYOF_SKIP;
11069         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11070     }
11071     else {
11072         RExC_emit += ANYOF_SKIP;
11073         if (LOC) {
11074             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11075         }
11076         ANYOF_BITMAP_ZERO(ret);
11077         listsv = newSVpvs("# comment\n");
11078         initial_listsv_len = SvCUR(listsv);
11079     }
11080
11081     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11082
11083     if (!SIZE_ONLY && POSIXCC(nextvalue))
11084         checkposixcc(pRExC_state);
11085
11086     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11087     if (UCHARAT(RExC_parse) == ']')
11088         goto charclassloop;
11089
11090 parseit:
11091     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11092
11093     charclassloop:
11094
11095         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11096
11097         if (!range) {
11098             rangebegin = RExC_parse;
11099             element_count++;
11100         }
11101         if (UTF) {
11102             value = utf8n_to_uvchr((U8*)RExC_parse,
11103                                    RExC_end - RExC_parse,
11104                                    &numlen, UTF8_ALLOW_DEFAULT);
11105             RExC_parse += numlen;
11106         }
11107         else
11108             value = UCHARAT(RExC_parse++);
11109
11110         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11111         if (value == '[' && POSIXCC(nextvalue))
11112             namedclass = regpposixcc(pRExC_state, value);
11113         else if (value == '\\') {
11114             if (UTF) {
11115                 value = utf8n_to_uvchr((U8*)RExC_parse,
11116                                    RExC_end - RExC_parse,
11117                                    &numlen, UTF8_ALLOW_DEFAULT);
11118                 RExC_parse += numlen;
11119             }
11120             else
11121                 value = UCHARAT(RExC_parse++);
11122             /* Some compilers cannot handle switching on 64-bit integer
11123              * values, therefore value cannot be an UV.  Yes, this will
11124              * be a problem later if we want switch on Unicode.
11125              * A similar issue a little bit later when switching on
11126              * namedclass. --jhi */
11127             switch ((I32)value) {
11128             case 'w':   namedclass = ANYOF_ALNUM;       break;
11129             case 'W':   namedclass = ANYOF_NALNUM;      break;
11130             case 's':   namedclass = ANYOF_SPACE;       break;
11131             case 'S':   namedclass = ANYOF_NSPACE;      break;
11132             case 'd':   namedclass = ANYOF_DIGIT;       break;
11133             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11134             case 'v':   namedclass = ANYOF_VERTWS;      break;
11135             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11136             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11137             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11138             case 'N':  /* Handle \N{NAME} in class */
11139                 {
11140                     /* We only pay attention to the first char of
11141                     multichar strings being returned. I kinda wonder
11142                     if this makes sense as it does change the behaviour
11143                     from earlier versions, OTOH that behaviour was broken
11144                     as well. */
11145                     UV v; /* value is register so we cant & it /grrr */
11146                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11147                         goto parseit;
11148                     }
11149                     value= v;
11150                 }
11151                 break;
11152             case 'p':
11153             case 'P':
11154                 {
11155                 char *e;
11156                 if (RExC_parse >= RExC_end)
11157                     vFAIL2("Empty \\%c{}", (U8)value);
11158                 if (*RExC_parse == '{') {
11159                     const U8 c = (U8)value;
11160                     e = strchr(RExC_parse++, '}');
11161                     if (!e)
11162                         vFAIL2("Missing right brace on \\%c{}", c);
11163                     while (isSPACE(UCHARAT(RExC_parse)))
11164                         RExC_parse++;
11165                     if (e == RExC_parse)
11166                         vFAIL2("Empty \\%c{}", c);
11167                     n = e - RExC_parse;
11168                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11169                         n--;
11170                 }
11171                 else {
11172                     e = RExC_parse;
11173                     n = 1;
11174                 }
11175                 if (!SIZE_ONLY) {
11176                     SV** invlistsvp;
11177                     SV* invlist;
11178                     char* name;
11179                     if (UCHARAT(RExC_parse) == '^') {
11180                          RExC_parse++;
11181                          n--;
11182                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11183                          while (isSPACE(UCHARAT(RExC_parse))) {
11184                               RExC_parse++;
11185                               n--;
11186                          }
11187                     }
11188                     /* Try to get the definition of the property into
11189                      * <invlist>.  If /i is in effect, the effective property
11190                      * will have its name be <__NAME_i>.  The design is
11191                      * discussed in commit
11192                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11193                     Newx(name, n + sizeof("_i__\n"), char);
11194
11195                     sprintf(name, "%s%.*s%s\n",
11196                                     (FOLD) ? "__" : "",
11197                                     (int)n,
11198                                     RExC_parse,
11199                                     (FOLD) ? "_i" : ""
11200                     );
11201
11202                     /* Look up the property name, and get its swash and
11203                      * inversion list, if the property is found  */
11204                     if (swash) {
11205                         SvREFCNT_dec(swash);
11206                     }
11207                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11208                                              1, /* binary */
11209                                              0, /* not tr/// */
11210                                              TRUE, /* this routine will handle
11211                                                       undefined properties */
11212                                              NULL, FALSE /* No inversion list */
11213                                             );
11214                     if (   ! swash
11215                         || ! SvROK(swash)
11216                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11217                         || ! (invlistsvp =
11218                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11219                                 "INVLIST", FALSE))
11220                         || ! (invlist = *invlistsvp))
11221                     {
11222                         if (swash) {
11223                             SvREFCNT_dec(swash);
11224                             swash = NULL;
11225                         }
11226
11227                         /* Here didn't find it.  It could be a user-defined
11228                          * property that will be available at run-time.  Add it
11229                          * to the list to look up then */
11230                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11231                                         (value == 'p' ? '+' : '!'),
11232                                         name);
11233                         has_user_defined_property = 1;
11234
11235                         /* We don't know yet, so have to assume that the
11236                          * property could match something in the Latin1 range,
11237                          * hence something that isn't utf8 */
11238                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11239                     }
11240                     else {
11241
11242                         /* Here, did get the swash and its inversion list.  If
11243                          * the swash is from a user-defined property, then this
11244                          * whole character class should be regarded as such */
11245                         SV** user_defined_svp =
11246                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
11247                                                         "USER_DEFINED", FALSE);
11248                         if (user_defined_svp) {
11249                             has_user_defined_property
11250                                                     |= SvUV(*user_defined_svp);
11251                         }
11252
11253                         /* Invert if asking for the complement */
11254                         if (value == 'P') {
11255                             _invlist_union_complement_2nd(properties, invlist, &properties);
11256
11257                             /* The swash can't be used as-is, because we've
11258                              * inverted things; delay removing it to here after
11259                              * have copied its invlist above */
11260                             SvREFCNT_dec(swash);
11261                             swash = NULL;
11262                         }
11263                         else {
11264                             _invlist_union(properties, invlist, &properties);
11265                         }
11266                     }
11267                     Safefree(name);
11268                 }
11269                 RExC_parse = e + 1;
11270                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11271
11272                 /* \p means they want Unicode semantics */
11273                 RExC_uni_semantics = 1;
11274                 }
11275                 break;
11276             case 'n':   value = '\n';                   break;
11277             case 'r':   value = '\r';                   break;
11278             case 't':   value = '\t';                   break;
11279             case 'f':   value = '\f';                   break;
11280             case 'b':   value = '\b';                   break;
11281             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11282             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11283             case 'o':
11284                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11285                 {
11286                     const char* error_msg;
11287                     bool valid = grok_bslash_o(RExC_parse,
11288                                                &value,
11289                                                &numlen,
11290                                                &error_msg,
11291                                                SIZE_ONLY);
11292                     RExC_parse += numlen;
11293                     if (! valid) {
11294                         vFAIL(error_msg);
11295                     }
11296                 }
11297                 if (PL_encoding && value < 0x100) {
11298                     goto recode_encoding;
11299                 }
11300                 break;
11301             case 'x':
11302                 if (*RExC_parse == '{') {
11303                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
11304                         | PERL_SCAN_DISALLOW_PREFIX;
11305                     char * const e = strchr(RExC_parse++, '}');
11306                     if (!e)
11307                         vFAIL("Missing right brace on \\x{}");
11308
11309                     numlen = e - RExC_parse;
11310                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11311                     RExC_parse = e + 1;
11312                 }
11313                 else {
11314                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
11315                     numlen = 2;
11316                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11317                     RExC_parse += numlen;
11318                 }
11319                 if (PL_encoding && value < 0x100)
11320                     goto recode_encoding;
11321                 break;
11322             case 'c':
11323                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11324                 break;
11325             case '0': case '1': case '2': case '3': case '4':
11326             case '5': case '6': case '7':
11327                 {
11328                     /* Take 1-3 octal digits */
11329                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11330                     numlen = 3;
11331                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11332                     RExC_parse += numlen;
11333                     if (PL_encoding && value < 0x100)
11334                         goto recode_encoding;
11335                     break;
11336                 }
11337             recode_encoding:
11338                 if (! RExC_override_recoding) {
11339                     SV* enc = PL_encoding;
11340                     value = reg_recode((const char)(U8)value, &enc);
11341                     if (!enc && SIZE_ONLY)
11342                         ckWARNreg(RExC_parse,
11343                                   "Invalid escape in the specified encoding");
11344                     break;
11345                 }
11346             default:
11347                 /* Allow \_ to not give an error */
11348                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11349                     ckWARN2reg(RExC_parse,
11350                                "Unrecognized escape \\%c in character class passed through",
11351                                (int)value);
11352                 }
11353                 break;
11354             }
11355         } /* end of \blah */
11356 #ifdef EBCDIC
11357         else
11358             literal_endpoint++;
11359 #endif
11360
11361         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11362
11363             /* What matches in a locale is not known until runtime, so need to
11364              * (one time per class) allocate extra space to pass to regexec.
11365              * The space will contain a bit for each named class that is to be
11366              * matched against.  This isn't needed for \p{} and pseudo-classes,
11367              * as they are not affected by locale, and hence are dealt with
11368              * separately */
11369             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11370                 need_class = 1;
11371                 if (SIZE_ONLY) {
11372                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11373                 }
11374                 else {
11375                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11376                     ANYOF_CLASS_ZERO(ret);
11377                 }
11378                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11379             }
11380
11381             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11382              * literal, as is the character that began the false range, i.e.
11383              * the 'a' in the examples */
11384             if (range) {
11385                 if (!SIZE_ONLY) {
11386                     const int w =
11387                         RExC_parse >= rangebegin ?
11388                         RExC_parse - rangebegin : 0;
11389                     ckWARN4reg(RExC_parse,
11390                                "False [] range \"%*.*s\"",
11391                                w, w, rangebegin);
11392
11393                     stored +=
11394                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11395                     if (prevvalue < 256) {
11396                         stored +=
11397                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11398                     }
11399                     else {
11400                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11401                     }
11402                 }
11403
11404                 range = 0; /* this was not a true range */
11405             }
11406
11407             if (!SIZE_ONLY) {
11408
11409                 /* Possible truncation here but in some 64-bit environments
11410                  * the compiler gets heartburn about switch on 64-bit values.
11411                  * A similar issue a little earlier when switching on value.
11412                  * --jhi */
11413                 switch ((I32)namedclass) {
11414
11415                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11416                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11417                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11418                     break;
11419                 case ANYOF_NALNUMC:
11420                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11421                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11422                     break;
11423                 case ANYOF_ALPHA:
11424                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11425                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11426                     break;
11427                 case ANYOF_NALPHA:
11428                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11429                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11430                     break;
11431                 case ANYOF_ASCII:
11432                     if (LOC) {
11433                         ANYOF_CLASS_SET(ret, namedclass);
11434                     }
11435                     else {
11436                         _invlist_union(properties, PL_ASCII, &properties);
11437                     }
11438                     break;
11439                 case ANYOF_NASCII:
11440                     if (LOC) {
11441                         ANYOF_CLASS_SET(ret, namedclass);
11442                     }
11443                     else {
11444                         _invlist_union_complement_2nd(properties,
11445                                                     PL_ASCII, &properties);
11446                         if (DEPENDS_SEMANTICS) {
11447                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11448                         }
11449                     }
11450                     break;
11451                 case ANYOF_BLANK:
11452                     DO_POSIX(ret, namedclass, properties,
11453                                             PL_PosixBlank, PL_XPosixBlank);
11454                     break;
11455                 case ANYOF_NBLANK:
11456                     DO_N_POSIX(ret, namedclass, properties,
11457                                             PL_PosixBlank, PL_XPosixBlank);
11458                     break;
11459                 case ANYOF_CNTRL:
11460                     DO_POSIX(ret, namedclass, properties,
11461                                             PL_PosixCntrl, PL_XPosixCntrl);
11462                     break;
11463                 case ANYOF_NCNTRL:
11464                     DO_N_POSIX(ret, namedclass, properties,
11465                                             PL_PosixCntrl, PL_XPosixCntrl);
11466                     break;
11467                 case ANYOF_DIGIT:
11468                     /* There are no digits in the Latin1 range outside of
11469                      * ASCII, so call the macro that doesn't have to resolve
11470                      * them */
11471                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11472                         PL_PosixDigit, "XPosixDigit", listsv);
11473                     break;
11474                 case ANYOF_NDIGIT:
11475                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11476                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11477                     break;
11478                 case ANYOF_GRAPH:
11479                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11480                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11481                     break;
11482                 case ANYOF_NGRAPH:
11483                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11484                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11485                     break;
11486                 case ANYOF_HORIZWS:
11487                     /* For these, we use the nonbitmap, as /d doesn't make a
11488                      * difference in what these match.  There would be problems
11489                      * if these characters had folds other than themselves, as
11490                      * nonbitmap is subject to folding.  It turns out that \h
11491                      * is just a synonym for XPosixBlank */
11492                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11493                     break;
11494                 case ANYOF_NHORIZWS:
11495                     _invlist_union_complement_2nd(nonbitmap,
11496                                                  PL_XPosixBlank, &nonbitmap);
11497                     break;
11498                 case ANYOF_LOWER:
11499                 case ANYOF_NLOWER:
11500                 {   /* These require special handling, as they differ under
11501                        folding, matching Cased there (which in the ASCII range
11502                        is the same as Alpha */
11503
11504                     SV* ascii_source;
11505                     SV* l1_source;
11506                     const char *Xname;
11507
11508                     if (FOLD && ! LOC) {
11509                         ascii_source = PL_PosixAlpha;
11510                         l1_source = PL_L1Cased;
11511                         Xname = "Cased";
11512                     }
11513                     else {
11514                         ascii_source = PL_PosixLower;
11515                         l1_source = PL_L1PosixLower;
11516                         Xname = "XPosixLower";
11517                     }
11518                     if (namedclass == ANYOF_LOWER) {
11519                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11520                                     ascii_source, l1_source, Xname, listsv);
11521                     }
11522                     else {
11523                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11524                             properties, ascii_source, l1_source, Xname, listsv);
11525                     }
11526                     break;
11527                 }
11528                 case ANYOF_PRINT:
11529                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11530                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11531                     break;
11532                 case ANYOF_NPRINT:
11533                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11534                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11535                     break;
11536                 case ANYOF_PUNCT:
11537                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11538                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11539                     break;
11540                 case ANYOF_NPUNCT:
11541                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11542                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11543                     break;
11544                 case ANYOF_PSXSPC:
11545                     DO_POSIX(ret, namedclass, properties,
11546                                             PL_PosixSpace, PL_XPosixSpace);
11547                     break;
11548                 case ANYOF_NPSXSPC:
11549                     DO_N_POSIX(ret, namedclass, properties,
11550                                             PL_PosixSpace, PL_XPosixSpace);
11551                     break;
11552                 case ANYOF_SPACE:
11553                     DO_POSIX(ret, namedclass, properties,
11554                                             PL_PerlSpace, PL_XPerlSpace);
11555                     break;
11556                 case ANYOF_NSPACE:
11557                     DO_N_POSIX(ret, namedclass, properties,
11558                                             PL_PerlSpace, PL_XPerlSpace);
11559                     break;
11560                 case ANYOF_UPPER:   /* Same as LOWER, above */
11561                 case ANYOF_NUPPER:
11562                 {
11563                     SV* ascii_source;
11564                     SV* l1_source;
11565                     const char *Xname;
11566
11567                     if (FOLD && ! LOC) {
11568                         ascii_source = PL_PosixAlpha;
11569                         l1_source = PL_L1Cased;
11570                         Xname = "Cased";
11571                     }
11572                     else {
11573                         ascii_source = PL_PosixUpper;
11574                         l1_source = PL_L1PosixUpper;
11575                         Xname = "XPosixUpper";
11576                     }
11577                     if (namedclass == ANYOF_UPPER) {
11578                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11579                                     ascii_source, l1_source, Xname, listsv);
11580                     }
11581                     else {
11582                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11583                         properties, ascii_source, l1_source, Xname, listsv);
11584                     }
11585                     break;
11586                 }
11587                 case ANYOF_ALNUM:   /* Really is 'Word' */
11588                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11589                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11590                     break;
11591                 case ANYOF_NALNUM:
11592                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11593                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11594                     break;
11595                 case ANYOF_VERTWS:
11596                     /* For these, we use the nonbitmap, as /d doesn't make a
11597                      * difference in what these match.  There would be problems
11598                      * if these characters had folds other than themselves, as
11599                      * nonbitmap is subject to folding */
11600                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11601                     break;
11602                 case ANYOF_NVERTWS:
11603                     _invlist_union_complement_2nd(nonbitmap,
11604                                                     PL_VertSpace, &nonbitmap);
11605                     break;
11606                 case ANYOF_XDIGIT:
11607                     DO_POSIX(ret, namedclass, properties,
11608                                             PL_PosixXDigit, PL_XPosixXDigit);
11609                     break;
11610                 case ANYOF_NXDIGIT:
11611                     DO_N_POSIX(ret, namedclass, properties,
11612                                             PL_PosixXDigit, PL_XPosixXDigit);
11613                     break;
11614                 case ANYOF_MAX:
11615                     /* this is to handle \p and \P */
11616                     break;
11617                 default:
11618                     vFAIL("Invalid [::] class");
11619                     break;
11620                 }
11621
11622                 continue;
11623             }
11624         } /* end of namedclass \blah */
11625
11626         if (range) {
11627             if (prevvalue > (IV)value) /* b-a */ {
11628                 const int w = RExC_parse - rangebegin;
11629                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11630                 range = 0; /* not a valid range */
11631             }
11632         }
11633         else {
11634             prevvalue = value; /* save the beginning of the range */
11635             if (RExC_parse+1 < RExC_end
11636                 && *RExC_parse == '-'
11637                 && RExC_parse[1] != ']')
11638             {
11639                 RExC_parse++;
11640
11641                 /* a bad range like \w-, [:word:]- ? */
11642                 if (namedclass > OOB_NAMEDCLASS) {
11643                     if (ckWARN(WARN_REGEXP)) {
11644                         const int w =
11645                             RExC_parse >= rangebegin ?
11646                             RExC_parse - rangebegin : 0;
11647                         vWARN4(RExC_parse,
11648                                "False [] range \"%*.*s\"",
11649                                w, w, rangebegin);
11650                     }
11651                     if (!SIZE_ONLY)
11652                         stored +=
11653                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11654                 } else
11655                     range = 1;  /* yeah, it's a range! */
11656                 continue;       /* but do it the next time */
11657             }
11658         }
11659
11660         /* non-Latin1 code point implies unicode semantics.  Must be set in
11661          * pass1 so is there for the whole of pass 2 */
11662         if (value > 255) {
11663             RExC_uni_semantics = 1;
11664         }
11665
11666         /* now is the next time */
11667         if (!SIZE_ONLY) {
11668             if (prevvalue < 256) {
11669                 const IV ceilvalue = value < 256 ? value : 255;
11670                 IV i;
11671 #ifdef EBCDIC
11672                 /* In EBCDIC [\x89-\x91] should include
11673                  * the \x8e but [i-j] should not. */
11674                 if (literal_endpoint == 2 &&
11675                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11676                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11677                 {
11678                     if (isLOWER(prevvalue)) {
11679                         for (i = prevvalue; i <= ceilvalue; i++)
11680                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11681                                 stored +=
11682                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11683                             }
11684                     } else {
11685                         for (i = prevvalue; i <= ceilvalue; i++)
11686                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11687                                 stored +=
11688                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11689                             }
11690                     }
11691                 }
11692                 else
11693 #endif
11694                       for (i = prevvalue; i <= ceilvalue; i++) {
11695                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11696                       }
11697           }
11698           if (value > 255) {
11699             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11700             const UV natvalue      = NATIVE_TO_UNI(value);
11701             nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11702         }
11703 #ifdef EBCDIC
11704             literal_endpoint = 0;
11705 #endif
11706         }
11707
11708         range = 0; /* this range (if it was one) is done now */
11709     }
11710
11711
11712
11713     if (SIZE_ONLY)
11714         return ret;
11715     /****** !SIZE_ONLY AFTER HERE *********/
11716
11717     /* If folding and there are code points above 255, we calculate all
11718      * characters that could fold to or from the ones already on the list */
11719     if (FOLD && nonbitmap) {
11720         UV start, end;  /* End points of code point ranges */
11721
11722         SV* fold_intersection = NULL;
11723
11724         /* This is a list of all the characters that participate in folds
11725             * (except marks, etc in multi-char folds */
11726         if (! PL_utf8_foldable) {
11727             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11728             PL_utf8_foldable = _swash_to_invlist(swash);
11729             SvREFCNT_dec(swash);
11730         }
11731
11732         /* This is a hash that for a particular fold gives all characters
11733             * that are involved in it */
11734         if (! PL_utf8_foldclosures) {
11735
11736             /* If we were unable to find any folds, then we likely won't be
11737              * able to find the closures.  So just create an empty list.
11738              * Folding will effectively be restricted to the non-Unicode rules
11739              * hard-coded into Perl.  (This case happens legitimately during
11740              * compilation of Perl itself before the Unicode tables are
11741              * generated) */
11742             if (invlist_len(PL_utf8_foldable) == 0) {
11743                 PL_utf8_foldclosures = newHV();
11744             } else {
11745                 /* If the folds haven't been read in, call a fold function
11746                     * to force that */
11747                 if (! PL_utf8_tofold) {
11748                     U8 dummy[UTF8_MAXBYTES+1];
11749                     STRLEN dummy_len;
11750
11751                     /* This particular string is above \xff in both UTF-8 and
11752                      * UTFEBCDIC */
11753                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11754                     assert(PL_utf8_tofold); /* Verify that worked */
11755                 }
11756                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11757             }
11758         }
11759
11760         /* Only the characters in this class that participate in folds need be
11761          * checked.  Get the intersection of this class and all the possible
11762          * characters that are foldable.  This can quickly narrow down a large
11763          * class */
11764         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11765
11766         /* Now look at the foldable characters in this class individually */
11767         invlist_iterinit(fold_intersection);
11768         while (invlist_iternext(fold_intersection, &start, &end)) {
11769             UV j;
11770
11771             /* Look at every character in the range */
11772             for (j = start; j <= end; j++) {
11773
11774                 /* Get its fold */
11775                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11776                 STRLEN foldlen;
11777                 const UV f =
11778                     _to_uni_fold_flags(j, foldbuf, &foldlen,
11779                                        (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11780
11781                 if (foldlen > (STRLEN)UNISKIP(f)) {
11782
11783                     /* Any multicharacter foldings (disallowed in lookbehind
11784                      * patterns) require the following transform: [ABCDEF] ->
11785                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11786                      * folds into "rst", all other characters fold to single
11787                      * characters.  We save away these multicharacter foldings,
11788                      * to be later saved as part of the additional "s" data. */
11789                     if (! RExC_in_lookbehind) {
11790                         U8* loc = foldbuf;
11791                         U8* e = foldbuf + foldlen;
11792
11793                         /* If any of the folded characters of this are in the
11794                          * Latin1 range, tell the regex engine that this can
11795                          * match a non-utf8 target string.  The only multi-byte
11796                          * fold whose source is in the Latin1 range (U+00DF)
11797                          * applies only when the target string is utf8, or
11798                          * under unicode rules */
11799                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11800                             while (loc < e) {
11801
11802                                 /* Can't mix ascii with non- under /aa */
11803                                 if (MORE_ASCII_RESTRICTED
11804                                     && (isASCII(*loc) != isASCII(j)))
11805                                 {
11806                                     goto end_multi_fold;
11807                                 }
11808                                 if (UTF8_IS_INVARIANT(*loc)
11809                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11810                                 {
11811                                     /* Can't mix above and below 256 under LOC
11812                                      */
11813                                     if (LOC) {
11814                                         goto end_multi_fold;
11815                                     }
11816                                     ANYOF_FLAGS(ret)
11817                                             |= ANYOF_NONBITMAP_NON_UTF8;
11818                                     break;
11819                                 }
11820                                 loc += UTF8SKIP(loc);
11821                             }
11822                         }
11823
11824                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11825                     end_multi_fold: ;
11826                     }
11827
11828                     /* This is special-cased, as it is the only letter which
11829                      * has both a multi-fold and single-fold in Latin1.  All
11830                      * the other chars that have single and multi-folds are
11831                      * always in utf8, and the utf8 folding algorithm catches
11832                      * them */
11833                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11834                         stored += set_regclass_bit(pRExC_state,
11835                                         ret,
11836                                         LATIN_SMALL_LETTER_SHARP_S,
11837                                         &l1_fold_invlist, &unicode_alternate);
11838                     }
11839                 }
11840                 else {
11841                     /* Single character fold.  Add everything in its fold
11842                      * closure to the list that this node should match */
11843                     SV** listp;
11844
11845                     /* The fold closures data structure is a hash with the keys
11846                      * being every character that is folded to, like 'k', and
11847                      * the values each an array of everything that folds to its
11848                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11849                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11850                                     (char *) foldbuf, foldlen, FALSE)))
11851                     {
11852                         AV* list = (AV*) *listp;
11853                         IV k;
11854                         for (k = 0; k <= av_len(list); k++) {
11855                             SV** c_p = av_fetch(list, k, FALSE);
11856                             UV c;
11857                             if (c_p == NULL) {
11858                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11859                             }
11860                             c = SvUV(*c_p);
11861
11862                             /* /aa doesn't allow folds between ASCII and non-;
11863                              * /l doesn't allow them between above and below
11864                              * 256 */
11865                             if ((MORE_ASCII_RESTRICTED
11866                                  && (isASCII(c) != isASCII(j)))
11867                                     || (LOC && ((c < 256) != (j < 256))))
11868                             {
11869                                 continue;
11870                             }
11871
11872                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11873                                 stored += set_regclass_bit(pRExC_state,
11874                                         ret,
11875                                         (U8) c,
11876                                         &l1_fold_invlist, &unicode_alternate);
11877                             }
11878                                 /* It may be that the code point is already in
11879                                  * this range or already in the bitmap, in
11880                                  * which case we need do nothing */
11881                             else if ((c < start || c > end)
11882                                         && (c > 255
11883                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11884                             {
11885                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11886                             }
11887                         }
11888                     }
11889                 }
11890             }
11891         }
11892         SvREFCNT_dec(fold_intersection);
11893     }
11894
11895     /* Combine the two lists into one. */
11896     if (l1_fold_invlist) {
11897         if (nonbitmap) {
11898             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11899             SvREFCNT_dec(l1_fold_invlist);
11900         }
11901         else {
11902             nonbitmap = l1_fold_invlist;
11903         }
11904     }
11905
11906     /* And combine the result (if any) with any inversion list from properties.
11907      * The lists are kept separate up to now because we don't want to fold the
11908      * properties */
11909     if (properties) {
11910         if (nonbitmap) {
11911             _invlist_union(nonbitmap, properties, &nonbitmap);
11912             SvREFCNT_dec(properties);
11913         }
11914         else {
11915             nonbitmap = properties;
11916         }
11917     }
11918
11919     /* Here, <nonbitmap> contains all the code points we can determine at
11920      * compile time that we haven't put into the bitmap.  Go through it, and
11921      * for things that belong in the bitmap, put them there, and delete from
11922      * <nonbitmap> */
11923     if (nonbitmap) {
11924
11925         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11926          * possibly only should match when the target string is UTF-8 */
11927         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11928
11929         /* This gets set if we actually need to modify things */
11930         bool change_invlist = FALSE;
11931
11932         UV start, end;
11933
11934         /* Start looking through <nonbitmap> */
11935         invlist_iterinit(nonbitmap);
11936         while (invlist_iternext(nonbitmap, &start, &end)) {
11937             UV high;
11938             int i;
11939
11940             /* Quit if are above what we should change */
11941             if (start > max_cp_to_set) {
11942                 break;
11943             }
11944
11945             change_invlist = TRUE;
11946
11947             /* Set all the bits in the range, up to the max that we are doing */
11948             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11949             for (i = start; i <= (int) high; i++) {
11950                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11951                     ANYOF_BITMAP_SET(ret, i);
11952                     stored++;
11953                     prevvalue = value;
11954                     value = i;
11955                 }
11956             }
11957         }
11958
11959         /* Done with loop; remove any code points that are in the bitmap from
11960          * <nonbitmap> */
11961         if (change_invlist) {
11962             _invlist_subtract(nonbitmap,
11963                               (DEPENDS_SEMANTICS)
11964                                 ? PL_ASCII
11965                                 : PL_Latin1,
11966                               &nonbitmap);
11967         }
11968
11969         /* If have completely emptied it, remove it completely */
11970         if (invlist_len(nonbitmap) == 0) {
11971             SvREFCNT_dec(nonbitmap);
11972             nonbitmap = NULL;
11973         }
11974     }
11975
11976     /* Here, we have calculated what code points should be in the character
11977      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11978      * case of DEPENDS rules.
11979      *
11980      * Now we can see about various optimizations.  Fold calculation (which we
11981      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11982      * would invert to include K, which under /i would match k, which it
11983      * shouldn't. */
11984
11985     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11986      * set the FOLD flag yet, so this does optimize those.  It doesn't
11987      * optimize locale.  Doing so perhaps could be done as long as there is
11988      * nothing like \w in it; some thought also would have to be given to the
11989      * interaction with above 0x100 chars */
11990     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11991         && ! LOC
11992         && ! unicode_alternate
11993         /* In case of /d, there are some things that should match only when in
11994          * not in the bitmap, i.e., they require UTF8 to match.  These are
11995          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11996          * case, they don't require UTF8, so can invert here */
11997         && (! nonbitmap
11998             || ! DEPENDS_SEMANTICS
11999             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12000         && SvCUR(listsv) == initial_listsv_len)
12001     {
12002         int i;
12003         if (! nonbitmap) {
12004             for (i = 0; i < 256; ++i) {
12005                 if (ANYOF_BITMAP_TEST(ret, i)) {
12006                     ANYOF_BITMAP_CLEAR(ret, i);
12007                 }
12008                 else {
12009                     ANYOF_BITMAP_SET(ret, i);
12010                     prevvalue = value;
12011                     value = i;
12012                 }
12013             }
12014             /* The inversion means that everything above 255 is matched */
12015             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12016         }
12017         else {
12018             /* Here, also has things outside the bitmap that may overlap with
12019              * the bitmap.  We have to sync them up, so that they get inverted
12020              * in both places.  Earlier, we removed all overlaps except in the
12021              * case of /d rules, so no syncing is needed except for this case
12022              */
12023             SV *remove_list = NULL;
12024
12025             if (DEPENDS_SEMANTICS) {
12026                 UV start, end;
12027
12028                 /* Set the bits that correspond to the ones that aren't in the
12029                  * bitmap.  Otherwise, when we invert, we'll miss these.
12030                  * Earlier, we removed from the nonbitmap all code points
12031                  * < 128, so there is no extra work here */
12032                 invlist_iterinit(nonbitmap);
12033                 while (invlist_iternext(nonbitmap, &start, &end)) {
12034                     if (start > 255) {  /* The bit map goes to 255 */
12035                         break;
12036                     }
12037                     if (end > 255) {
12038                         end = 255;
12039                     }
12040                     for (i = start; i <= (int) end; ++i) {
12041                         ANYOF_BITMAP_SET(ret, i);
12042                         prevvalue = value;
12043                         value = i;
12044                     }
12045                 }
12046             }
12047
12048             /* Now invert both the bitmap and the nonbitmap.  Anything in the
12049              * bitmap has to also be removed from the non-bitmap, but again,
12050              * there should not be overlap unless is /d rules. */
12051             _invlist_invert(nonbitmap);
12052
12053             /* Any swash can't be used as-is, because we've inverted things */
12054             if (swash) {
12055                 SvREFCNT_dec(swash);
12056                 swash = NULL;
12057             }
12058
12059             for (i = 0; i < 256; ++i) {
12060                 if (ANYOF_BITMAP_TEST(ret, i)) {
12061                     ANYOF_BITMAP_CLEAR(ret, i);
12062                     if (DEPENDS_SEMANTICS) {
12063                         if (! remove_list) {
12064                             remove_list = _new_invlist(2);
12065                         }
12066                         remove_list = add_cp_to_invlist(remove_list, i);
12067                     }
12068                 }
12069                 else {
12070                     ANYOF_BITMAP_SET(ret, i);
12071                     prevvalue = value;
12072                     value = i;
12073                 }
12074             }
12075
12076             /* And do the removal */
12077             if (DEPENDS_SEMANTICS) {
12078                 if (remove_list) {
12079                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12080                     SvREFCNT_dec(remove_list);
12081                 }
12082             }
12083             else {
12084                 /* There is no overlap for non-/d, so just delete anything
12085                  * below 256 */
12086                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12087             }
12088         }
12089
12090         stored = 256 - stored;
12091
12092         /* Clear the invert flag since have just done it here */
12093         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12094     }
12095
12096     /* Folding in the bitmap is taken care of above, but not for locale (for
12097      * which we have to wait to see what folding is in effect at runtime), and
12098      * for some things not in the bitmap (only the upper latin folds in this
12099      * case, as all other single-char folding has been set above).  Set
12100      * run-time fold flag for these */
12101     if (FOLD && (LOC
12102                 || (DEPENDS_SEMANTICS
12103                     && nonbitmap
12104                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12105                 || unicode_alternate))
12106     {
12107         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12108     }
12109
12110     /* A single character class can be "optimized" into an EXACTish node.
12111      * Note that since we don't currently count how many characters there are
12112      * outside the bitmap, we are XXX missing optimization possibilities for
12113      * them.  This optimization can't happen unless this is a truly single
12114      * character class, which means that it can't be an inversion into a
12115      * many-character class, and there must be no possibility of there being
12116      * things outside the bitmap.  'stored' (only) for locales doesn't include
12117      * \w, etc, so have to make a special test that they aren't present
12118      *
12119      * Similarly A 2-character class of the very special form like [bB] can be
12120      * optimized into an EXACTFish node, but only for non-locales, and for
12121      * characters which only have the two folds; so things like 'fF' and 'Ii'
12122      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12123      * FI'. */
12124     if (! nonbitmap
12125         && ! unicode_alternate
12126         && SvCUR(listsv) == initial_listsv_len
12127         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12128         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12129                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12130             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12131                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12132                                  /* If the latest code point has a fold whose
12133                                   * bit is set, it must be the only other one */
12134                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12135                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12136     {
12137         /* Note that the information needed to decide to do this optimization
12138          * is not currently available until the 2nd pass, and that the actually
12139          * used EXACTish node takes less space than the calculated ANYOF node,
12140          * and hence the amount of space calculated in the first pass is larger
12141          * than actually used, so this optimization doesn't gain us any space.
12142          * But an EXACT node is faster than an ANYOF node, and can be combined
12143          * with any adjacent EXACT nodes later by the optimizer for further
12144          * gains.  The speed of executing an EXACTF is similar to an ANYOF
12145          * node, so the optimization advantage comes from the ability to join
12146          * it to adjacent EXACT nodes */
12147
12148         const char * cur_parse= RExC_parse;
12149         U8 op;
12150         RExC_emit = (regnode *)orig_emit;
12151         RExC_parse = (char *)orig_parse;
12152
12153         if (stored == 1) {
12154
12155             /* A locale node with one point can be folded; all the other cases
12156              * with folding will have two points, since we calculate them above
12157              */
12158             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12159                  op = EXACTFL;
12160             }
12161             else {
12162                 op = EXACT;
12163             }
12164         }
12165         else {   /* else 2 chars in the bit map: the folds of each other */
12166
12167             /* Use the folded value, which for the cases where we get here,
12168              * is just the lower case of the current one (which may resolve to
12169              * itself, or to the other one */
12170             value = toLOWER_LATIN1(value);
12171
12172             /* To join adjacent nodes, they must be the exact EXACTish type.
12173              * Try to use the most likely type, by using EXACTFA if possible,
12174              * then EXACTFU if the regex calls for it, or is required because
12175              * the character is non-ASCII.  (If <value> is ASCII, its fold is
12176              * also ASCII for the cases where we get here.) */
12177             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12178                 op = EXACTFA;
12179             }
12180             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12181                 op = EXACTFU;
12182             }
12183             else {    /* Otherwise, more likely to be EXACTF type */
12184                 op = EXACTF;
12185             }
12186         }
12187
12188         ret = reg_node(pRExC_state, op);
12189         RExC_parse = (char *)cur_parse;
12190         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12191             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12192             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12193             STR_LEN(ret)= 2;
12194             RExC_emit += STR_SZ(2);
12195         }
12196         else {
12197             *STRING(ret)= (char)value;
12198             STR_LEN(ret)= 1;
12199             RExC_emit += STR_SZ(1);
12200         }
12201         SvREFCNT_dec(listsv);
12202         return ret;
12203     }
12204
12205     /* If there is a swash and more than one element, we can't use the swash in
12206      * the optimization below. */
12207     if (swash && element_count > 1) {
12208         SvREFCNT_dec(swash);
12209         swash = NULL;
12210     }
12211     if (! nonbitmap
12212         && SvCUR(listsv) == initial_listsv_len
12213         && ! unicode_alternate)
12214     {
12215         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12216         SvREFCNT_dec(listsv);
12217         SvREFCNT_dec(unicode_alternate);
12218     }
12219     else {
12220         /* av[0] stores the character class description in its textual form:
12221          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12222          *       appropriate swash, and is also useful for dumping the regnode.
12223          * av[1] if NULL, is a placeholder to later contain the swash computed
12224          *       from av[0].  But if no further computation need be done, the
12225          *       swash is stored there now.
12226          * av[2] stores the multicharacter foldings, used later in
12227          *       regexec.c:S_reginclass().
12228          * av[3] stores the nonbitmap inversion list for use in addition or
12229          *       instead of av[0]; not used if av[1] isn't NULL
12230          * av[4] is set if any component of the class is from a user-defined
12231          *       property; not used if av[1] isn't NULL */
12232         AV * const av = newAV();
12233         SV *rv;
12234
12235         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12236                         ? &PL_sv_undef
12237                         : listsv);
12238         if (swash) {
12239             av_store(av, 1, swash);
12240             SvREFCNT_dec(nonbitmap);
12241         }
12242         else {
12243             av_store(av, 1, NULL);
12244             if (nonbitmap) {
12245                 av_store(av, 3, nonbitmap);
12246                 av_store(av, 4, newSVuv(has_user_defined_property));
12247             }
12248         }
12249
12250         /* Store any computed multi-char folds only if we are allowing
12251          * them */
12252         if (allow_full_fold) {
12253             av_store(av, 2, MUTABLE_SV(unicode_alternate));
12254             if (unicode_alternate) { /* This node is variable length */
12255                 OP(ret) = ANYOFV;
12256             }
12257         }
12258         else {
12259             av_store(av, 2, NULL);
12260         }
12261         rv = newRV_noinc(MUTABLE_SV(av));
12262         n = add_data(pRExC_state, 1, "s");
12263         RExC_rxi->data->data[n] = (void*)rv;
12264         ARG_SET(ret, n);
12265     }
12266     return ret;
12267 }
12268
12269
12270 /* reg_skipcomment()
12271
12272    Absorbs an /x style # comments from the input stream.
12273    Returns true if there is more text remaining in the stream.
12274    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12275    terminates the pattern without including a newline.
12276
12277    Note its the callers responsibility to ensure that we are
12278    actually in /x mode
12279
12280 */
12281
12282 STATIC bool
12283 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12284 {
12285     bool ended = 0;
12286
12287     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12288
12289     while (RExC_parse < RExC_end)
12290         if (*RExC_parse++ == '\n') {
12291             ended = 1;
12292             break;
12293         }
12294     if (!ended) {
12295         /* we ran off the end of the pattern without ending
12296            the comment, so we have to add an \n when wrapping */
12297         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12298         return 0;
12299     } else
12300         return 1;
12301 }
12302
12303 /* nextchar()
12304
12305    Advances the parse position, and optionally absorbs
12306    "whitespace" from the inputstream.
12307
12308    Without /x "whitespace" means (?#...) style comments only,
12309    with /x this means (?#...) and # comments and whitespace proper.
12310
12311    Returns the RExC_parse point from BEFORE the scan occurs.
12312
12313    This is the /x friendly way of saying RExC_parse++.
12314 */
12315
12316 STATIC char*
12317 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12318 {
12319     char* const retval = RExC_parse++;
12320
12321     PERL_ARGS_ASSERT_NEXTCHAR;
12322
12323     for (;;) {
12324         if (RExC_end - RExC_parse >= 3
12325             && *RExC_parse == '('
12326             && RExC_parse[1] == '?'
12327             && RExC_parse[2] == '#')
12328         {
12329             while (*RExC_parse != ')') {
12330                 if (RExC_parse == RExC_end)
12331                     FAIL("Sequence (?#... not terminated");
12332                 RExC_parse++;
12333             }
12334             RExC_parse++;
12335             continue;
12336         }
12337         if (RExC_flags & RXf_PMf_EXTENDED) {
12338             if (isSPACE(*RExC_parse)) {
12339                 RExC_parse++;
12340                 continue;
12341             }
12342             else if (*RExC_parse == '#') {
12343                 if ( reg_skipcomment( pRExC_state ) )
12344                     continue;
12345             }
12346         }
12347         return retval;
12348     }
12349 }
12350
12351 /*
12352 - reg_node - emit a node
12353 */
12354 STATIC regnode *                        /* Location. */
12355 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12356 {
12357     dVAR;
12358     register regnode *ptr;
12359     regnode * const ret = RExC_emit;
12360     GET_RE_DEBUG_FLAGS_DECL;
12361
12362     PERL_ARGS_ASSERT_REG_NODE;
12363
12364     if (SIZE_ONLY) {
12365         SIZE_ALIGN(RExC_size);
12366         RExC_size += 1;
12367         return(ret);
12368     }
12369     if (RExC_emit >= RExC_emit_bound)
12370         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12371                    op, RExC_emit, RExC_emit_bound);
12372
12373     NODE_ALIGN_FILL(ret);
12374     ptr = ret;
12375     FILL_ADVANCE_NODE(ptr, op);
12376 #ifdef RE_TRACK_PATTERN_OFFSETS
12377     if (RExC_offsets) {         /* MJD */
12378         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12379               "reg_node", __LINE__,
12380               PL_reg_name[op],
12381               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12382                 ? "Overwriting end of array!\n" : "OK",
12383               (UV)(RExC_emit - RExC_emit_start),
12384               (UV)(RExC_parse - RExC_start),
12385               (UV)RExC_offsets[0]));
12386         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12387     }
12388 #endif
12389     RExC_emit = ptr;
12390     return(ret);
12391 }
12392
12393 /*
12394 - reganode - emit a node with an argument
12395 */
12396 STATIC regnode *                        /* Location. */
12397 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12398 {
12399     dVAR;
12400     register regnode *ptr;
12401     regnode * const ret = RExC_emit;
12402     GET_RE_DEBUG_FLAGS_DECL;
12403
12404     PERL_ARGS_ASSERT_REGANODE;
12405
12406     if (SIZE_ONLY) {
12407         SIZE_ALIGN(RExC_size);
12408         RExC_size += 2;
12409         /*
12410            We can't do this:
12411
12412            assert(2==regarglen[op]+1);
12413
12414            Anything larger than this has to allocate the extra amount.
12415            If we changed this to be:
12416
12417            RExC_size += (1 + regarglen[op]);
12418
12419            then it wouldn't matter. Its not clear what side effect
12420            might come from that so its not done so far.
12421            -- dmq
12422         */
12423         return(ret);
12424     }
12425     if (RExC_emit >= RExC_emit_bound)
12426         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12427                    op, RExC_emit, RExC_emit_bound);
12428
12429     NODE_ALIGN_FILL(ret);
12430     ptr = ret;
12431     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12432 #ifdef RE_TRACK_PATTERN_OFFSETS
12433     if (RExC_offsets) {         /* MJD */
12434         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12435               "reganode",
12436               __LINE__,
12437               PL_reg_name[op],
12438               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12439               "Overwriting end of array!\n" : "OK",
12440               (UV)(RExC_emit - RExC_emit_start),
12441               (UV)(RExC_parse - RExC_start),
12442               (UV)RExC_offsets[0]));
12443         Set_Cur_Node_Offset;
12444     }
12445 #endif
12446     RExC_emit = ptr;
12447     return(ret);
12448 }
12449
12450 /*
12451 - reguni - emit (if appropriate) a Unicode character
12452 */
12453 STATIC STRLEN
12454 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12455 {
12456     dVAR;
12457
12458     PERL_ARGS_ASSERT_REGUNI;
12459
12460     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12461 }
12462
12463 /*
12464 - reginsert - insert an operator in front of already-emitted operand
12465 *
12466 * Means relocating the operand.
12467 */
12468 STATIC void
12469 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12470 {
12471     dVAR;
12472     register regnode *src;
12473     register regnode *dst;
12474     register regnode *place;
12475     const int offset = regarglen[(U8)op];
12476     const int size = NODE_STEP_REGNODE + offset;
12477     GET_RE_DEBUG_FLAGS_DECL;
12478
12479     PERL_ARGS_ASSERT_REGINSERT;
12480     PERL_UNUSED_ARG(depth);
12481 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12482     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12483     if (SIZE_ONLY) {
12484         RExC_size += size;
12485         return;
12486     }
12487
12488     src = RExC_emit;
12489     RExC_emit += size;
12490     dst = RExC_emit;
12491     if (RExC_open_parens) {
12492         int paren;
12493         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12494         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12495             if ( RExC_open_parens[paren] >= opnd ) {
12496                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12497                 RExC_open_parens[paren] += size;
12498             } else {
12499                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12500             }
12501             if ( RExC_close_parens[paren] >= opnd ) {
12502                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12503                 RExC_close_parens[paren] += size;
12504             } else {
12505                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12506             }
12507         }
12508     }
12509
12510     while (src > opnd) {
12511         StructCopy(--src, --dst, regnode);
12512 #ifdef RE_TRACK_PATTERN_OFFSETS
12513         if (RExC_offsets) {     /* MJD 20010112 */
12514             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12515                   "reg_insert",
12516                   __LINE__,
12517                   PL_reg_name[op],
12518                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12519                     ? "Overwriting end of array!\n" : "OK",
12520                   (UV)(src - RExC_emit_start),
12521                   (UV)(dst - RExC_emit_start),
12522                   (UV)RExC_offsets[0]));
12523             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12524             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12525         }
12526 #endif
12527     }
12528
12529
12530     place = opnd;               /* Op node, where operand used to be. */
12531 #ifdef RE_TRACK_PATTERN_OFFSETS
12532     if (RExC_offsets) {         /* MJD */
12533         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12534               "reginsert",
12535               __LINE__,
12536               PL_reg_name[op],
12537               (UV)(place - RExC_emit_start) > RExC_offsets[0]
12538               ? "Overwriting end of array!\n" : "OK",
12539               (UV)(place - RExC_emit_start),
12540               (UV)(RExC_parse - RExC_start),
12541               (UV)RExC_offsets[0]));
12542         Set_Node_Offset(place, RExC_parse);
12543         Set_Node_Length(place, 1);
12544     }
12545 #endif
12546     src = NEXTOPER(place);
12547     FILL_ADVANCE_NODE(place, op);
12548     Zero(src, offset, regnode);
12549 }
12550
12551 /*
12552 - regtail - set the next-pointer at the end of a node chain of p to val.
12553 - SEE ALSO: regtail_study
12554 */
12555 /* TODO: All three parms should be const */
12556 STATIC void
12557 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12558 {
12559     dVAR;
12560     register regnode *scan;
12561     GET_RE_DEBUG_FLAGS_DECL;
12562
12563     PERL_ARGS_ASSERT_REGTAIL;
12564 #ifndef DEBUGGING
12565     PERL_UNUSED_ARG(depth);
12566 #endif
12567
12568     if (SIZE_ONLY)
12569         return;
12570
12571     /* Find last node. */
12572     scan = p;
12573     for (;;) {
12574         regnode * const temp = regnext(scan);
12575         DEBUG_PARSE_r({
12576             SV * const mysv=sv_newmortal();
12577             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12578             regprop(RExC_rx, mysv, scan);
12579             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12580                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12581                     (temp == NULL ? "->" : ""),
12582                     (temp == NULL ? PL_reg_name[OP(val)] : "")
12583             );
12584         });
12585         if (temp == NULL)
12586             break;
12587         scan = temp;
12588     }
12589
12590     if (reg_off_by_arg[OP(scan)]) {
12591         ARG_SET(scan, val - scan);
12592     }
12593     else {
12594         NEXT_OFF(scan) = val - scan;
12595     }
12596 }
12597
12598 #ifdef DEBUGGING
12599 /*
12600 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12601 - Look for optimizable sequences at the same time.
12602 - currently only looks for EXACT chains.
12603
12604 This is experimental code. The idea is to use this routine to perform
12605 in place optimizations on branches and groups as they are constructed,
12606 with the long term intention of removing optimization from study_chunk so
12607 that it is purely analytical.
12608
12609 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12610 to control which is which.
12611
12612 */
12613 /* TODO: All four parms should be const */
12614
12615 STATIC U8
12616 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12617 {
12618     dVAR;
12619     register regnode *scan;
12620     U8 exact = PSEUDO;
12621 #ifdef EXPERIMENTAL_INPLACESCAN
12622     I32 min = 0;
12623 #endif
12624     GET_RE_DEBUG_FLAGS_DECL;
12625
12626     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12627
12628
12629     if (SIZE_ONLY)
12630         return exact;
12631
12632     /* Find last node. */
12633
12634     scan = p;
12635     for (;;) {
12636         regnode * const temp = regnext(scan);
12637 #ifdef EXPERIMENTAL_INPLACESCAN
12638         if (PL_regkind[OP(scan)] == EXACT) {
12639             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12640             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12641                 return EXACT;
12642         }
12643 #endif
12644         if ( exact ) {
12645             switch (OP(scan)) {
12646                 case EXACT:
12647                 case EXACTF:
12648                 case EXACTFA:
12649                 case EXACTFU:
12650                 case EXACTFU_SS:
12651                 case EXACTFU_TRICKYFOLD:
12652                 case EXACTFL:
12653                         if( exact == PSEUDO )
12654                             exact= OP(scan);
12655                         else if ( exact != OP(scan) )
12656                             exact= 0;
12657                 case NOTHING:
12658                     break;
12659                 default:
12660                     exact= 0;
12661             }
12662         }
12663         DEBUG_PARSE_r({
12664             SV * const mysv=sv_newmortal();
12665             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12666             regprop(RExC_rx, mysv, scan);
12667             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12668                 SvPV_nolen_const(mysv),
12669                 REG_NODE_NUM(scan),
12670                 PL_reg_name[exact]);
12671         });
12672         if (temp == NULL)
12673             break;
12674         scan = temp;
12675     }
12676     DEBUG_PARSE_r({
12677         SV * const mysv_val=sv_newmortal();
12678         DEBUG_PARSE_MSG("");
12679         regprop(RExC_rx, mysv_val, val);
12680         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12681                       SvPV_nolen_const(mysv_val),
12682                       (IV)REG_NODE_NUM(val),
12683                       (IV)(val - scan)
12684         );
12685     });
12686     if (reg_off_by_arg[OP(scan)]) {
12687         ARG_SET(scan, val - scan);
12688     }
12689     else {
12690         NEXT_OFF(scan) = val - scan;
12691     }
12692
12693     return exact;
12694 }
12695 #endif
12696
12697 /*
12698  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12699  */
12700 #ifdef DEBUGGING
12701 static void
12702 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12703 {
12704     int bit;
12705     int set=0;
12706     regex_charset cs;
12707
12708     for (bit=0; bit<32; bit++) {
12709         if (flags & (1<<bit)) {
12710             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12711                 continue;
12712             }
12713             if (!set++ && lead)
12714                 PerlIO_printf(Perl_debug_log, "%s",lead);
12715             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12716         }
12717     }
12718     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12719             if (!set++ && lead) {
12720                 PerlIO_printf(Perl_debug_log, "%s",lead);
12721             }
12722             switch (cs) {
12723                 case REGEX_UNICODE_CHARSET:
12724                     PerlIO_printf(Perl_debug_log, "UNICODE");
12725                     break;
12726                 case REGEX_LOCALE_CHARSET:
12727                     PerlIO_printf(Perl_debug_log, "LOCALE");
12728                     break;
12729                 case REGEX_ASCII_RESTRICTED_CHARSET:
12730                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12731                     break;
12732                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12733                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12734                     break;
12735                 default:
12736                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12737                     break;
12738             }
12739     }
12740     if (lead)  {
12741         if (set)
12742             PerlIO_printf(Perl_debug_log, "\n");
12743         else
12744             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12745     }
12746 }
12747 #endif
12748
12749 void
12750 Perl_regdump(pTHX_ const regexp *r)
12751 {
12752 #ifdef DEBUGGING
12753     dVAR;
12754     SV * const sv = sv_newmortal();
12755     SV *dsv= sv_newmortal();
12756     RXi_GET_DECL(r,ri);
12757     GET_RE_DEBUG_FLAGS_DECL;
12758
12759     PERL_ARGS_ASSERT_REGDUMP;
12760
12761     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12762
12763     /* Header fields of interest. */
12764     if (r->anchored_substr) {
12765         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12766             RE_SV_DUMPLEN(r->anchored_substr), 30);
12767         PerlIO_printf(Perl_debug_log,
12768                       "anchored %s%s at %"IVdf" ",
12769                       s, RE_SV_TAIL(r->anchored_substr),
12770                       (IV)r->anchored_offset);
12771     } else if (r->anchored_utf8) {
12772         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12773             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12774         PerlIO_printf(Perl_debug_log,
12775                       "anchored utf8 %s%s at %"IVdf" ",
12776                       s, RE_SV_TAIL(r->anchored_utf8),
12777                       (IV)r->anchored_offset);
12778     }
12779     if (r->float_substr) {
12780         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12781             RE_SV_DUMPLEN(r->float_substr), 30);
12782         PerlIO_printf(Perl_debug_log,
12783                       "floating %s%s at %"IVdf"..%"UVuf" ",
12784                       s, RE_SV_TAIL(r->float_substr),
12785                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12786     } else if (r->float_utf8) {
12787         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12788             RE_SV_DUMPLEN(r->float_utf8), 30);
12789         PerlIO_printf(Perl_debug_log,
12790                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12791                       s, RE_SV_TAIL(r->float_utf8),
12792                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12793     }
12794     if (r->check_substr || r->check_utf8)
12795         PerlIO_printf(Perl_debug_log,
12796                       (const char *)
12797                       (r->check_substr == r->float_substr
12798                        && r->check_utf8 == r->float_utf8
12799                        ? "(checking floating" : "(checking anchored"));
12800     if (r->extflags & RXf_NOSCAN)
12801         PerlIO_printf(Perl_debug_log, " noscan");
12802     if (r->extflags & RXf_CHECK_ALL)
12803         PerlIO_printf(Perl_debug_log, " isall");
12804     if (r->check_substr || r->check_utf8)
12805         PerlIO_printf(Perl_debug_log, ") ");
12806
12807     if (ri->regstclass) {
12808         regprop(r, sv, ri->regstclass);
12809         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12810     }
12811     if (r->extflags & RXf_ANCH) {
12812         PerlIO_printf(Perl_debug_log, "anchored");
12813         if (r->extflags & RXf_ANCH_BOL)
12814             PerlIO_printf(Perl_debug_log, "(BOL)");
12815         if (r->extflags & RXf_ANCH_MBOL)
12816             PerlIO_printf(Perl_debug_log, "(MBOL)");
12817         if (r->extflags & RXf_ANCH_SBOL)
12818             PerlIO_printf(Perl_debug_log, "(SBOL)");
12819         if (r->extflags & RXf_ANCH_GPOS)
12820             PerlIO_printf(Perl_debug_log, "(GPOS)");
12821         PerlIO_putc(Perl_debug_log, ' ');
12822     }
12823     if (r->extflags & RXf_GPOS_SEEN)
12824         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12825     if (r->intflags & PREGf_SKIP)
12826         PerlIO_printf(Perl_debug_log, "plus ");
12827     if (r->intflags & PREGf_IMPLICIT)
12828         PerlIO_printf(Perl_debug_log, "implicit ");
12829     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12830     if (r->extflags & RXf_EVAL_SEEN)
12831         PerlIO_printf(Perl_debug_log, "with eval ");
12832     PerlIO_printf(Perl_debug_log, "\n");
12833     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12834 #else
12835     PERL_ARGS_ASSERT_REGDUMP;
12836     PERL_UNUSED_CONTEXT;
12837     PERL_UNUSED_ARG(r);
12838 #endif  /* DEBUGGING */
12839 }
12840
12841 /*
12842 - regprop - printable representation of opcode
12843 */
12844 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12845 STMT_START { \
12846         if (do_sep) {                           \
12847             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12848             if (flags & ANYOF_INVERT)           \
12849                 /*make sure the invert info is in each */ \
12850                 sv_catpvs(sv, "^");             \
12851             do_sep = 0;                         \
12852         }                                       \
12853 } STMT_END
12854
12855 void
12856 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12857 {
12858 #ifdef DEBUGGING
12859     dVAR;
12860     register int k;
12861     RXi_GET_DECL(prog,progi);
12862     GET_RE_DEBUG_FLAGS_DECL;
12863
12864     PERL_ARGS_ASSERT_REGPROP;
12865
12866     sv_setpvs(sv, "");
12867
12868     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12869         /* It would be nice to FAIL() here, but this may be called from
12870            regexec.c, and it would be hard to supply pRExC_state. */
12871         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12872     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12873
12874     k = PL_regkind[OP(o)];
12875
12876     if (k == EXACT) {
12877         sv_catpvs(sv, " ");
12878         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12879          * is a crude hack but it may be the best for now since
12880          * we have no flag "this EXACTish node was UTF-8"
12881          * --jhi */
12882         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12883                   PERL_PV_ESCAPE_UNI_DETECT |
12884                   PERL_PV_ESCAPE_NONASCII   |
12885                   PERL_PV_PRETTY_ELLIPSES   |
12886                   PERL_PV_PRETTY_LTGT       |
12887                   PERL_PV_PRETTY_NOCLEAR
12888                   );
12889     } else if (k == TRIE) {
12890         /* print the details of the trie in dumpuntil instead, as
12891          * progi->data isn't available here */
12892         const char op = OP(o);
12893         const U32 n = ARG(o);
12894         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12895                (reg_ac_data *)progi->data->data[n] :
12896                NULL;
12897         const reg_trie_data * const trie
12898             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12899
12900         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12901         DEBUG_TRIE_COMPILE_r(
12902             Perl_sv_catpvf(aTHX_ sv,
12903                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12904                 (UV)trie->startstate,
12905                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12906                 (UV)trie->wordcount,
12907                 (UV)trie->minlen,
12908                 (UV)trie->maxlen,
12909                 (UV)TRIE_CHARCOUNT(trie),
12910                 (UV)trie->uniquecharcount
12911             )
12912         );
12913         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12914             int i;
12915             int rangestart = -1;
12916             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12917             sv_catpvs(sv, "[");
12918             for (i = 0; i <= 256; i++) {
12919                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12920                     if (rangestart == -1)
12921                         rangestart = i;
12922                 } else if (rangestart != -1) {
12923                     if (i <= rangestart + 3)
12924                         for (; rangestart < i; rangestart++)
12925                             put_byte(sv, rangestart);
12926                     else {
12927                         put_byte(sv, rangestart);
12928                         sv_catpvs(sv, "-");
12929                         put_byte(sv, i - 1);
12930                     }
12931                     rangestart = -1;
12932                 }
12933             }
12934             sv_catpvs(sv, "]");
12935         }
12936
12937     } else if (k == CURLY) {
12938         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12939             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12940         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12941     }
12942     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12943         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12944     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12945         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12946         if ( RXp_PAREN_NAMES(prog) ) {
12947             if ( k != REF || (OP(o) < NREF)) {
12948                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12949                 SV **name= av_fetch(list, ARG(o), 0 );
12950                 if (name)
12951                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12952             }
12953             else {
12954                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12955                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12956                 I32 *nums=(I32*)SvPVX(sv_dat);
12957                 SV **name= av_fetch(list, nums[0], 0 );
12958                 I32 n;
12959                 if (name) {
12960                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12961                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12962                                     (n ? "," : ""), (IV)nums[n]);
12963                     }
12964                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12965                 }
12966             }
12967         }
12968     } else if (k == GOSUB)
12969         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12970     else if (k == VERB) {
12971         if (!o->flags)
12972             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12973                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12974     } else if (k == LOGICAL)
12975         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12976     else if (k == ANYOF) {
12977         int i, rangestart = -1;
12978         const U8 flags = ANYOF_FLAGS(o);
12979         int do_sep = 0;
12980
12981         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12982         static const char * const anyofs[] = {
12983             "\\w",
12984             "\\W",
12985             "\\s",
12986             "\\S",
12987             "\\d",
12988             "\\D",
12989             "[:alnum:]",
12990             "[:^alnum:]",
12991             "[:alpha:]",
12992             "[:^alpha:]",
12993             "[:ascii:]",
12994             "[:^ascii:]",
12995             "[:cntrl:]",
12996             "[:^cntrl:]",
12997             "[:graph:]",
12998             "[:^graph:]",
12999             "[:lower:]",
13000             "[:^lower:]",
13001             "[:print:]",
13002             "[:^print:]",
13003             "[:punct:]",
13004             "[:^punct:]",
13005             "[:upper:]",
13006             "[:^upper:]",
13007             "[:xdigit:]",
13008             "[:^xdigit:]",
13009             "[:space:]",
13010             "[:^space:]",
13011             "[:blank:]",
13012             "[:^blank:]"
13013         };
13014
13015         if (flags & ANYOF_LOCALE)
13016             sv_catpvs(sv, "{loc}");
13017         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13018             sv_catpvs(sv, "{i}");
13019         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13020         if (flags & ANYOF_INVERT)
13021             sv_catpvs(sv, "^");
13022
13023         /* output what the standard cp 0-255 bitmap matches */
13024         for (i = 0; i <= 256; i++) {
13025             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13026                 if (rangestart == -1)
13027                     rangestart = i;
13028             } else if (rangestart != -1) {
13029                 if (i <= rangestart + 3)
13030                     for (; rangestart < i; rangestart++)
13031                         put_byte(sv, rangestart);
13032                 else {
13033                     put_byte(sv, rangestart);
13034                     sv_catpvs(sv, "-");
13035                     put_byte(sv, i - 1);
13036                 }
13037                 do_sep = 1;
13038                 rangestart = -1;
13039             }
13040         }
13041
13042         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13043         /* output any special charclass tests (used entirely under use locale) */
13044         if (ANYOF_CLASS_TEST_ANY_SET(o))
13045             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13046                 if (ANYOF_CLASS_TEST(o,i)) {
13047                     sv_catpv(sv, anyofs[i]);
13048                     do_sep = 1;
13049                 }
13050
13051         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13052
13053         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13054             sv_catpvs(sv, "{non-utf8-latin1-all}");
13055         }
13056
13057         /* output information about the unicode matching */
13058         if (flags & ANYOF_UNICODE_ALL)
13059             sv_catpvs(sv, "{unicode_all}");
13060         else if (ANYOF_NONBITMAP(o))
13061             sv_catpvs(sv, "{unicode}");
13062         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13063             sv_catpvs(sv, "{outside bitmap}");
13064
13065         if (ANYOF_NONBITMAP(o)) {
13066             SV *lv; /* Set if there is something outside the bit map */
13067             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13068             bool byte_output = FALSE;   /* If something in the bitmap has been
13069                                            output */
13070
13071             if (lv && lv != &PL_sv_undef) {
13072                 if (sw) {
13073                     U8 s[UTF8_MAXBYTES_CASE+1];
13074
13075                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13076                         uvchr_to_utf8(s, i);
13077
13078                         if (i < 256
13079                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13080                                                                things already
13081                                                                output as part
13082                                                                of the bitmap */
13083                             && swash_fetch(sw, s, TRUE))
13084                         {
13085                             if (rangestart == -1)
13086                                 rangestart = i;
13087                         } else if (rangestart != -1) {
13088                             byte_output = TRUE;
13089                             if (i <= rangestart + 3)
13090                                 for (; rangestart < i; rangestart++) {
13091                                     put_byte(sv, rangestart);
13092                                 }
13093                             else {
13094                                 put_byte(sv, rangestart);
13095                                 sv_catpvs(sv, "-");
13096                                 put_byte(sv, i-1);
13097                             }
13098                             rangestart = -1;
13099                         }
13100                     }
13101                 }
13102
13103                 {
13104                     char *s = savesvpv(lv);
13105                     char * const origs = s;
13106
13107                     while (*s && *s != '\n')
13108                         s++;
13109
13110                     if (*s == '\n') {
13111                         const char * const t = ++s;
13112
13113                         if (byte_output) {
13114                             sv_catpvs(sv, " ");
13115                         }
13116
13117                         while (*s) {
13118                             if (*s == '\n') {
13119
13120                                 /* Truncate very long output */
13121                                 if (s - origs > 256) {
13122                                     Perl_sv_catpvf(aTHX_ sv,
13123                                                    "%.*s...",
13124                                                    (int) (s - origs - 1),
13125                                                    t);
13126                                     goto out_dump;
13127                                 }
13128                                 *s = ' ';
13129                             }
13130                             else if (*s == '\t') {
13131                                 *s = '-';
13132                             }
13133                             s++;
13134                         }
13135                         if (s[-1] == ' ')
13136                             s[-1] = 0;
13137
13138                         sv_catpv(sv, t);
13139                     }
13140
13141                 out_dump:
13142
13143                     Safefree(origs);
13144                 }
13145                 SvREFCNT_dec(lv);
13146             }
13147         }
13148
13149         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13150     }
13151     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13152         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13153 #else
13154     PERL_UNUSED_CONTEXT;
13155     PERL_UNUSED_ARG(sv);
13156     PERL_UNUSED_ARG(o);
13157     PERL_UNUSED_ARG(prog);
13158 #endif  /* DEBUGGING */
13159 }
13160
13161 SV *
13162 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13163 {                               /* Assume that RE_INTUIT is set */
13164     dVAR;
13165     struct regexp *const prog = (struct regexp *)SvANY(r);
13166     GET_RE_DEBUG_FLAGS_DECL;
13167
13168     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13169     PERL_UNUSED_CONTEXT;
13170
13171     DEBUG_COMPILE_r(
13172         {
13173             const char * const s = SvPV_nolen_const(prog->check_substr
13174                       ? prog->check_substr : prog->check_utf8);
13175
13176             if (!PL_colorset) reginitcolors();
13177             PerlIO_printf(Perl_debug_log,
13178                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13179                       PL_colors[4],
13180                       prog->check_substr ? "" : "utf8 ",
13181                       PL_colors[5],PL_colors[0],
13182                       s,
13183                       PL_colors[1],
13184                       (strlen(s) > 60 ? "..." : ""));
13185         } );
13186
13187     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13188 }
13189
13190 /*
13191    pregfree()
13192
13193    handles refcounting and freeing the perl core regexp structure. When
13194    it is necessary to actually free the structure the first thing it
13195    does is call the 'free' method of the regexp_engine associated to
13196    the regexp, allowing the handling of the void *pprivate; member
13197    first. (This routine is not overridable by extensions, which is why
13198    the extensions free is called first.)
13199
13200    See regdupe and regdupe_internal if you change anything here.
13201 */
13202 #ifndef PERL_IN_XSUB_RE
13203 void
13204 Perl_pregfree(pTHX_ REGEXP *r)
13205 {
13206     SvREFCNT_dec(r);
13207 }
13208
13209 void
13210 Perl_pregfree2(pTHX_ REGEXP *rx)
13211 {
13212     dVAR;
13213     struct regexp *const r = (struct regexp *)SvANY(rx);
13214     GET_RE_DEBUG_FLAGS_DECL;
13215
13216     PERL_ARGS_ASSERT_PREGFREE2;
13217
13218     if (r->mother_re) {
13219         ReREFCNT_dec(r->mother_re);
13220     } else {
13221         CALLREGFREE_PVT(rx); /* free the private data */
13222         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13223     }
13224     if (r->substrs) {
13225         SvREFCNT_dec(r->anchored_substr);
13226         SvREFCNT_dec(r->anchored_utf8);
13227         SvREFCNT_dec(r->float_substr);
13228         SvREFCNT_dec(r->float_utf8);
13229         Safefree(r->substrs);
13230     }
13231     RX_MATCH_COPY_FREE(rx);
13232 #ifdef PERL_OLD_COPY_ON_WRITE
13233     SvREFCNT_dec(r->saved_copy);
13234 #endif
13235     Safefree(r->offs);
13236     SvREFCNT_dec(r->qr_anoncv);
13237 }
13238
13239 /*  reg_temp_copy()
13240
13241     This is a hacky workaround to the structural issue of match results
13242     being stored in the regexp structure which is in turn stored in
13243     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13244     could be PL_curpm in multiple contexts, and could require multiple
13245     result sets being associated with the pattern simultaneously, such
13246     as when doing a recursive match with (??{$qr})
13247
13248     The solution is to make a lightweight copy of the regexp structure
13249     when a qr// is returned from the code executed by (??{$qr}) this
13250     lightweight copy doesn't actually own any of its data except for
13251     the starp/end and the actual regexp structure itself.
13252
13253 */
13254
13255
13256 REGEXP *
13257 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13258 {
13259     struct regexp *ret;
13260     struct regexp *const r = (struct regexp *)SvANY(rx);
13261     register const I32 npar = r->nparens+1;
13262
13263     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13264
13265     if (!ret_x)
13266         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13267     ret = (struct regexp *)SvANY(ret_x);
13268
13269     (void)ReREFCNT_inc(rx);
13270     /* We can take advantage of the existing "copied buffer" mechanism in SVs
13271        by pointing directly at the buffer, but flagging that the allocated
13272        space in the copy is zero. As we've just done a struct copy, it's now
13273        a case of zero-ing that, rather than copying the current length.  */
13274     SvPV_set(ret_x, RX_WRAPPED(rx));
13275     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13276     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13277            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13278     SvLEN_set(ret_x, 0);
13279     SvSTASH_set(ret_x, NULL);
13280     SvMAGIC_set(ret_x, NULL);
13281     Newx(ret->offs, npar, regexp_paren_pair);
13282     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13283     if (r->substrs) {
13284         Newx(ret->substrs, 1, struct reg_substr_data);
13285         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13286
13287         SvREFCNT_inc_void(ret->anchored_substr);
13288         SvREFCNT_inc_void(ret->anchored_utf8);
13289         SvREFCNT_inc_void(ret->float_substr);
13290         SvREFCNT_inc_void(ret->float_utf8);
13291
13292         /* check_substr and check_utf8, if non-NULL, point to either their
13293            anchored or float namesakes, and don't hold a second reference.  */
13294     }
13295     RX_MATCH_COPIED_off(ret_x);
13296 #ifdef PERL_OLD_COPY_ON_WRITE
13297     ret->saved_copy = NULL;
13298 #endif
13299     ret->mother_re = rx;
13300     SvREFCNT_inc_void(ret->qr_anoncv);
13301
13302     return ret_x;
13303 }
13304 #endif
13305
13306 /* regfree_internal()
13307
13308    Free the private data in a regexp. This is overloadable by
13309    extensions. Perl takes care of the regexp structure in pregfree(),
13310    this covers the *pprivate pointer which technically perl doesn't
13311    know about, however of course we have to handle the
13312    regexp_internal structure when no extension is in use.
13313
13314    Note this is called before freeing anything in the regexp
13315    structure.
13316  */
13317
13318 void
13319 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13320 {
13321     dVAR;
13322     struct regexp *const r = (struct regexp *)SvANY(rx);
13323     RXi_GET_DECL(r,ri);
13324     GET_RE_DEBUG_FLAGS_DECL;
13325
13326     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13327
13328     DEBUG_COMPILE_r({
13329         if (!PL_colorset)
13330             reginitcolors();
13331         {
13332             SV *dsv= sv_newmortal();
13333             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13334                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13335             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13336                 PL_colors[4],PL_colors[5],s);
13337         }
13338     });
13339 #ifdef RE_TRACK_PATTERN_OFFSETS
13340     if (ri->u.offsets)
13341         Safefree(ri->u.offsets);             /* 20010421 MJD */
13342 #endif
13343     if (ri->code_blocks) {
13344         int n;
13345         for (n = 0; n < ri->num_code_blocks; n++)
13346             SvREFCNT_dec(ri->code_blocks[n].src_regex);
13347         Safefree(ri->code_blocks);
13348     }
13349
13350     if (ri->data) {
13351         int n = ri->data->count;
13352         PAD* new_comppad = NULL;
13353         PAD* old_comppad;
13354         PADOFFSET refcnt;
13355
13356         while (--n >= 0) {
13357           /* If you add a ->what type here, update the comment in regcomp.h */
13358             switch (ri->data->what[n]) {
13359             case 'a':
13360             case 'r':
13361             case 's':
13362             case 'S':
13363             case 'u':
13364                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13365                 break;
13366             case 'f':
13367                 Safefree(ri->data->data[n]);
13368                 break;
13369             case 'p':
13370                 new_comppad = MUTABLE_AV(ri->data->data[n]);
13371                 break;
13372             case 'o':
13373                 if (new_comppad == NULL)
13374                     Perl_croak(aTHX_ "panic: pregfree comppad");
13375                 PAD_SAVE_LOCAL(old_comppad,
13376                     /* Watch out for global destruction's random ordering. */
13377                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
13378                 );
13379                 OP_REFCNT_LOCK;
13380                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
13381                 OP_REFCNT_UNLOCK;
13382                 if (!refcnt)
13383                     op_free((OP_4tree*)ri->data->data[n]);
13384
13385                 PAD_RESTORE_LOCAL(old_comppad);
13386                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
13387                 new_comppad = NULL;
13388                 break;
13389             case 'l':
13390             case 'L':
13391             case 'n':
13392                 break;
13393             case 'T':
13394                 { /* Aho Corasick add-on structure for a trie node.
13395                      Used in stclass optimization only */
13396                     U32 refcount;
13397                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13398                     OP_REFCNT_LOCK;
13399                     refcount = --aho->refcount;
13400                     OP_REFCNT_UNLOCK;
13401                     if ( !refcount ) {
13402                         PerlMemShared_free(aho->states);
13403                         PerlMemShared_free(aho->fail);
13404                          /* do this last!!!! */
13405                         PerlMemShared_free(ri->data->data[n]);
13406                         PerlMemShared_free(ri->regstclass);
13407                     }
13408                 }
13409                 break;
13410             case 't':
13411                 {
13412                     /* trie structure. */
13413                     U32 refcount;
13414                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13415                     OP_REFCNT_LOCK;
13416                     refcount = --trie->refcount;
13417                     OP_REFCNT_UNLOCK;
13418                     if ( !refcount ) {
13419                         PerlMemShared_free(trie->charmap);
13420                         PerlMemShared_free(trie->states);
13421                         PerlMemShared_free(trie->trans);
13422                         if (trie->bitmap)
13423                             PerlMemShared_free(trie->bitmap);
13424                         if (trie->jump)
13425                             PerlMemShared_free(trie->jump);
13426                         PerlMemShared_free(trie->wordinfo);
13427                         /* do this last!!!! */
13428                         PerlMemShared_free(ri->data->data[n]);
13429                     }
13430                 }
13431                 break;
13432             default:
13433                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13434             }
13435         }
13436         Safefree(ri->data->what);
13437         Safefree(ri->data);
13438     }
13439
13440     Safefree(ri);
13441 }
13442
13443 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13444 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13445 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
13446
13447 /*
13448    re_dup - duplicate a regexp.
13449
13450    This routine is expected to clone a given regexp structure. It is only
13451    compiled under USE_ITHREADS.
13452
13453    After all of the core data stored in struct regexp is duplicated
13454    the regexp_engine.dupe method is used to copy any private data
13455    stored in the *pprivate pointer. This allows extensions to handle
13456    any duplication it needs to do.
13457
13458    See pregfree() and regfree_internal() if you change anything here.
13459 */
13460 #if defined(USE_ITHREADS)
13461 #ifndef PERL_IN_XSUB_RE
13462 void
13463 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13464 {
13465     dVAR;
13466     I32 npar;
13467     const struct regexp *r = (const struct regexp *)SvANY(sstr);
13468     struct regexp *ret = (struct regexp *)SvANY(dstr);
13469
13470     PERL_ARGS_ASSERT_RE_DUP_GUTS;
13471
13472     npar = r->nparens+1;
13473     Newx(ret->offs, npar, regexp_paren_pair);
13474     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13475     if(ret->swap) {
13476         /* no need to copy these */
13477         Newx(ret->swap, npar, regexp_paren_pair);
13478     }
13479
13480     if (ret->substrs) {
13481         /* Do it this way to avoid reading from *r after the StructCopy().
13482            That way, if any of the sv_dup_inc()s dislodge *r from the L1
13483            cache, it doesn't matter.  */
13484         const bool anchored = r->check_substr
13485             ? r->check_substr == r->anchored_substr
13486             : r->check_utf8 == r->anchored_utf8;
13487         Newx(ret->substrs, 1, struct reg_substr_data);
13488         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13489
13490         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13491         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13492         ret->float_substr = sv_dup_inc(ret->float_substr, param);
13493         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13494
13495         /* check_substr and check_utf8, if non-NULL, point to either their
13496            anchored or float namesakes, and don't hold a second reference.  */
13497
13498         if (ret->check_substr) {
13499             if (anchored) {
13500                 assert(r->check_utf8 == r->anchored_utf8);
13501                 ret->check_substr = ret->anchored_substr;
13502                 ret->check_utf8 = ret->anchored_utf8;
13503             } else {
13504                 assert(r->check_substr == r->float_substr);
13505                 assert(r->check_utf8 == r->float_utf8);
13506                 ret->check_substr = ret->float_substr;
13507                 ret->check_utf8 = ret->float_utf8;
13508             }
13509         } else if (ret->check_utf8) {
13510             if (anchored) {
13511                 ret->check_utf8 = ret->anchored_utf8;
13512             } else {
13513                 ret->check_utf8 = ret->float_utf8;
13514             }
13515         }
13516     }
13517
13518     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13519     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13520
13521     if (ret->pprivate)
13522         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13523
13524     if (RX_MATCH_COPIED(dstr))
13525         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
13526     else
13527         ret->subbeg = NULL;
13528 #ifdef PERL_OLD_COPY_ON_WRITE
13529     ret->saved_copy = NULL;
13530 #endif
13531
13532     if (ret->mother_re) {
13533         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13534             /* Our storage points directly to our mother regexp, but that's
13535                1: a buffer in a different thread
13536                2: something we no longer hold a reference on
13537                so we need to copy it locally.  */
13538             /* Note we need to use SvCUR(), rather than
13539                SvLEN(), on our mother_re, because it, in
13540                turn, may well be pointing to its own mother_re.  */
13541             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13542                                    SvCUR(ret->mother_re)+1));
13543             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13544         }
13545         ret->mother_re      = NULL;
13546     }
13547     ret->gofs = 0;
13548 }
13549 #endif /* PERL_IN_XSUB_RE */
13550
13551 /*
13552    regdupe_internal()
13553
13554    This is the internal complement to regdupe() which is used to copy
13555    the structure pointed to by the *pprivate pointer in the regexp.
13556    This is the core version of the extension overridable cloning hook.
13557    The regexp structure being duplicated will be copied by perl prior
13558    to this and will be provided as the regexp *r argument, however
13559    with the /old/ structures pprivate pointer value. Thus this routine
13560    may override any copying normally done by perl.
13561
13562    It returns a pointer to the new regexp_internal structure.
13563 */
13564
13565 void *
13566 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13567 {
13568     dVAR;
13569     struct regexp *const r = (struct regexp *)SvANY(rx);
13570     regexp_internal *reti;
13571     int len;
13572     RXi_GET_DECL(r,ri);
13573
13574     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13575
13576     len = ProgLen(ri);
13577
13578     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13579     Copy(ri->program, reti->program, len+1, regnode);
13580
13581     reti->num_code_blocks = ri->num_code_blocks;
13582     if (ri->code_blocks) {
13583         int n;
13584         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13585                 struct reg_code_block);
13586         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13587                 struct reg_code_block);
13588         for (n = 0; n < ri->num_code_blocks; n++)
13589              reti->code_blocks[n].src_regex = (REGEXP*)
13590                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13591     }
13592     else
13593         reti->code_blocks = NULL;
13594
13595     reti->regstclass = NULL;
13596
13597     if (ri->data) {
13598         struct reg_data *d;
13599         const int count = ri->data->count;
13600         int i;
13601
13602         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13603                 char, struct reg_data);
13604         Newx(d->what, count, U8);
13605
13606         d->count = count;
13607         for (i = 0; i < count; i++) {
13608             d->what[i] = ri->data->what[i];
13609             switch (d->what[i]) {
13610                 /* legal options are one of: sSfpontTua
13611                    see also regcomp.h and pregfree() */
13612             case 'a': /* actually an AV, but the dup function is identical.  */
13613             case 'r':
13614             case 's':
13615             case 'S':
13616             case 'p': /* actually an AV, but the dup function is identical.  */
13617             case 'u': /* actually an HV, but the dup function is identical.  */
13618                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13619                 break;
13620             case 'f':
13621                 /* This is cheating. */
13622                 Newx(d->data[i], 1, struct regnode_charclass_class);
13623                 StructCopy(ri->data->data[i], d->data[i],
13624                             struct regnode_charclass_class);
13625                 reti->regstclass = (regnode*)d->data[i];
13626                 break;
13627             case 'o':
13628                 /* Compiled op trees are readonly and in shared memory,
13629                    and can thus be shared without duplication. */
13630                 OP_REFCNT_LOCK;
13631                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13632                 OP_REFCNT_UNLOCK;
13633                 break;
13634             case 'T':
13635                 /* Trie stclasses are readonly and can thus be shared
13636                  * without duplication. We free the stclass in pregfree
13637                  * when the corresponding reg_ac_data struct is freed.
13638                  */
13639                 reti->regstclass= ri->regstclass;
13640                 /* Fall through */
13641             case 't':
13642                 OP_REFCNT_LOCK;
13643                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13644                 OP_REFCNT_UNLOCK;
13645                 /* Fall through */
13646             case 'l':
13647             case 'L':
13648             case 'n':
13649                 d->data[i] = ri->data->data[i];
13650                 break;
13651             default:
13652                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13653             }
13654         }
13655
13656         reti->data = d;
13657     }
13658     else
13659         reti->data = NULL;
13660
13661     reti->name_list_idx = ri->name_list_idx;
13662
13663 #ifdef RE_TRACK_PATTERN_OFFSETS
13664     if (ri->u.offsets) {
13665         Newx(reti->u.offsets, 2*len+1, U32);
13666         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13667     }
13668 #else
13669     SetProgLen(reti,len);
13670 #endif
13671
13672     return (void*)reti;
13673 }
13674
13675 #endif    /* USE_ITHREADS */
13676
13677 #ifndef PERL_IN_XSUB_RE
13678
13679 /*
13680  - regnext - dig the "next" pointer out of a node
13681  */
13682 regnode *
13683 Perl_regnext(pTHX_ register regnode *p)
13684 {
13685     dVAR;
13686     register I32 offset;
13687
13688     if (!p)
13689         return(NULL);
13690
13691     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13692         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13693     }
13694
13695     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13696     if (offset == 0)
13697         return(NULL);
13698
13699     return(p+offset);
13700 }
13701 #endif
13702
13703 STATIC void
13704 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13705 {
13706     va_list args;
13707     STRLEN l1 = strlen(pat1);
13708     STRLEN l2 = strlen(pat2);
13709     char buf[512];
13710     SV *msv;
13711     const char *message;
13712
13713     PERL_ARGS_ASSERT_RE_CROAK2;
13714
13715     if (l1 > 510)
13716         l1 = 510;
13717     if (l1 + l2 > 510)
13718         l2 = 510 - l1;
13719     Copy(pat1, buf, l1 , char);
13720     Copy(pat2, buf + l1, l2 , char);
13721     buf[l1 + l2] = '\n';
13722     buf[l1 + l2 + 1] = '\0';
13723 #ifdef I_STDARG
13724     /* ANSI variant takes additional second argument */
13725     va_start(args, pat2);
13726 #else
13727     va_start(args);
13728 #endif
13729     msv = vmess(buf, &args);
13730     va_end(args);
13731     message = SvPV_const(msv,l1);
13732     if (l1 > 512)
13733         l1 = 512;
13734     Copy(message, buf, l1 , char);
13735     buf[l1-1] = '\0';                   /* Overwrite \n */
13736     Perl_croak(aTHX_ "%s", buf);
13737 }
13738
13739 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13740
13741 #ifndef PERL_IN_XSUB_RE
13742 void
13743 Perl_save_re_context(pTHX)
13744 {
13745     dVAR;
13746
13747     struct re_save_state *state;
13748
13749     SAVEVPTR(PL_curcop);
13750     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13751
13752     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13753     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13754     SSPUSHUV(SAVEt_RE_STATE);
13755
13756     Copy(&PL_reg_state, state, 1, struct re_save_state);
13757
13758     PL_reg_start_tmp = 0;
13759     PL_reg_start_tmpl = 0;
13760     PL_reg_oldsaved = NULL;
13761     PL_reg_oldsavedlen = 0;
13762     PL_reg_maxiter = 0;
13763     PL_reg_leftiter = 0;
13764     PL_reg_poscache = NULL;
13765     PL_reg_poscache_size = 0;
13766 #ifdef PERL_OLD_COPY_ON_WRITE
13767     PL_nrs = NULL;
13768 #endif
13769
13770     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13771     if (PL_curpm) {
13772         const REGEXP * const rx = PM_GETRE(PL_curpm);
13773         if (rx) {
13774             U32 i;
13775             for (i = 1; i <= RX_NPARENS(rx); i++) {
13776                 char digits[TYPE_CHARS(long)];
13777                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13778                 GV *const *const gvp
13779                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13780
13781                 if (gvp) {
13782                     GV * const gv = *gvp;
13783                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13784                         save_scalar(gv);
13785                 }
13786             }
13787         }
13788     }
13789 }
13790 #endif
13791
13792 static void
13793 clear_re(pTHX_ void *r)
13794 {
13795     dVAR;
13796     ReREFCNT_dec((REGEXP *)r);
13797 }
13798
13799 #ifdef DEBUGGING
13800
13801 STATIC void
13802 S_put_byte(pTHX_ SV *sv, int c)
13803 {
13804     PERL_ARGS_ASSERT_PUT_BYTE;
13805
13806     /* Our definition of isPRINT() ignores locales, so only bytes that are
13807        not part of UTF-8 are considered printable. I assume that the same
13808        holds for UTF-EBCDIC.
13809        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13810        which Wikipedia says:
13811
13812        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13813        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13814        identical, to the ASCII delete (DEL) or rubout control character.
13815        ) So the old condition can be simplified to !isPRINT(c)  */
13816     if (!isPRINT(c)) {
13817         if (c < 256) {
13818             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13819         }
13820         else {
13821             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13822         }
13823     }
13824     else {
13825         const char string = c;
13826         if (c == '-' || c == ']' || c == '\\' || c == '^')
13827             sv_catpvs(sv, "\\");
13828         sv_catpvn(sv, &string, 1);
13829     }
13830 }
13831
13832
13833 #define CLEAR_OPTSTART \
13834     if (optstart) STMT_START { \
13835             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13836             optstart=NULL; \
13837     } STMT_END
13838
13839 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13840
13841 STATIC const regnode *
13842 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13843             const regnode *last, const regnode *plast,
13844             SV* sv, I32 indent, U32 depth)
13845 {
13846     dVAR;
13847     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13848     register const regnode *next;
13849     const regnode *optstart= NULL;
13850
13851     RXi_GET_DECL(r,ri);
13852     GET_RE_DEBUG_FLAGS_DECL;
13853
13854     PERL_ARGS_ASSERT_DUMPUNTIL;
13855
13856 #ifdef DEBUG_DUMPUNTIL
13857     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13858         last ? last-start : 0,plast ? plast-start : 0);
13859 #endif
13860
13861     if (plast && plast < last)
13862         last= plast;
13863
13864     while (PL_regkind[op] != END && (!last || node < last)) {
13865         /* While that wasn't END last time... */
13866         NODE_ALIGN(node);
13867         op = OP(node);
13868         if (op == CLOSE || op == WHILEM)
13869             indent--;
13870         next = regnext((regnode *)node);
13871
13872         /* Where, what. */
13873         if (OP(node) == OPTIMIZED) {
13874             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13875                 optstart = node;
13876             else
13877                 goto after_print;
13878         } else
13879             CLEAR_OPTSTART;
13880
13881         regprop(r, sv, node);
13882         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13883                       (int)(2*indent + 1), "", SvPVX_const(sv));
13884
13885         if (OP(node) != OPTIMIZED) {
13886             if (next == NULL)           /* Next ptr. */
13887                 PerlIO_printf(Perl_debug_log, " (0)");
13888             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13889                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13890             else
13891                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13892             (void)PerlIO_putc(Perl_debug_log, '\n');
13893         }
13894
13895       after_print:
13896         if (PL_regkind[(U8)op] == BRANCHJ) {
13897             assert(next);
13898             {
13899                 register const regnode *nnode = (OP(next) == LONGJMP
13900                                              ? regnext((regnode *)next)
13901                                              : next);
13902                 if (last && nnode > last)
13903                     nnode = last;
13904                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13905             }
13906         }
13907         else if (PL_regkind[(U8)op] == BRANCH) {
13908             assert(next);
13909             DUMPUNTIL(NEXTOPER(node), next);
13910         }
13911         else if ( PL_regkind[(U8)op]  == TRIE ) {
13912             const regnode *this_trie = node;
13913             const char op = OP(node);
13914             const U32 n = ARG(node);
13915             const reg_ac_data * const ac = op>=AHOCORASICK ?
13916                (reg_ac_data *)ri->data->data[n] :
13917                NULL;
13918             const reg_trie_data * const trie =
13919                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13920 #ifdef DEBUGGING
13921             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13922 #endif
13923             const regnode *nextbranch= NULL;
13924             I32 word_idx;
13925             sv_setpvs(sv, "");
13926             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13927                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13928
13929                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13930                    (int)(2*(indent+3)), "",
13931                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13932                             PL_colors[0], PL_colors[1],
13933                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13934                             PERL_PV_PRETTY_ELLIPSES    |
13935                             PERL_PV_PRETTY_LTGT
13936                             )
13937                             : "???"
13938                 );
13939                 if (trie->jump) {
13940                     U16 dist= trie->jump[word_idx+1];
13941                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13942                                   (UV)((dist ? this_trie + dist : next) - start));
13943                     if (dist) {
13944                         if (!nextbranch)
13945                             nextbranch= this_trie + trie->jump[0];
13946                         DUMPUNTIL(this_trie + dist, nextbranch);
13947                     }
13948                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13949                         nextbranch= regnext((regnode *)nextbranch);
13950                 } else {
13951                     PerlIO_printf(Perl_debug_log, "\n");
13952                 }
13953             }
13954             if (last && next > last)
13955                 node= last;
13956             else
13957                 node= next;
13958         }
13959         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13960             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13961                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13962         }
13963         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13964             assert(next);
13965             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13966         }
13967         else if ( op == PLUS || op == STAR) {
13968             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13969         }
13970         else if (PL_regkind[(U8)op] == ANYOF) {
13971             /* arglen 1 + class block */
13972             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13973                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13974             node = NEXTOPER(node);
13975         }
13976         else if (PL_regkind[(U8)op] == EXACT) {
13977             /* Literal string, where present. */
13978             node += NODE_SZ_STR(node) - 1;
13979             node = NEXTOPER(node);
13980         }
13981         else {
13982             node = NEXTOPER(node);
13983             node += regarglen[(U8)op];
13984         }
13985         if (op == CURLYX || op == OPEN)
13986             indent++;
13987     }
13988     CLEAR_OPTSTART;
13989 #ifdef DEBUG_DUMPUNTIL
13990     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13991 #endif
13992     return node;
13993 }
13994
13995 #endif  /* DEBUGGING */
13996
13997 /*
13998  * Local variables:
13999  * c-indentation-style: bsd
14000  * c-basic-offset: 4
14001  * indent-tabs-mode: nil
14002  * End:
14003  *
14004  * ex: set ts=8 sts=4 sw=4 et:
14005  */