regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112
 113 typedef struct RExC_state_t {
 114     U32         flags;                  /* are we folding, multilining? */
 115     char        *precomp;               /* uncompiled string. */
 116     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 117     regexp      *rx;                    /* perl core regexp structure */
 118     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 119     char        *start;                 /* Start of input for compile */
 120     char        *end;                   /* End of input for compile */
 121     char        *parse;                 /* Input-scan pointer. */
 122     I32         whilem_seen;            /* number of WHILEM in this expr */
 123     regnode     *emit_start;            /* Start of emitted-code area */
 124     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 125     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 126     I32         naughty;                /* How bad is this pattern? */
 127     I32         sawback;                /* Did we see \1, ...? */
 128     U32         seen;
 129     I32         size;                   /* Code size. */
 130     I32         npar;                   /* Capture buffer count, (OPEN). */
 131     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 132     I32         nestroot;               /* root parens we are in - used by accept */
 133     I32         extralen;
 134     I32         seen_zerolen;
 135     I32         seen_evals;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     I32         in_lookbehind;
 151     I32         contains_locale;
 152     I32         override_recoding;
 153     struct reg_code_block *code_blocks; /* positions of literal (?{})
 154                                             within pattern */
 155     int         num_code_blocks;        /* size of code_blocks[] */
 156     int         code_index;             /* next code_blocks[] slot */
 157 #if ADD_TO_REGEXEC
 158     char        *starttry;              /* -Dr: where regtry was called. */
 159 #define RExC_starttry   (pRExC_state->starttry)
 160 #endif
 161 #ifdef DEBUGGING
 162     const char  *lastparse;
 163     I32         lastnum;
 164     AV          *paren_name_list;       /* idx -> name */
 165 #define RExC_lastparse  (pRExC_state->lastparse)
 166 #define RExC_lastnum    (pRExC_state->lastnum)
 167 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 168 #endif
 169 } RExC_state_t;
 170
 171 #define RExC_flags      (pRExC_state->flags)
 172 #define RExC_precomp    (pRExC_state->precomp)
 173 #define RExC_rx_sv      (pRExC_state->rx_sv)
 174 #define RExC_rx         (pRExC_state->rx)
 175 #define RExC_rxi        (pRExC_state->rxi)
 176 #define RExC_start      (pRExC_state->start)
 177 #define RExC_end        (pRExC_state->end)
 178 #define RExC_parse      (pRExC_state->parse)
 179 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 180 #ifdef RE_TRACK_PATTERN_OFFSETS
 181 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 182 #endif
 183 #define RExC_emit       (pRExC_state->emit)
 184 #define RExC_emit_start (pRExC_state->emit_start)
 185 #define RExC_emit_bound (pRExC_state->emit_bound)
 186 #define RExC_naughty    (pRExC_state->naughty)
 187 #define RExC_sawback    (pRExC_state->sawback)
 188 #define RExC_seen       (pRExC_state->seen)
 189 #define RExC_size       (pRExC_state->size)
 190 #define RExC_npar       (pRExC_state->npar)
 191 #define RExC_nestroot   (pRExC_state->nestroot)
 192 #define RExC_extralen   (pRExC_state->extralen)
 193 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 194 #define RExC_seen_evals (pRExC_state->seen_evals)
 195 #define RExC_utf8       (pRExC_state->utf8)
 196 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 197 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 198 #define RExC_open_parens        (pRExC_state->open_parens)
 199 #define RExC_close_parens       (pRExC_state->close_parens)
 200 #define RExC_opend      (pRExC_state->opend)
 201 #define RExC_paren_names        (pRExC_state->paren_names)
 202 #define RExC_recurse    (pRExC_state->recurse)
 203 #define RExC_recurse_count      (pRExC_state->recurse_count)
 204 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 205 #define RExC_contains_locale    (pRExC_state->contains_locale)
 206 #define RExC_override_recoding  (pRExC_state->override_recoding)
 207
 208
 209 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 210 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 211         ((*s) == '{' && regcurly(s)))
 212
 213 #ifdef SPSTART
 214 #undef SPSTART          /* dratted cpp namespace... */
 215 #endif
 216 /*
 217  * Flags to be passed up and down.
 218  */
 219 #define WORST           0       /* Worst case. */
 220 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 221
 222 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 223  * character, and if utf8, must be invariant.  Note that this is not the same
 224  * thing as REGNODE_SIMPLE */
 225 #define SIMPLE          0x02
 226 #define SPSTART         0x04    /* Starts with * or +. */
 227 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 228 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 229
 230 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 231
 232 /* whether trie related optimizations are enabled */
 233 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 234 #define TRIE_STUDY_OPT
 235 #define FULL_TRIE_STUDY
 236 #define TRIE_STCLASS
 237 #endif
 238
 239
 240
 241 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 242 #define PBITVAL(paren) (1 << ((paren) & 7))
 243 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 244 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 245 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 246
 247 /* If not already in utf8, do a longjmp back to the beginning */
 248 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 249 #define REQUIRE_UTF8    STMT_START {                                       \
 250                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 251                         } STMT_END
 252
 253 /* About scan_data_t.
 254
 255   During optimisation we recurse through the regexp program performing
 256   various inplace (keyhole style) optimisations. In addition study_chunk
 257   and scan_commit populate this data structure with information about
 258   what strings MUST appear in the pattern. We look for the longest
 259   string that must appear at a fixed location, and we look for the
 260   longest string that may appear at a floating location. So for instance
 261   in the pattern:
 262
 263     /FOO[xX]A.*B[xX]BAR/
 264
 265   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 266   strings (because they follow a .* construct). study_chunk will identify
 267   both FOO and BAR as being the longest fixed and floating strings respectively.
 268
 269   The strings can be composites, for instance
 270
 271      /(f)(o)(o)/
 272
 273   will result in a composite fixed substring 'foo'.
 274
 275   For each string some basic information is maintained:
 276
 277   - offset or min_offset
 278     This is the position the string must appear at, or not before.
 279     It also implicitly (when combined with minlenp) tells us how many
 280     characters must match before the string we are searching for.
 281     Likewise when combined with minlenp and the length of the string it
 282     tells us how many characters must appear after the string we have
 283     found.
 284
 285   - max_offset
 286     Only used for floating strings. This is the rightmost point that
 287     the string can appear at. If set to I32 max it indicates that the
 288     string can occur infinitely far to the right.
 289
 290   - minlenp
 291     A pointer to the minimum length of the pattern that the string
 292     was found inside. This is important as in the case of positive
 293     lookahead or positive lookbehind we can have multiple patterns
 294     involved. Consider
 295
 296     /(?=FOO).*F/
 297
 298     The minimum length of the pattern overall is 3, the minimum length
 299     of the lookahead part is 3, but the minimum length of the part that
 300     will actually match is 1. So 'FOO's minimum length is 3, but the
 301     minimum length for the F is 1. This is important as the minimum length
 302     is used to determine offsets in front of and behind the string being
 303     looked for.  Since strings can be composites this is the length of the
 304     pattern at the time it was committed with a scan_commit. Note that
 305     the length is calculated by study_chunk, so that the minimum lengths
 306     are not known until the full pattern has been compiled, thus the
 307     pointer to the value.
 308
 309   - lookbehind
 310
 311     In the case of lookbehind the string being searched for can be
 312     offset past the start point of the final matching string.
 313     If this value was just blithely removed from the min_offset it would
 314     invalidate some of the calculations for how many chars must match
 315     before or after (as they are derived from min_offset and minlen and
 316     the length of the string being searched for).
 317     When the final pattern is compiled and the data is moved from the
 318     scan_data_t structure into the regexp structure the information
 319     about lookbehind is factored in, with the information that would
 320     have been lost precalculated in the end_shift field for the
 321     associated string.
 322
 323   The fields pos_min and pos_delta are used to store the minimum offset
 324   and the delta to the maximum offset at the current point in the pattern.
 325
 326 */
 327
 328 typedef struct scan_data_t {
 329     /*I32 len_min;      unused */
 330     /*I32 len_delta;    unused */
 331     I32 pos_min;
 332     I32 pos_delta;
 333     SV *last_found;
 334     I32 last_end;           /* min value, <0 unless valid. */
 335     I32 last_start_min;
 336     I32 last_start_max;
 337     SV **longest;           /* Either &l_fixed, or &l_float. */
 338     SV *longest_fixed;      /* longest fixed string found in pattern */
 339     I32 offset_fixed;       /* offset where it starts */
 340     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 341     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 342     SV *longest_float;      /* longest floating string found in pattern */
 343     I32 offset_float_min;   /* earliest point in string it can appear */
 344     I32 offset_float_max;   /* latest point in string it can appear */
 345     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 346     I32 lookbehind_float;   /* is the position of the string modified by LB */
 347     I32 flags;
 348     I32 whilem_c;
 349     I32 *last_closep;
 350     struct regnode_charclass_class *start_class;
 351 } scan_data_t;
 352
 353 /*
 354  * Forward declarations for pregcomp()'s friends.
 355  */
 356
 357 static const scan_data_t zero_scan_data =
 358   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 359
 360 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 361 #define SF_BEFORE_SEOL          0x0001
 362 #define SF_BEFORE_MEOL          0x0002
 363 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 364 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 365
 366 #ifdef NO_UNARY_PLUS
 367 #  define SF_FIX_SHIFT_EOL      (0+2)
 368 #  define SF_FL_SHIFT_EOL               (0+4)
 369 #else
 370 #  define SF_FIX_SHIFT_EOL      (+2)
 371 #  define SF_FL_SHIFT_EOL               (+4)
 372 #endif
 373
 374 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 375 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 376
 377 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 378 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 379 #define SF_IS_INF               0x0040
 380 #define SF_HAS_PAR              0x0080
 381 #define SF_IN_PAR               0x0100
 382 #define SF_HAS_EVAL             0x0200
 383 #define SCF_DO_SUBSTR           0x0400
 384 #define SCF_DO_STCLASS_AND      0x0800
 385 #define SCF_DO_STCLASS_OR       0x1000
 386 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 387 #define SCF_WHILEM_VISITED_POS  0x2000
 388
 389 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 390 #define SCF_SEEN_ACCEPT         0x8000
 391
 392 #define UTF cBOOL(RExC_utf8)
 393
 394 /* The enums for all these are ordered so things work out correctly */
 395 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 396 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 397 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 398 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 399 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 400 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 402
 403 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 404
 405 #define OOB_UNICODE             12345678
 406 #define OOB_NAMEDCLASS          -1
 407
 408 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 409 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 410
 411
 412 /* length of regex to show in messages that don't mark a position within */
 413 #define RegexLengthToShowInErrorMessages 127
 414
 415 /*
 416  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 417  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 418  * op/pragma/warn/regcomp.
 419  */
 420 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 421 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 422
 423 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 424
 425 /*
 426  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 427  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 428  * "...".
 429  */
 430 #define _FAIL(code) STMT_START {                                        \
 431     const char *ellipses = "";                                          \
 432     IV len = RExC_end - RExC_precomp;                                   \
 433                                                                         \
 434     if (!SIZE_ONLY)                                                     \
 435         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 436     if (len > RegexLengthToShowInErrorMessages) {                       \
 437         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 438         len = RegexLengthToShowInErrorMessages - 10;                    \
 439         ellipses = "...";                                               \
 440     }                                                                   \
 441     code;                                                               \
 442 } STMT_END
 443
 444 #define FAIL(msg) _FAIL(                            \
 445     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 446             msg, (int)len, RExC_precomp, ellipses))
 447
 448 #define FAIL2(msg,arg) _FAIL(                       \
 449     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 450             arg, (int)len, RExC_precomp, ellipses))
 451
 452 /*
 453  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 454  */
 455 #define Simple_vFAIL(m) STMT_START {                                    \
 456     const IV offset = RExC_parse - RExC_precomp;                        \
 457     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 458             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 459 } STMT_END
 460
 461 /*
 462  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 463  */
 464 #define vFAIL(m) STMT_START {                           \
 465     if (!SIZE_ONLY)                                     \
 466         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 467     Simple_vFAIL(m);                                    \
 468 } STMT_END
 469
 470 /*
 471  * Like Simple_vFAIL(), but accepts two arguments.
 472  */
 473 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 474     const IV offset = RExC_parse - RExC_precomp;                        \
 475     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 476             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 477 } STMT_END
 478
 479 /*
 480  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 481  */
 482 #define vFAIL2(m,a1) STMT_START {                       \
 483     if (!SIZE_ONLY)                                     \
 484         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 485     Simple_vFAIL2(m, a1);                               \
 486 } STMT_END
 487
 488
 489 /*
 490  * Like Simple_vFAIL(), but accepts three arguments.
 491  */
 492 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 493     const IV offset = RExC_parse - RExC_precomp;                \
 494     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 495             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 496 } STMT_END
 497
 498 /*
 499  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 500  */
 501 #define vFAIL3(m,a1,a2) STMT_START {                    \
 502     if (!SIZE_ONLY)                                     \
 503         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 504     Simple_vFAIL3(m, a1, a2);                           \
 505 } STMT_END
 506
 507 /*
 508  * Like Simple_vFAIL(), but accepts four arguments.
 509  */
 510 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 511     const IV offset = RExC_parse - RExC_precomp;                \
 512     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 513             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 514 } STMT_END
 515
 516 #define ckWARNreg(loc,m) STMT_START {                                   \
 517     const IV offset = loc - RExC_precomp;                               \
 518     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 519             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 520 } STMT_END
 521
 522 #define ckWARNregdep(loc,m) STMT_START {                                \
 523     const IV offset = loc - RExC_precomp;                               \
 524     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 525             m REPORT_LOCATION,                                          \
 526             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 527 } STMT_END
 528
 529 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 530     const IV offset = loc - RExC_precomp;                               \
 531     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 532             m REPORT_LOCATION,                                          \
 533             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 534 } STMT_END
 535
 536 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 537     const IV offset = loc - RExC_precomp;                               \
 538     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 539             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 540 } STMT_END
 541
 542 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 543     const IV offset = loc - RExC_precomp;                               \
 544     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 545             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 546 } STMT_END
 547
 548 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 549     const IV offset = loc - RExC_precomp;                               \
 550     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 551             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 552 } STMT_END
 553
 554 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 555     const IV offset = loc - RExC_precomp;                               \
 556     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 557             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 558 } STMT_END
 559
 560 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 561     const IV offset = loc - RExC_precomp;                               \
 562     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 563             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 564 } STMT_END
 565
 566 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 567     const IV offset = loc - RExC_precomp;                               \
 568     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 569             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 570 } STMT_END
 571
 572
 573 /* Allow for side effects in s */
 574 #define REGC(c,s) STMT_START {                  \
 575     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 576 } STMT_END
 577
 578 /* Macros for recording node offsets.   20001227 mjd@plover.com
 579  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 580  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 581  * Element 0 holds the number n.
 582  * Position is 1 indexed.
 583  */
 584 #ifndef RE_TRACK_PATTERN_OFFSETS
 585 #define Set_Node_Offset_To_R(node,byte)
 586 #define Set_Node_Offset(node,byte)
 587 #define Set_Cur_Node_Offset
 588 #define Set_Node_Length_To_R(node,len)
 589 #define Set_Node_Length(node,len)
 590 #define Set_Node_Cur_Length(node)
 591 #define Node_Offset(n)
 592 #define Node_Length(n)
 593 #define Set_Node_Offset_Length(node,offset,len)
 594 #define ProgLen(ri) ri->u.proglen
 595 #define SetProgLen(ri,x) ri->u.proglen = x
 596 #else
 597 #define ProgLen(ri) ri->u.offsets[0]
 598 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 599 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 600     if (! SIZE_ONLY) {                                                  \
 601         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 602                     __LINE__, (int)(node), (int)(byte)));               \
 603         if((node) < 0) {                                                \
 604             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 605         } else {                                                        \
 606             RExC_offsets[2*(node)-1] = (byte);                          \
 607         }                                                               \
 608     }                                                                   \
 609 } STMT_END
 610
 611 #define Set_Node_Offset(node,byte) \
 612     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 613 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 614
 615 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 616     if (! SIZE_ONLY) {                                                  \
 617         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 618                 __LINE__, (int)(node), (int)(len)));                    \
 619         if((node) < 0) {                                                \
 620             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 621         } else {                                                        \
 622             RExC_offsets[2*(node)] = (len);                             \
 623         }                                                               \
 624     }                                                                   \
 625 } STMT_END
 626
 627 #define Set_Node_Length(node,len) \
 628     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 629 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 630 #define Set_Node_Cur_Length(node) \
 631     Set_Node_Length(node, RExC_parse - parse_start)
 632
 633 /* Get offsets and lengths */
 634 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 635 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 636
 637 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 638     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 639     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 640 } STMT_END
 641 #endif
 642
 643 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 644 #define EXPERIMENTAL_INPLACESCAN
 645 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 646
 647 #define DEBUG_STUDYDATA(str,data,depth)                              \
 648 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 649     PerlIO_printf(Perl_debug_log,                                    \
 650         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 651         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 652         (int)(depth)*2, "",                                          \
 653         (IV)((data)->pos_min),                                       \
 654         (IV)((data)->pos_delta),                                     \
 655         (UV)((data)->flags),                                         \
 656         (IV)((data)->whilem_c),                                      \
 657         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 658         is_inf ? "INF " : ""                                         \
 659     );                                                               \
 660     if ((data)->last_found)                                          \
 661         PerlIO_printf(Perl_debug_log,                                \
 662             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 663             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 664             SvPVX_const((data)->last_found),                         \
 665             (IV)((data)->last_end),                                  \
 666             (IV)((data)->last_start_min),                            \
 667             (IV)((data)->last_start_max),                            \
 668             ((data)->longest &&                                      \
 669              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 670             SvPVX_const((data)->longest_fixed),                      \
 671             (IV)((data)->offset_fixed),                              \
 672             ((data)->longest &&                                      \
 673              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 674             SvPVX_const((data)->longest_float),                      \
 675             (IV)((data)->offset_float_min),                          \
 676             (IV)((data)->offset_float_max)                           \
 677         );                                                           \
 678     PerlIO_printf(Perl_debug_log,"\n");                              \
 679 });
 680
 681 static void clear_re(pTHX_ void *r);
 682
 683 /* Mark that we cannot extend a found fixed substring at this point.
 684    Update the longest found anchored substring and the longest found
 685    floating substrings if needed. */
 686
 687 STATIC void
 688 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 689 {
 690     const STRLEN l = CHR_SVLEN(data->last_found);
 691     const STRLEN old_l = CHR_SVLEN(*data->longest);
 692     GET_RE_DEBUG_FLAGS_DECL;
 693
 694     PERL_ARGS_ASSERT_SCAN_COMMIT;
 695
 696     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 697         SvSetMagicSV(*data->longest, data->last_found);
 698         if (*data->longest == data->longest_fixed) {
 699             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 700             if (data->flags & SF_BEFORE_EOL)
 701                 data->flags
 702                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 703             else
 704                 data->flags &= ~SF_FIX_BEFORE_EOL;
 705             data->minlen_fixed=minlenp;
 706             data->lookbehind_fixed=0;
 707         }
 708         else { /* *data->longest == data->longest_float */
 709             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 710             data->offset_float_max = (l
 711                                       ? data->last_start_max
 712                                       : data->pos_min + data->pos_delta);
 713             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 714                 data->offset_float_max = I32_MAX;
 715             if (data->flags & SF_BEFORE_EOL)
 716                 data->flags
 717                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 718             else
 719                 data->flags &= ~SF_FL_BEFORE_EOL;
 720             data->minlen_float=minlenp;
 721             data->lookbehind_float=0;
 722         }
 723     }
 724     SvCUR_set(data->last_found, 0);
 725     {
 726         SV * const sv = data->last_found;
 727         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 728             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 729             if (mg)
 730                 mg->mg_len = 0;
 731         }
 732     }
 733     data->last_end = -1;
 734     data->flags &= ~SF_BEFORE_EOL;
 735     DEBUG_STUDYDATA("commit: ",data,0);
 736 }
 737
 738 /* Can match anything (initialization) */
 739 STATIC void
 740 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 741 {
 742     PERL_ARGS_ASSERT_CL_ANYTHING;
 743
 744     ANYOF_BITMAP_SETALL(cl);
 745     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 746                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 747
 748     /* If any portion of the regex is to operate under locale rules,
 749      * initialization includes it.  The reason this isn't done for all regexes
 750      * is that the optimizer was written under the assumption that locale was
 751      * all-or-nothing.  Given the complexity and lack of documentation in the
 752      * optimizer, and that there are inadequate test cases for locale, so many
 753      * parts of it may not work properly, it is safest to avoid locale unless
 754      * necessary. */
 755     if (RExC_contains_locale) {
 756         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 757         cl->flags |= ANYOF_LOCALE;
 758     }
 759     else {
 760         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 761     }
 762 }
 763
 764 /* Can match anything (initialization) */
 765 STATIC int
 766 S_cl_is_anything(const struct regnode_charclass_class *cl)
 767 {
 768     int value;
 769
 770     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 771
 772     for (value = 0; value <= ANYOF_MAX; value += 2)
 773         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 774             return 1;
 775     if (!(cl->flags & ANYOF_UNICODE_ALL))
 776         return 0;
 777     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 778         return 0;
 779     return 1;
 780 }
 781
 782 /* Can match anything (initialization) */
 783 STATIC void
 784 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 785 {
 786     PERL_ARGS_ASSERT_CL_INIT;
 787
 788     Zero(cl, 1, struct regnode_charclass_class);
 789     cl->type = ANYOF;
 790     cl_anything(pRExC_state, cl);
 791     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 792 }
 793
 794 /* These two functions currently do the exact same thing */
 795 #define cl_init_zero            S_cl_init
 796
 797 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 798  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 799  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 800 STATIC void
 801 S_cl_and(struct regnode_charclass_class *cl,
 802         const struct regnode_charclass_class *and_with)
 803 {
 804     PERL_ARGS_ASSERT_CL_AND;
 805
 806     assert(and_with->type == ANYOF);
 807
 808     /* I (khw) am not sure all these restrictions are necessary XXX */
 809     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 810         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 811         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 812         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 813         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 814         int i;
 815
 816         if (and_with->flags & ANYOF_INVERT)
 817             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 818                 cl->bitmap[i] &= ~and_with->bitmap[i];
 819         else
 820             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 821                 cl->bitmap[i] &= and_with->bitmap[i];
 822     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 823
 824     if (and_with->flags & ANYOF_INVERT) {
 825
 826         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 827          * aren't affected by the inversion.  Those that are affected are
 828          * handled individually below */
 829         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 830         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 831         cl->flags |= affected_flags;
 832
 833         /* We currently don't know how to deal with things that aren't in the
 834          * bitmap, but we know that the intersection is no greater than what
 835          * is already in cl, so let there be false positives that get sorted
 836          * out after the synthetic start class succeeds, and the node is
 837          * matched for real. */
 838
 839         /* The inversion of these two flags indicate that the resulting
 840          * intersection doesn't have them */
 841         if (and_with->flags & ANYOF_UNICODE_ALL) {
 842             cl->flags &= ~ANYOF_UNICODE_ALL;
 843         }
 844         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 845             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 846         }
 847     }
 848     else {   /* and'd node is not inverted */
 849         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 850
 851         if (! ANYOF_NONBITMAP(and_with)) {
 852
 853             /* Here 'and_with' doesn't match anything outside the bitmap
 854              * (except possibly ANYOF_UNICODE_ALL), which means the
 855              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 856              * which case we don't know what the intersection is, but it's no
 857              * greater than what cl already has, so can just leave it alone,
 858              * with possible false positives */
 859             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 860                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 861                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 862             }
 863         }
 864         else if (! ANYOF_NONBITMAP(cl)) {
 865
 866             /* Here, 'and_with' does match something outside the bitmap, and cl
 867              * doesn't have a list of things to match outside the bitmap.  If
 868              * cl can match all code points above 255, the intersection will
 869              * be those above-255 code points that 'and_with' matches.  If cl
 870              * can't match all Unicode code points, it means that it can't
 871              * match anything outside the bitmap (since the 'if' that got us
 872              * into this block tested for that), so we leave the bitmap empty.
 873              */
 874             if (cl->flags & ANYOF_UNICODE_ALL) {
 875                 ARG_SET(cl, ARG(and_with));
 876
 877                 /* and_with's ARG may match things that don't require UTF8.
 878                  * And now cl's will too, in spite of this being an 'and'.  See
 879                  * the comments below about the kludge */
 880                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 881             }
 882         }
 883         else {
 884             /* Here, both 'and_with' and cl match something outside the
 885              * bitmap.  Currently we do not do the intersection, so just match
 886              * whatever cl had at the beginning.  */
 887         }
 888
 889
 890         /* Take the intersection of the two sets of flags.  However, the
 891          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 892          * kludge around the fact that this flag is not treated like the others
 893          * which are initialized in cl_anything().  The way the optimizer works
 894          * is that the synthetic start class (SSC) is initialized to match
 895          * anything, and then the first time a real node is encountered, its
 896          * values are AND'd with the SSC's with the result being the values of
 897          * the real node.  However, there are paths through the optimizer where
 898          * the AND never gets called, so those initialized bits are set
 899          * inappropriately, which is not usually a big deal, as they just cause
 900          * false positives in the SSC, which will just mean a probably
 901          * imperceptible slow down in execution.  However this bit has a
 902          * higher false positive consequence in that it can cause utf8.pm,
 903          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 904          * bigger slowdown and also causes significant extra memory to be used.
 905          * In order to prevent this, the code now takes a different tack.  The
 906          * bit isn't set unless some part of the regular expression needs it,
 907          * but once set it won't get cleared.  This means that these extra
 908          * modules won't get loaded unless there was some path through the
 909          * pattern that would have required them anyway, and  so any false
 910          * positives that occur by not ANDing them out when they could be
 911          * aren't as severe as they would be if we treated this bit like all
 912          * the others */
 913         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 914                                       & ANYOF_NONBITMAP_NON_UTF8;
 915         cl->flags &= and_with->flags;
 916         cl->flags |= outside_bitmap_but_not_utf8;
 917     }
 918 }
 919
 920 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 921  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 922  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 923 STATIC void
 924 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 925 {
 926     PERL_ARGS_ASSERT_CL_OR;
 927
 928     if (or_with->flags & ANYOF_INVERT) {
 929
 930         /* Here, the or'd node is to be inverted.  This means we take the
 931          * complement of everything not in the bitmap, but currently we don't
 932          * know what that is, so give up and match anything */
 933         if (ANYOF_NONBITMAP(or_with)) {
 934             cl_anything(pRExC_state, cl);
 935         }
 936         /* We do not use
 937          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 938          *   <= (B1 | !B2) | (CL1 | !CL2)
 939          * which is wasteful if CL2 is small, but we ignore CL2:
 940          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 941          * XXXX Can we handle case-fold?  Unclear:
 942          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 943          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 944          */
 945         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 946              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 947              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 948             int i;
 949
 950             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 951                 cl->bitmap[i] |= ~or_with->bitmap[i];
 952         } /* XXXX: logic is complicated otherwise */
 953         else {
 954             cl_anything(pRExC_state, cl);
 955         }
 956
 957         /* And, we can just take the union of the flags that aren't affected
 958          * by the inversion */
 959         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 960
 961         /* For the remaining flags:
 962             ANYOF_UNICODE_ALL and inverted means to not match anything above
 963                     255, which means that the union with cl should just be
 964                     what cl has in it, so can ignore this flag
 965             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 966                     is 127-255 to match them, but then invert that, so the
 967                     union with cl should just be what cl has in it, so can
 968                     ignore this flag
 969          */
 970     } else {    /* 'or_with' is not inverted */
 971         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 972         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 973              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 974                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 975             int i;
 976
 977             /* OR char bitmap and class bitmap separately */
 978             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 979                 cl->bitmap[i] |= or_with->bitmap[i];
 980             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 981                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 982                     cl->classflags[i] |= or_with->classflags[i];
 983                 cl->flags |= ANYOF_CLASS;
 984             }
 985         }
 986         else { /* XXXX: logic is complicated, leave it along for a moment. */
 987             cl_anything(pRExC_state, cl);
 988         }
 989
 990         if (ANYOF_NONBITMAP(or_with)) {
 991
 992             /* Use the added node's outside-the-bit-map match if there isn't a
 993              * conflict.  If there is a conflict (both nodes match something
 994              * outside the bitmap, but what they match outside is not the same
 995              * pointer, and hence not easily compared until XXX we extend
 996              * inversion lists this far), give up and allow the start class to
 997              * match everything outside the bitmap.  If that stuff is all above
 998              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 999             if (! ANYOF_NONBITMAP(cl)) {
1000                 ARG_SET(cl, ARG(or_with));
1001             }
1002             else if (ARG(cl) != ARG(or_with)) {
1003
1004                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1005                     cl_anything(pRExC_state, cl);
1006                 }
1007                 else {
1008                     cl->flags |= ANYOF_UNICODE_ALL;
1009                 }
1010             }
1011         }
1012
1013         /* Take the union */
1014         cl->flags |= or_with->flags;
1015     }
1016 }
1017
1018 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1019 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1020 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1021 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1022
1023
1024 #ifdef DEBUGGING
1025 /*
1026    dump_trie(trie,widecharmap,revcharmap)
1027    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1028    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1029
1030    These routines dump out a trie in a somewhat readable format.
1031    The _interim_ variants are used for debugging the interim
1032    tables that are used to generate the final compressed
1033    representation which is what dump_trie expects.
1034
1035    Part of the reason for their existence is to provide a form
1036    of documentation as to how the different representations function.
1037
1038 */
1039
1040 /*
1041   Dumps the final compressed table form of the trie to Perl_debug_log.
1042   Used for debugging make_trie().
1043 */
1044
1045 STATIC void
1046 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1047             AV *revcharmap, U32 depth)
1048 {
1049     U32 state;
1050     SV *sv=sv_newmortal();
1051     int colwidth= widecharmap ? 6 : 4;
1052     U16 word;
1053     GET_RE_DEBUG_FLAGS_DECL;
1054
1055     PERL_ARGS_ASSERT_DUMP_TRIE;
1056
1057     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1058         (int)depth * 2 + 2,"",
1059         "Match","Base","Ofs" );
1060
1061     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1062         SV ** const tmp = av_fetch( revcharmap, state, 0);
1063         if ( tmp ) {
1064             PerlIO_printf( Perl_debug_log, "%*s",
1065                 colwidth,
1066                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1067                             PL_colors[0], PL_colors[1],
1068                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1069                             PERL_PV_ESCAPE_FIRSTCHAR
1070                 )
1071             );
1072         }
1073     }
1074     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1075         (int)depth * 2 + 2,"");
1076
1077     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1078         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1079     PerlIO_printf( Perl_debug_log, "\n");
1080
1081     for( state = 1 ; state < trie->statecount ; state++ ) {
1082         const U32 base = trie->states[ state ].trans.base;
1083
1084         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1085
1086         if ( trie->states[ state ].wordnum ) {
1087             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1088         } else {
1089             PerlIO_printf( Perl_debug_log, "%6s", "" );
1090         }
1091
1092         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1093
1094         if ( base ) {
1095             U32 ofs = 0;
1096
1097             while( ( base + ofs  < trie->uniquecharcount ) ||
1098                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1099                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1100                     ofs++;
1101
1102             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1103
1104             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1105                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1106                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1107                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1108                 {
1109                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1110                     colwidth,
1111                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1112                 } else {
1113                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1114                 }
1115             }
1116
1117             PerlIO_printf( Perl_debug_log, "]");
1118
1119         }
1120         PerlIO_printf( Perl_debug_log, "\n" );
1121     }
1122     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1123     for (word=1; word <= trie->wordcount; word++) {
1124         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1125             (int)word, (int)(trie->wordinfo[word].prev),
1126             (int)(trie->wordinfo[word].len));
1127     }
1128     PerlIO_printf(Perl_debug_log, "\n" );
1129 }
1130 /*
1131   Dumps a fully constructed but uncompressed trie in list form.
1132   List tries normally only are used for construction when the number of
1133   possible chars (trie->uniquecharcount) is very high.
1134   Used for debugging make_trie().
1135 */
1136 STATIC void
1137 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1138                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1139                          U32 depth)
1140 {
1141     U32 state;
1142     SV *sv=sv_newmortal();
1143     int colwidth= widecharmap ? 6 : 4;
1144     GET_RE_DEBUG_FLAGS_DECL;
1145
1146     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1147
1148     /* print out the table precompression.  */
1149     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1150         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1151         "------:-----+-----------------\n" );
1152
1153     for( state=1 ; state < next_alloc ; state ++ ) {
1154         U16 charid;
1155
1156         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1157             (int)depth * 2 + 2,"", (UV)state  );
1158         if ( ! trie->states[ state ].wordnum ) {
1159             PerlIO_printf( Perl_debug_log, "%5s| ","");
1160         } else {
1161             PerlIO_printf( Perl_debug_log, "W%4x| ",
1162                 trie->states[ state ].wordnum
1163             );
1164         }
1165         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1166             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1167             if ( tmp ) {
1168                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1169                     colwidth,
1170                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1171                             PL_colors[0], PL_colors[1],
1172                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1173                             PERL_PV_ESCAPE_FIRSTCHAR
1174                     ) ,
1175                     TRIE_LIST_ITEM(state,charid).forid,
1176                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1177                 );
1178                 if (!(charid % 10))
1179                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1180                         (int)((depth * 2) + 14), "");
1181             }
1182         }
1183         PerlIO_printf( Perl_debug_log, "\n");
1184     }
1185 }
1186
1187 /*
1188   Dumps a fully constructed but uncompressed trie in table form.
1189   This is the normal DFA style state transition table, with a few
1190   twists to facilitate compression later.
1191   Used for debugging make_trie().
1192 */
1193 STATIC void
1194 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1195                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1196                           U32 depth)
1197 {
1198     U32 state;
1199     U16 charid;
1200     SV *sv=sv_newmortal();
1201     int colwidth= widecharmap ? 6 : 4;
1202     GET_RE_DEBUG_FLAGS_DECL;
1203
1204     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1205
1206     /*
1207        print out the table precompression so that we can do a visual check
1208        that they are identical.
1209      */
1210
1211     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1212
1213     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1214         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1215         if ( tmp ) {
1216             PerlIO_printf( Perl_debug_log, "%*s",
1217                 colwidth,
1218                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1219                             PL_colors[0], PL_colors[1],
1220                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1221                             PERL_PV_ESCAPE_FIRSTCHAR
1222                 )
1223             );
1224         }
1225     }
1226
1227     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1228
1229     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1230         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1231     }
1232
1233     PerlIO_printf( Perl_debug_log, "\n" );
1234
1235     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1236
1237         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1238             (int)depth * 2 + 2,"",
1239             (UV)TRIE_NODENUM( state ) );
1240
1241         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1242             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1243             if (v)
1244                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1245             else
1246                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1247         }
1248         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1249             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1250         } else {
1251             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1252             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1253         }
1254     }
1255 }
1256
1257 #endif
1258
1259
1260 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1261   startbranch: the first branch in the whole branch sequence
1262   first      : start branch of sequence of branch-exact nodes.
1263                May be the same as startbranch
1264   last       : Thing following the last branch.
1265                May be the same as tail.
1266   tail       : item following the branch sequence
1267   count      : words in the sequence
1268   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1269   depth      : indent depth
1270
1271 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1272
1273 A trie is an N'ary tree where the branches are determined by digital
1274 decomposition of the key. IE, at the root node you look up the 1st character and
1275 follow that branch repeat until you find the end of the branches. Nodes can be
1276 marked as "accepting" meaning they represent a complete word. Eg:
1277
1278   /he|she|his|hers/
1279
1280 would convert into the following structure. Numbers represent states, letters
1281 following numbers represent valid transitions on the letter from that state, if
1282 the number is in square brackets it represents an accepting state, otherwise it
1283 will be in parenthesis.
1284
1285       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1286       |    |
1287       |   (2)
1288       |    |
1289      (1)   +-i->(6)-+-s->[7]
1290       |
1291       +-s->(3)-+-h->(4)-+-e->[5]
1292
1293       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1294
1295 This shows that when matching against the string 'hers' we will begin at state 1
1296 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1297 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1298 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1299 single traverse. We store a mapping from accepting to state to which word was
1300 matched, and then when we have multiple possibilities we try to complete the
1301 rest of the regex in the order in which they occured in the alternation.
1302
1303 The only prior NFA like behaviour that would be changed by the TRIE support is
1304 the silent ignoring of duplicate alternations which are of the form:
1305
1306  / (DUPE|DUPE) X? (?{ ... }) Y /x
1307
1308 Thus EVAL blocks following a trie may be called a different number of times with
1309 and without the optimisation. With the optimisations dupes will be silently
1310 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1311 the following demonstrates:
1312
1313  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1314
1315 which prints out 'word' three times, but
1316
1317  'words'=~/(word|word|word)(?{ print $1 })S/
1318
1319 which doesnt print it out at all. This is due to other optimisations kicking in.
1320
1321 Example of what happens on a structural level:
1322
1323 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1324
1325    1: CURLYM[1] {1,32767}(18)
1326    5:   BRANCH(8)
1327    6:     EXACT <ac>(16)
1328    8:   BRANCH(11)
1329    9:     EXACT <ad>(16)
1330   11:   BRANCH(14)
1331   12:     EXACT <ab>(16)
1332   16:   SUCCEED(0)
1333   17:   NOTHING(18)
1334   18: END(0)
1335
1336 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1337 and should turn into:
1338
1339    1: CURLYM[1] {1,32767}(18)
1340    5:   TRIE(16)
1341         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1342           <ac>
1343           <ad>
1344           <ab>
1345   16:   SUCCEED(0)
1346   17:   NOTHING(18)
1347   18: END(0)
1348
1349 Cases where tail != last would be like /(?foo|bar)baz/:
1350
1351    1: BRANCH(4)
1352    2:   EXACT <foo>(8)
1353    4: BRANCH(7)
1354    5:   EXACT <bar>(8)
1355    7: TAIL(8)
1356    8: EXACT <baz>(10)
1357   10: END(0)
1358
1359 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1360 and would end up looking like:
1361
1362     1: TRIE(8)
1363       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1364         <foo>
1365         <bar>
1366    7: TAIL(8)
1367    8: EXACT <baz>(10)
1368   10: END(0)
1369
1370     d = uvuni_to_utf8_flags(d, uv, 0);
1371
1372 is the recommended Unicode-aware way of saying
1373
1374     *(d++) = uv;
1375 */
1376
1377 #define TRIE_STORE_REVCHAR(val)                                            \
1378     STMT_START {                                                           \
1379         if (UTF) {                                                         \
1380             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1381             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1382             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1383             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1384             SvPOK_on(zlopp);                                               \
1385             SvUTF8_on(zlopp);                                              \
1386             av_push(revcharmap, zlopp);                                    \
1387         } else {                                                           \
1388             char ooooff = (char)val;                                           \
1389             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1390         }                                                                  \
1391         } STMT_END
1392
1393 #define TRIE_READ_CHAR STMT_START {                                                     \
1394     wordlen++;                                                                          \
1395     if ( UTF ) {                                                                        \
1396         /* if it is UTF then it is either already folded, or does not need folding */   \
1397         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1398     }                                                                                   \
1399     else if (folder == PL_fold_latin1) {                                                \
1400         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1401         if ( foldlen > 0 ) {                                                            \
1402            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1403            foldlen -= len;                                                              \
1404            scan += len;                                                                 \
1405            len = 0;                                                                     \
1406         } else {                                                                        \
1407             len = 1;                                                                    \
1408             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1409             skiplen = UNISKIP(uvc);                                                     \
1410             foldlen -= skiplen;                                                         \
1411             scan = foldbuf + skiplen;                                                   \
1412         }                                                                               \
1413     } else {                                                                            \
1414         /* raw data, will be folded later if needed */                                  \
1415         uvc = (U32)*uc;                                                                 \
1416         len = 1;                                                                        \
1417     }                                                                                   \
1418 } STMT_END
1419
1420
1421
1422 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1423     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1424         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1425         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1426     }                                                           \
1427     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1428     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1429     TRIE_LIST_CUR( state )++;                                   \
1430 } STMT_END
1431
1432 #define TRIE_LIST_NEW(state) STMT_START {                       \
1433     Newxz( trie->states[ state ].trans.list,               \
1434         4, reg_trie_trans_le );                                 \
1435      TRIE_LIST_CUR( state ) = 1;                                \
1436      TRIE_LIST_LEN( state ) = 4;                                \
1437 } STMT_END
1438
1439 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1440     U16 dupe= trie->states[ state ].wordnum;                    \
1441     regnode * const noper_next = regnext( noper );              \
1442                                                                 \
1443     DEBUG_r({                                                   \
1444         /* store the word for dumping */                        \
1445         SV* tmp;                                                \
1446         if (OP(noper) != NOTHING)                               \
1447             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1448         else                                                    \
1449             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1450         av_push( trie_words, tmp );                             \
1451     });                                                         \
1452                                                                 \
1453     curword++;                                                  \
1454     trie->wordinfo[curword].prev   = 0;                         \
1455     trie->wordinfo[curword].len    = wordlen;                   \
1456     trie->wordinfo[curword].accept = state;                     \
1457                                                                 \
1458     if ( noper_next < tail ) {                                  \
1459         if (!trie->jump)                                        \
1460             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1461         trie->jump[curword] = (U16)(noper_next - convert);      \
1462         if (!jumper)                                            \
1463             jumper = noper_next;                                \
1464         if (!nextbranch)                                        \
1465             nextbranch= regnext(cur);                           \
1466     }                                                           \
1467                                                                 \
1468     if ( dupe ) {                                               \
1469         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1470         /* chain, so that when the bits of chain are later    */\
1471         /* linked together, the dups appear in the chain      */\
1472         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1473         trie->wordinfo[dupe].prev = curword;                    \
1474     } else {                                                    \
1475         /* we haven't inserted this word yet.                */ \
1476         trie->states[ state ].wordnum = curword;                \
1477     }                                                           \
1478 } STMT_END
1479
1480
1481 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1482      ( ( base + charid >=  ucharcount                                   \
1483          && base + charid < ubound                                      \
1484          && state == trie->trans[ base - ucharcount + charid ].check    \
1485          && trie->trans[ base - ucharcount + charid ].next )            \
1486            ? trie->trans[ base - ucharcount + charid ].next             \
1487            : ( state==1 ? special : 0 )                                 \
1488       )
1489
1490 #define MADE_TRIE       1
1491 #define MADE_JUMP_TRIE  2
1492 #define MADE_EXACT_TRIE 4
1493
1494 STATIC I32
1495 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1496 {
1497     dVAR;
1498     /* first pass, loop through and scan words */
1499     reg_trie_data *trie;
1500     HV *widecharmap = NULL;
1501     AV *revcharmap = newAV();
1502     regnode *cur;
1503     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1504     STRLEN len = 0;
1505     UV uvc = 0;
1506     U16 curword = 0;
1507     U32 next_alloc = 0;
1508     regnode *jumper = NULL;
1509     regnode *nextbranch = NULL;
1510     regnode *convert = NULL;
1511     U32 *prev_states; /* temp array mapping each state to previous one */
1512     /* we just use folder as a flag in utf8 */
1513     const U8 * folder = NULL;
1514
1515 #ifdef DEBUGGING
1516     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1517     AV *trie_words = NULL;
1518     /* along with revcharmap, this only used during construction but both are
1519      * useful during debugging so we store them in the struct when debugging.
1520      */
1521 #else
1522     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1523     STRLEN trie_charcount=0;
1524 #endif
1525     SV *re_trie_maxbuff;
1526     GET_RE_DEBUG_FLAGS_DECL;
1527
1528     PERL_ARGS_ASSERT_MAKE_TRIE;
1529 #ifndef DEBUGGING
1530     PERL_UNUSED_ARG(depth);
1531 #endif
1532
1533     switch (flags) {
1534         case EXACT: break;
1535         case EXACTFA:
1536         case EXACTFU_SS:
1537         case EXACTFU_TRICKYFOLD:
1538         case EXACTFU: folder = PL_fold_latin1; break;
1539         case EXACTF:  folder = PL_fold; break;
1540         case EXACTFL: folder = PL_fold_locale; break;
1541         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1542     }
1543
1544     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1545     trie->refcount = 1;
1546     trie->startstate = 1;
1547     trie->wordcount = word_count;
1548     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1549     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1550     if (flags == EXACT)
1551         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1552     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1553                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1554
1555     DEBUG_r({
1556         trie_words = newAV();
1557     });
1558
1559     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1560     if (!SvIOK(re_trie_maxbuff)) {
1561         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1562     }
1563     DEBUG_TRIE_COMPILE_r({
1564                 PerlIO_printf( Perl_debug_log,
1565                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1566                   (int)depth * 2 + 2, "",
1567                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1568                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1569                   (int)depth);
1570     });
1571
1572    /* Find the node we are going to overwrite */
1573     if ( first == startbranch && OP( last ) != BRANCH ) {
1574         /* whole branch chain */
1575         convert = first;
1576     } else {
1577         /* branch sub-chain */
1578         convert = NEXTOPER( first );
1579     }
1580
1581     /*  -- First loop and Setup --
1582
1583        We first traverse the branches and scan each word to determine if it
1584        contains widechars, and how many unique chars there are, this is
1585        important as we have to build a table with at least as many columns as we
1586        have unique chars.
1587
1588        We use an array of integers to represent the character codes 0..255
1589        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1590        native representation of the character value as the key and IV's for the
1591        coded index.
1592
1593        *TODO* If we keep track of how many times each character is used we can
1594        remap the columns so that the table compression later on is more
1595        efficient in terms of memory by ensuring the most common value is in the
1596        middle and the least common are on the outside.  IMO this would be better
1597        than a most to least common mapping as theres a decent chance the most
1598        common letter will share a node with the least common, meaning the node
1599        will not be compressible. With a middle is most common approach the worst
1600        case is when we have the least common nodes twice.
1601
1602      */
1603
1604     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1605         regnode *noper = NEXTOPER( cur );
1606         const U8 *uc = (U8*)STRING( noper );
1607         const U8 *e  = uc + STR_LEN( noper );
1608         STRLEN foldlen = 0;
1609         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1610         STRLEN skiplen = 0;
1611         const U8 *scan = (U8*)NULL;
1612         U32 wordlen      = 0;         /* required init */
1613         STRLEN chars = 0;
1614         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1615
1616         if (OP(noper) == NOTHING) {
1617             regnode *noper_next= regnext(noper);
1618             if (noper_next != tail && OP(noper_next) == flags) {
1619                 noper = noper_next;
1620                 uc= (U8*)STRING(noper);
1621                 e= uc + STR_LEN(noper);
1622                 trie->minlen= STR_LEN(noper);
1623             } else {
1624                 trie->minlen= 0;
1625                 continue;
1626             }
1627         }
1628
1629         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1630             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1631                                           regardless of encoding */
1632             if (OP( noper ) == EXACTFU_SS) {
1633                 /* false positives are ok, so just set this */
1634                 TRIE_BITMAP_SET(trie,0xDF);
1635             }
1636         }
1637         for ( ; uc < e ; uc += len ) {
1638             TRIE_CHARCOUNT(trie)++;
1639             TRIE_READ_CHAR;
1640             chars++;
1641             if ( uvc < 256 ) {
1642                 if ( folder ) {
1643                     U8 folded= folder[ (U8) uvc ];
1644                     if ( !trie->charmap[ folded ] ) {
1645                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1646                         TRIE_STORE_REVCHAR( folded );
1647                     }
1648                 }
1649                 if ( !trie->charmap[ uvc ] ) {
1650                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1651                     TRIE_STORE_REVCHAR( uvc );
1652                 }
1653                 if ( set_bit ) {
1654                     /* store the codepoint in the bitmap, and its folded
1655                      * equivalent. */
1656                     TRIE_BITMAP_SET(trie, uvc);
1657
1658                     /* store the folded codepoint */
1659                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1660
1661                     if ( !UTF ) {
1662                         /* store first byte of utf8 representation of
1663                            variant codepoints */
1664                         if (! UNI_IS_INVARIANT(uvc)) {
1665                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1666                         }
1667                     }
1668                     set_bit = 0; /* We've done our bit :-) */
1669                 }
1670             } else {
1671                 SV** svpp;
1672                 if ( !widecharmap )
1673                     widecharmap = newHV();
1674
1675                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1676
1677                 if ( !svpp )
1678                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1679
1680                 if ( !SvTRUE( *svpp ) ) {
1681                     sv_setiv( *svpp, ++trie->uniquecharcount );
1682                     TRIE_STORE_REVCHAR(uvc);
1683                 }
1684             }
1685         }
1686         if( cur == first ) {
1687             trie->minlen = chars;
1688             trie->maxlen = chars;
1689         } else if (chars < trie->minlen) {
1690             trie->minlen = chars;
1691         } else if (chars > trie->maxlen) {
1692             trie->maxlen = chars;
1693         }
1694         if (OP( noper ) == EXACTFU_SS) {
1695             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1696             if (trie->minlen > 1)
1697                 trie->minlen= 1;
1698         }
1699         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1700             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1701              *                - We assume that any such sequence might match a 2 byte string */
1702             if (trie->minlen > 2 )
1703                 trie->minlen= 2;
1704         }
1705
1706     } /* end first pass */
1707     DEBUG_TRIE_COMPILE_r(
1708         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1709                 (int)depth * 2 + 2,"",
1710                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1711                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1712                 (int)trie->minlen, (int)trie->maxlen )
1713     );
1714
1715     /*
1716         We now know what we are dealing with in terms of unique chars and
1717         string sizes so we can calculate how much memory a naive
1718         representation using a flat table  will take. If it's over a reasonable
1719         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1720         conservative but potentially much slower representation using an array
1721         of lists.
1722
1723         At the end we convert both representations into the same compressed
1724         form that will be used in regexec.c for matching with. The latter
1725         is a form that cannot be used to construct with but has memory
1726         properties similar to the list form and access properties similar
1727         to the table form making it both suitable for fast searches and
1728         small enough that its feasable to store for the duration of a program.
1729
1730         See the comment in the code where the compressed table is produced
1731         inplace from the flat tabe representation for an explanation of how
1732         the compression works.
1733
1734     */
1735
1736
1737     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1738     prev_states[1] = 0;
1739
1740     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1741         /*
1742             Second Pass -- Array Of Lists Representation
1743
1744             Each state will be represented by a list of charid:state records
1745             (reg_trie_trans_le) the first such element holds the CUR and LEN
1746             points of the allocated array. (See defines above).
1747
1748             We build the initial structure using the lists, and then convert
1749             it into the compressed table form which allows faster lookups
1750             (but cant be modified once converted).
1751         */
1752
1753         STRLEN transcount = 1;
1754
1755         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1756             "%*sCompiling trie using list compiler\n",
1757             (int)depth * 2 + 2, ""));
1758
1759         trie->states = (reg_trie_state *)
1760             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1761                                   sizeof(reg_trie_state) );
1762         TRIE_LIST_NEW(1);
1763         next_alloc = 2;
1764
1765         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1766
1767             regnode *noper   = NEXTOPER( cur );
1768             U8 *uc           = (U8*)STRING( noper );
1769             const U8 *e      = uc + STR_LEN( noper );
1770             U32 state        = 1;         /* required init */
1771             U16 charid       = 0;         /* sanity init */
1772             U8 *scan         = (U8*)NULL; /* sanity init */
1773             STRLEN foldlen   = 0;         /* required init */
1774             U32 wordlen      = 0;         /* required init */
1775             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1776             STRLEN skiplen   = 0;
1777
1778             if (OP(noper) == NOTHING) {
1779                 regnode *noper_next= regnext(noper);
1780                 if (noper_next != tail && OP(noper_next) == flags) {
1781                     noper = noper_next;
1782                     uc= (U8*)STRING(noper);
1783                     e= uc + STR_LEN(noper);
1784                 }
1785             }
1786
1787             if (OP(noper) != NOTHING) {
1788                 for ( ; uc < e ; uc += len ) {
1789
1790                     TRIE_READ_CHAR;
1791
1792                     if ( uvc < 256 ) {
1793                         charid = trie->charmap[ uvc ];
1794                     } else {
1795                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1796                         if ( !svpp ) {
1797                             charid = 0;
1798                         } else {
1799                             charid=(U16)SvIV( *svpp );
1800                         }
1801                     }
1802                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1803                     if ( charid ) {
1804
1805                         U16 check;
1806                         U32 newstate = 0;
1807
1808                         charid--;
1809                         if ( !trie->states[ state ].trans.list ) {
1810                             TRIE_LIST_NEW( state );
1811                         }
1812                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1813                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1814                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1815                                 break;
1816                             }
1817                         }
1818                         if ( ! newstate ) {
1819                             newstate = next_alloc++;
1820                             prev_states[newstate] = state;
1821                             TRIE_LIST_PUSH( state, charid, newstate );
1822                             transcount++;
1823                         }
1824                         state = newstate;
1825                     } else {
1826                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1827                     }
1828                 }
1829             }
1830             TRIE_HANDLE_WORD(state);
1831
1832         } /* end second pass */
1833
1834         /* next alloc is the NEXT state to be allocated */
1835         trie->statecount = next_alloc;
1836         trie->states = (reg_trie_state *)
1837             PerlMemShared_realloc( trie->states,
1838                                    next_alloc
1839                                    * sizeof(reg_trie_state) );
1840
1841         /* and now dump it out before we compress it */
1842         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1843                                                          revcharmap, next_alloc,
1844                                                          depth+1)
1845         );
1846
1847         trie->trans = (reg_trie_trans *)
1848             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1849         {
1850             U32 state;
1851             U32 tp = 0;
1852             U32 zp = 0;
1853
1854
1855             for( state=1 ; state < next_alloc ; state ++ ) {
1856                 U32 base=0;
1857
1858                 /*
1859                 DEBUG_TRIE_COMPILE_MORE_r(
1860                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1861                 );
1862                 */
1863
1864                 if (trie->states[state].trans.list) {
1865                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1866                     U16 maxid=minid;
1867                     U16 idx;
1868
1869                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1870                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1871                         if ( forid < minid ) {
1872                             minid=forid;
1873                         } else if ( forid > maxid ) {
1874                             maxid=forid;
1875                         }
1876                     }
1877                     if ( transcount < tp + maxid - minid + 1) {
1878                         transcount *= 2;
1879                         trie->trans = (reg_trie_trans *)
1880                             PerlMemShared_realloc( trie->trans,
1881                                                      transcount
1882                                                      * sizeof(reg_trie_trans) );
1883                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1884                     }
1885                     base = trie->uniquecharcount + tp - minid;
1886                     if ( maxid == minid ) {
1887                         U32 set = 0;
1888                         for ( ; zp < tp ; zp++ ) {
1889                             if ( ! trie->trans[ zp ].next ) {
1890                                 base = trie->uniquecharcount + zp - minid;
1891                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1892                                 trie->trans[ zp ].check = state;
1893                                 set = 1;
1894                                 break;
1895                             }
1896                         }
1897                         if ( !set ) {
1898                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1899                             trie->trans[ tp ].check = state;
1900                             tp++;
1901                             zp = tp;
1902                         }
1903                     } else {
1904                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1905                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1906                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1907                             trie->trans[ tid ].check = state;
1908                         }
1909                         tp += ( maxid - minid + 1 );
1910                     }
1911                     Safefree(trie->states[ state ].trans.list);
1912                 }
1913                 /*
1914                 DEBUG_TRIE_COMPILE_MORE_r(
1915                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1916                 );
1917                 */
1918                 trie->states[ state ].trans.base=base;
1919             }
1920             trie->lasttrans = tp + 1;
1921         }
1922     } else {
1923         /*
1924            Second Pass -- Flat Table Representation.
1925
1926            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1927            We know that we will need Charcount+1 trans at most to store the data
1928            (one row per char at worst case) So we preallocate both structures
1929            assuming worst case.
1930
1931            We then construct the trie using only the .next slots of the entry
1932            structs.
1933
1934            We use the .check field of the first entry of the node temporarily to
1935            make compression both faster and easier by keeping track of how many non
1936            zero fields are in the node.
1937
1938            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1939            transition.
1940
1941            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1942            number representing the first entry of the node, and state as a
1943            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1944            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1945            are 2 entrys per node. eg:
1946
1947              A B       A B
1948           1. 2 4    1. 3 7
1949           2. 0 3    3. 0 5
1950           3. 0 0    5. 0 0
1951           4. 0 0    7. 0 0
1952
1953            The table is internally in the right hand, idx form. However as we also
1954            have to deal with the states array which is indexed by nodenum we have to
1955            use TRIE_NODENUM() to convert.
1956
1957         */
1958         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1959             "%*sCompiling trie using table compiler\n",
1960             (int)depth * 2 + 2, ""));
1961
1962         trie->trans = (reg_trie_trans *)
1963             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1964                                   * trie->uniquecharcount + 1,
1965                                   sizeof(reg_trie_trans) );
1966         trie->states = (reg_trie_state *)
1967             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1968                                   sizeof(reg_trie_state) );
1969         next_alloc = trie->uniquecharcount + 1;
1970
1971
1972         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1973
1974             regnode *noper   = NEXTOPER( cur );
1975             const U8 *uc     = (U8*)STRING( noper );
1976             const U8 *e      = uc + STR_LEN( noper );
1977
1978             U32 state        = 1;         /* required init */
1979
1980             U16 charid       = 0;         /* sanity init */
1981             U32 accept_state = 0;         /* sanity init */
1982             U8 *scan         = (U8*)NULL; /* sanity init */
1983
1984             STRLEN foldlen   = 0;         /* required init */
1985             U32 wordlen      = 0;         /* required init */
1986             STRLEN skiplen   = 0;
1987             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1988
1989             if (OP(noper) == NOTHING) {
1990                 regnode *noper_next= regnext(noper);
1991                 if (noper_next != tail && OP(noper_next) == flags) {
1992                     noper = noper_next;
1993                     uc= (U8*)STRING(noper);
1994                     e= uc + STR_LEN(noper);
1995                 }
1996             }
1997
1998             if ( OP(noper) != NOTHING ) {
1999                 for ( ; uc < e ; uc += len ) {
2000
2001                     TRIE_READ_CHAR;
2002
2003                     if ( uvc < 256 ) {
2004                         charid = trie->charmap[ uvc ];
2005                     } else {
2006                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2007                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2008                     }
2009                     if ( charid ) {
2010                         charid--;
2011                         if ( !trie->trans[ state + charid ].next ) {
2012                             trie->trans[ state + charid ].next = next_alloc;
2013                             trie->trans[ state ].check++;
2014                             prev_states[TRIE_NODENUM(next_alloc)]
2015                                     = TRIE_NODENUM(state);
2016                             next_alloc += trie->uniquecharcount;
2017                         }
2018                         state = trie->trans[ state + charid ].next;
2019                     } else {
2020                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2021                     }
2022                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2023                 }
2024             }
2025             accept_state = TRIE_NODENUM( state );
2026             TRIE_HANDLE_WORD(accept_state);
2027
2028         } /* end second pass */
2029
2030         /* and now dump it out before we compress it */
2031         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2032                                                           revcharmap,
2033                                                           next_alloc, depth+1));
2034
2035         {
2036         /*
2037            * Inplace compress the table.*
2038
2039            For sparse data sets the table constructed by the trie algorithm will
2040            be mostly 0/FAIL transitions or to put it another way mostly empty.
2041            (Note that leaf nodes will not contain any transitions.)
2042
2043            This algorithm compresses the tables by eliminating most such
2044            transitions, at the cost of a modest bit of extra work during lookup:
2045
2046            - Each states[] entry contains a .base field which indicates the
2047            index in the state[] array wheres its transition data is stored.
2048
2049            - If .base is 0 there are no valid transitions from that node.
2050
2051            - If .base is nonzero then charid is added to it to find an entry in
2052            the trans array.
2053
2054            -If trans[states[state].base+charid].check!=state then the
2055            transition is taken to be a 0/Fail transition. Thus if there are fail
2056            transitions at the front of the node then the .base offset will point
2057            somewhere inside the previous nodes data (or maybe even into a node
2058            even earlier), but the .check field determines if the transition is
2059            valid.
2060
2061            XXX - wrong maybe?
2062            The following process inplace converts the table to the compressed
2063            table: We first do not compress the root node 1,and mark all its
2064            .check pointers as 1 and set its .base pointer as 1 as well. This
2065            allows us to do a DFA construction from the compressed table later,
2066            and ensures that any .base pointers we calculate later are greater
2067            than 0.
2068
2069            - We set 'pos' to indicate the first entry of the second node.
2070
2071            - We then iterate over the columns of the node, finding the first and
2072            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2073            and set the .check pointers accordingly, and advance pos
2074            appropriately and repreat for the next node. Note that when we copy
2075            the next pointers we have to convert them from the original
2076            NODEIDX form to NODENUM form as the former is not valid post
2077            compression.
2078
2079            - If a node has no transitions used we mark its base as 0 and do not
2080            advance the pos pointer.
2081
2082            - If a node only has one transition we use a second pointer into the
2083            structure to fill in allocated fail transitions from other states.
2084            This pointer is independent of the main pointer and scans forward
2085            looking for null transitions that are allocated to a state. When it
2086            finds one it writes the single transition into the "hole".  If the
2087            pointer doesnt find one the single transition is appended as normal.
2088
2089            - Once compressed we can Renew/realloc the structures to release the
2090            excess space.
2091
2092            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2093            specifically Fig 3.47 and the associated pseudocode.
2094
2095            demq
2096         */
2097         const U32 laststate = TRIE_NODENUM( next_alloc );
2098         U32 state, charid;
2099         U32 pos = 0, zp=0;
2100         trie->statecount = laststate;
2101
2102         for ( state = 1 ; state < laststate ; state++ ) {
2103             U8 flag = 0;
2104             const U32 stateidx = TRIE_NODEIDX( state );
2105             const U32 o_used = trie->trans[ stateidx ].check;
2106             U32 used = trie->trans[ stateidx ].check;
2107             trie->trans[ stateidx ].check = 0;
2108
2109             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2110                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2111                     if ( trie->trans[ stateidx + charid ].next ) {
2112                         if (o_used == 1) {
2113                             for ( ; zp < pos ; zp++ ) {
2114                                 if ( ! trie->trans[ zp ].next ) {
2115                                     break;
2116                                 }
2117                             }
2118                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2119                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2120                             trie->trans[ zp ].check = state;
2121                             if ( ++zp > pos ) pos = zp;
2122                             break;
2123                         }
2124                         used--;
2125                     }
2126                     if ( !flag ) {
2127                         flag = 1;
2128                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2129                     }
2130                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2131                     trie->trans[ pos ].check = state;
2132                     pos++;
2133                 }
2134             }
2135         }
2136         trie->lasttrans = pos + 1;
2137         trie->states = (reg_trie_state *)
2138             PerlMemShared_realloc( trie->states, laststate
2139                                    * sizeof(reg_trie_state) );
2140         DEBUG_TRIE_COMPILE_MORE_r(
2141                 PerlIO_printf( Perl_debug_log,
2142                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2143                     (int)depth * 2 + 2,"",
2144                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2145                     (IV)next_alloc,
2146                     (IV)pos,
2147                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2148             );
2149
2150         } /* end table compress */
2151     }
2152     DEBUG_TRIE_COMPILE_MORE_r(
2153             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2154                 (int)depth * 2 + 2, "",
2155                 (UV)trie->statecount,
2156                 (UV)trie->lasttrans)
2157     );
2158     /* resize the trans array to remove unused space */
2159     trie->trans = (reg_trie_trans *)
2160         PerlMemShared_realloc( trie->trans, trie->lasttrans
2161                                * sizeof(reg_trie_trans) );
2162
2163     {   /* Modify the program and insert the new TRIE node */
2164         U8 nodetype =(U8)(flags & 0xFF);
2165         char *str=NULL;
2166
2167 #ifdef DEBUGGING
2168         regnode *optimize = NULL;
2169 #ifdef RE_TRACK_PATTERN_OFFSETS
2170
2171         U32 mjd_offset = 0;
2172         U32 mjd_nodelen = 0;
2173 #endif /* RE_TRACK_PATTERN_OFFSETS */
2174 #endif /* DEBUGGING */
2175         /*
2176            This means we convert either the first branch or the first Exact,
2177            depending on whether the thing following (in 'last') is a branch
2178            or not and whther first is the startbranch (ie is it a sub part of
2179            the alternation or is it the whole thing.)
2180            Assuming its a sub part we convert the EXACT otherwise we convert
2181            the whole branch sequence, including the first.
2182          */
2183         /* Find the node we are going to overwrite */
2184         if ( first != startbranch || OP( last ) == BRANCH ) {
2185             /* branch sub-chain */
2186             NEXT_OFF( first ) = (U16)(last - first);
2187 #ifdef RE_TRACK_PATTERN_OFFSETS
2188             DEBUG_r({
2189                 mjd_offset= Node_Offset((convert));
2190                 mjd_nodelen= Node_Length((convert));
2191             });
2192 #endif
2193             /* whole branch chain */
2194         }
2195 #ifdef RE_TRACK_PATTERN_OFFSETS
2196         else {
2197             DEBUG_r({
2198                 const  regnode *nop = NEXTOPER( convert );
2199                 mjd_offset= Node_Offset((nop));
2200                 mjd_nodelen= Node_Length((nop));
2201             });
2202         }
2203         DEBUG_OPTIMISE_r(
2204             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2205                 (int)depth * 2 + 2, "",
2206                 (UV)mjd_offset, (UV)mjd_nodelen)
2207         );
2208 #endif
2209         /* But first we check to see if there is a common prefix we can
2210            split out as an EXACT and put in front of the TRIE node.  */
2211         trie->startstate= 1;
2212         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2213             U32 state;
2214             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2215                 U32 ofs = 0;
2216                 I32 idx = -1;
2217                 U32 count = 0;
2218                 const U32 base = trie->states[ state ].trans.base;
2219
2220                 if ( trie->states[state].wordnum )
2221                         count = 1;
2222
2223                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2224                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2225                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2226                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2227                     {
2228                         if ( ++count > 1 ) {
2229                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2230                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2231                             if ( state == 1 ) break;
2232                             if ( count == 2 ) {
2233                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2234                                 DEBUG_OPTIMISE_r(
2235                                     PerlIO_printf(Perl_debug_log,
2236                                         "%*sNew Start State=%"UVuf" Class: [",
2237                                         (int)depth * 2 + 2, "",
2238                                         (UV)state));
2239                                 if (idx >= 0) {
2240                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2241                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2242
2243                                     TRIE_BITMAP_SET(trie,*ch);
2244                                     if ( folder )
2245                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2246                                     DEBUG_OPTIMISE_r(
2247                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2248                                     );
2249                                 }
2250                             }
2251                             TRIE_BITMAP_SET(trie,*ch);
2252                             if ( folder )
2253                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2254                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2255                         }
2256                         idx = ofs;
2257                     }
2258                 }
2259                 if ( count == 1 ) {
2260                     SV **tmp = av_fetch( revcharmap, idx, 0);
2261                     STRLEN len;
2262                     char *ch = SvPV( *tmp, len );
2263                     DEBUG_OPTIMISE_r({
2264                         SV *sv=sv_newmortal();
2265                         PerlIO_printf( Perl_debug_log,
2266                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2267                             (int)depth * 2 + 2, "",
2268                             (UV)state, (UV)idx,
2269                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2270                                 PL_colors[0], PL_colors[1],
2271                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2272                                 PERL_PV_ESCAPE_FIRSTCHAR
2273                             )
2274                         );
2275                     });
2276                     if ( state==1 ) {
2277                         OP( convert ) = nodetype;
2278                         str=STRING(convert);
2279                         STR_LEN(convert)=0;
2280                     }
2281                     STR_LEN(convert) += len;
2282                     while (len--)
2283                         *str++ = *ch++;
2284                 } else {
2285 #ifdef DEBUGGING
2286                     if (state>1)
2287                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2288 #endif
2289                     break;
2290                 }
2291             }
2292             trie->prefixlen = (state-1);
2293             if (str) {
2294                 regnode *n = convert+NODE_SZ_STR(convert);
2295                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2296                 trie->startstate = state;
2297                 trie->minlen -= (state - 1);
2298                 trie->maxlen -= (state - 1);
2299 #ifdef DEBUGGING
2300                /* At least the UNICOS C compiler choked on this
2301                 * being argument to DEBUG_r(), so let's just have
2302                 * it right here. */
2303                if (
2304 #ifdef PERL_EXT_RE_BUILD
2305                    1
2306 #else
2307                    DEBUG_r_TEST
2308 #endif
2309                    ) {
2310                    regnode *fix = convert;
2311                    U32 word = trie->wordcount;
2312                    mjd_nodelen++;
2313                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2314                    while( ++fix < n ) {
2315                        Set_Node_Offset_Length(fix, 0, 0);
2316                    }
2317                    while (word--) {
2318                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2319                        if (tmp) {
2320                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2321                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2322                            else
2323                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2324                        }
2325                    }
2326                }
2327 #endif
2328                 if (trie->maxlen) {
2329                     convert = n;
2330                 } else {
2331                     NEXT_OFF(convert) = (U16)(tail - convert);
2332                     DEBUG_r(optimize= n);
2333                 }
2334             }
2335         }
2336         if (!jumper)
2337             jumper = last;
2338         if ( trie->maxlen ) {
2339             NEXT_OFF( convert ) = (U16)(tail - convert);
2340             ARG_SET( convert, data_slot );
2341             /* Store the offset to the first unabsorbed branch in
2342                jump[0], which is otherwise unused by the jump logic.
2343                We use this when dumping a trie and during optimisation. */
2344             if (trie->jump)
2345                 trie->jump[0] = (U16)(nextbranch - convert);
2346
2347             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2348              *   and there is a bitmap
2349              *   and the first "jump target" node we found leaves enough room
2350              * then convert the TRIE node into a TRIEC node, with the bitmap
2351              * embedded inline in the opcode - this is hypothetically faster.
2352              */
2353             if ( !trie->states[trie->startstate].wordnum
2354                  && trie->bitmap
2355                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2356             {
2357                 OP( convert ) = TRIEC;
2358                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2359                 PerlMemShared_free(trie->bitmap);
2360                 trie->bitmap= NULL;
2361             } else
2362                 OP( convert ) = TRIE;
2363
2364             /* store the type in the flags */
2365             convert->flags = nodetype;
2366             DEBUG_r({
2367             optimize = convert
2368                       + NODE_STEP_REGNODE
2369                       + regarglen[ OP( convert ) ];
2370             });
2371             /* XXX We really should free up the resource in trie now,
2372                    as we won't use them - (which resources?) dmq */
2373         }
2374         /* needed for dumping*/
2375         DEBUG_r(if (optimize) {
2376             regnode *opt = convert;
2377
2378             while ( ++opt < optimize) {
2379                 Set_Node_Offset_Length(opt,0,0);
2380             }
2381             /*
2382                 Try to clean up some of the debris left after the
2383                 optimisation.
2384              */
2385             while( optimize < jumper ) {
2386                 mjd_nodelen += Node_Length((optimize));
2387                 OP( optimize ) = OPTIMIZED;
2388                 Set_Node_Offset_Length(optimize,0,0);
2389                 optimize++;
2390             }
2391             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2392         });
2393     } /* end node insert */
2394
2395     /*  Finish populating the prev field of the wordinfo array.  Walk back
2396      *  from each accept state until we find another accept state, and if
2397      *  so, point the first word's .prev field at the second word. If the
2398      *  second already has a .prev field set, stop now. This will be the
2399      *  case either if we've already processed that word's accept state,
2400      *  or that state had multiple words, and the overspill words were
2401      *  already linked up earlier.
2402      */
2403     {
2404         U16 word;
2405         U32 state;
2406         U16 prev;
2407
2408         for (word=1; word <= trie->wordcount; word++) {
2409             prev = 0;
2410             if (trie->wordinfo[word].prev)
2411                 continue;
2412             state = trie->wordinfo[word].accept;
2413             while (state) {
2414                 state = prev_states[state];
2415                 if (!state)
2416                     break;
2417                 prev = trie->states[state].wordnum;
2418                 if (prev)
2419                     break;
2420             }
2421             trie->wordinfo[word].prev = prev;
2422         }
2423         Safefree(prev_states);
2424     }
2425
2426
2427     /* and now dump out the compressed format */
2428     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2429
2430     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2431 #ifdef DEBUGGING
2432     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2433     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2434 #else
2435     SvREFCNT_dec(revcharmap);
2436 #endif
2437     return trie->jump
2438            ? MADE_JUMP_TRIE
2439            : trie->startstate>1
2440              ? MADE_EXACT_TRIE
2441              : MADE_TRIE;
2442 }
2443
2444 STATIC void
2445 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2446 {
2447 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2448
2449    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2450    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2451    ISBN 0-201-10088-6
2452
2453    We find the fail state for each state in the trie, this state is the longest proper
2454    suffix of the current state's 'word' that is also a proper prefix of another word in our
2455    trie. State 1 represents the word '' and is thus the default fail state. This allows
2456    the DFA not to have to restart after its tried and failed a word at a given point, it
2457    simply continues as though it had been matching the other word in the first place.
2458    Consider
2459       'abcdgu'=~/abcdefg|cdgu/
2460    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2461    fail, which would bring us to the state representing 'd' in the second word where we would
2462    try 'g' and succeed, proceeding to match 'cdgu'.
2463  */
2464  /* add a fail transition */
2465     const U32 trie_offset = ARG(source);
2466     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2467     U32 *q;
2468     const U32 ucharcount = trie->uniquecharcount;
2469     const U32 numstates = trie->statecount;
2470     const U32 ubound = trie->lasttrans + ucharcount;
2471     U32 q_read = 0;
2472     U32 q_write = 0;
2473     U32 charid;
2474     U32 base = trie->states[ 1 ].trans.base;
2475     U32 *fail;
2476     reg_ac_data *aho;
2477     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2478     GET_RE_DEBUG_FLAGS_DECL;
2479
2480     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2481 #ifndef DEBUGGING
2482     PERL_UNUSED_ARG(depth);
2483 #endif
2484
2485
2486     ARG_SET( stclass, data_slot );
2487     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2488     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2489     aho->trie=trie_offset;
2490     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2491     Copy( trie->states, aho->states, numstates, reg_trie_state );
2492     Newxz( q, numstates, U32);
2493     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2494     aho->refcount = 1;
2495     fail = aho->fail;
2496     /* initialize fail[0..1] to be 1 so that we always have
2497        a valid final fail state */
2498     fail[ 0 ] = fail[ 1 ] = 1;
2499
2500     for ( charid = 0; charid < ucharcount ; charid++ ) {
2501         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2502         if ( newstate ) {
2503             q[ q_write ] = newstate;
2504             /* set to point at the root */
2505             fail[ q[ q_write++ ] ]=1;
2506         }
2507     }
2508     while ( q_read < q_write) {
2509         const U32 cur = q[ q_read++ % numstates ];
2510         base = trie->states[ cur ].trans.base;
2511
2512         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2513             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2514             if (ch_state) {
2515                 U32 fail_state = cur;
2516                 U32 fail_base;
2517                 do {
2518                     fail_state = fail[ fail_state ];
2519                     fail_base = aho->states[ fail_state ].trans.base;
2520                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2521
2522                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2523                 fail[ ch_state ] = fail_state;
2524                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2525                 {
2526                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2527                 }
2528                 q[ q_write++ % numstates] = ch_state;
2529             }
2530         }
2531     }
2532     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2533        when we fail in state 1, this allows us to use the
2534        charclass scan to find a valid start char. This is based on the principle
2535        that theres a good chance the string being searched contains lots of stuff
2536        that cant be a start char.
2537      */
2538     fail[ 0 ] = fail[ 1 ] = 0;
2539     DEBUG_TRIE_COMPILE_r({
2540         PerlIO_printf(Perl_debug_log,
2541                       "%*sStclass Failtable (%"UVuf" states): 0",
2542                       (int)(depth * 2), "", (UV)numstates
2543         );
2544         for( q_read=1; q_read<numstates; q_read++ ) {
2545             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2546         }
2547         PerlIO_printf(Perl_debug_log, "\n");
2548     });
2549     Safefree(q);
2550     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2551 }
2552
2553
2554 /*
2555  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2556  * These need to be revisited when a newer toolchain becomes available.
2557  */
2558 #if defined(__sparc64__) && defined(__GNUC__)
2559 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2560 #       undef  SPARC64_GCC_WORKAROUND
2561 #       define SPARC64_GCC_WORKAROUND 1
2562 #   endif
2563 #endif
2564
2565 #define DEBUG_PEEP(str,scan,depth) \
2566     DEBUG_OPTIMISE_r({if (scan){ \
2567        SV * const mysv=sv_newmortal(); \
2568        regnode *Next = regnext(scan); \
2569        regprop(RExC_rx, mysv, scan); \
2570        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2571        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2572        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2573    }});
2574
2575
2576 /* The below joins as many adjacent EXACTish nodes as possible into a single
2577  * one, and looks for problematic sequences of characters whose folds vs.
2578  * non-folds have sufficiently different lengths, that the optimizer would be
2579  * fooled into rejecting legitimate matches of them, and the trie construction
2580  * code can't cope with them.  The joining is only done if:
2581  * 1) there is room in the current conglomerated node to entirely contain the
2582  *    next one.
2583  * 2) they are the exact same node type
2584  *
2585  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2586  * these get optimized out
2587  *
2588  * If there are problematic code sequences, *min_subtract is set to the delta
2589  * that the minimum size of the node can be less than its actual size.  And,
2590  * the node type of the result is changed to reflect that it contains these
2591  * sequences.
2592  *
2593  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2594  * and contains LATIN SMALL LETTER SHARP S
2595  *
2596  * This is as good a place as any to discuss the design of handling these
2597  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2598  * are three code points in Unicode whose folded lengths differ so much from
2599  * the un-folded lengths that it causes problems for the optimizer and trie
2600  * construction.  Why only these are problematic, and not others where lengths
2601  * also differ is something I (khw) do not understand.  New versions of Unicode
2602  * might add more such code points.  Hopefully the logic in fold_grind.t that
2603  * figures out what to test (in part by verifying that each size-combination
2604  * gets tested) will catch any that do come along, so they can be added to the
2605  * special handling below.  The chances of new ones are actually rather small,
2606  * as most, if not all, of the world's scripts that have casefolding have
2607  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2608  * made to allow compatibility with pre-existing standards, and almost all of
2609  * those have already been dealt with.  These would otherwise be the most
2610  * likely candidates for generating further tricky sequences.  In other words,
2611  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2612  * with pre-existing standards, and there aren't many of those left.
2613  *
2614  * The previous designs for dealing with these involved assigning a special
2615  * node for them.  This approach doesn't work, as evidenced by this example:
2616  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2617  * Both these fold to "sss", but if the pattern is parsed to create a node of
2618  * that would match just the \xDF, it won't be able to handle the case where a
2619  * successful match would have to cross the node's boundary.  The new approach
2620  * that hopefully generally solves the problem generates an EXACTFU_SS node
2621  * that is "sss".
2622  *
2623  * There are a number of components to the approach (a lot of work for just
2624  * three code points!):
2625  * 1)   This routine examines each EXACTFish node that could contain the
2626  *      problematic sequences.  It returns in *min_subtract how much to
2627  *      subtract from the the actual length of the string to get a real minimum
2628  *      for one that could match it.  This number is usually 0 except for the
2629  *      problematic sequences.  This delta is used by the caller to adjust the
2630  *      min length of the match, and the delta between min and max, so that the
2631  *      optimizer doesn't reject these possibilities based on size constraints.
2632  * 2)   These sequences are not currently correctly handled by the trie code
2633  *      either, so it changes the joined node type to ops that are not handled
2634  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2635  * 3)   This is sufficient for the two Greek sequences (described below), but
2636  *      the one involving the Sharp s (\xDF) needs more.  The node type
2637  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2638  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2639  *      case where there is a possible fold length change.  That means that a
2640  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2641  *      itself with length changes, and so can be processed faster.  regexec.c
2642  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2643  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2644  *      However, probably mostly for historical reasons, the pre-folding isn't
2645  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2646  *      nodes, as what they fold to isn't known until runtime.)  The fold
2647  *      possibilities for the non-UTF8 patterns are quite simple, except for
2648  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2649  *      are members of a fold-pair, and arrays are set up for all of them
2650  *      that quickly find the other member of the pair.  It might actually
2651  *      be faster to pre-fold these, but it isn't currently done, except for
2652  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2653  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2654  *      issues described in the next item.
2655  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2656  *      'ss' or not is not knowable at compile time.  It will match iff the
2657  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2658  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2659  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2660  *      described in item 3).  An assumption that the optimizer part of
2661  *      regexec.c (probably unwittingly) makes is that a character in the
2662  *      pattern corresponds to at most a single character in the target string.
2663  *      (And I do mean character, and not byte here, unlike other parts of the
2664  *      documentation that have never been updated to account for multibyte
2665  *      Unicode.)  This assumption is wrong only in this case, as all other
2666  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2667  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2668  *      reluctant to try to change this assumption, so instead the code punts.
2669  *      This routine examines EXACTF nodes for the sharp s, and returns a
2670  *      boolean indicating whether or not the node is an EXACTF node that
2671  *      contains a sharp s.  When it is true, the caller sets a flag that later
2672  *      causes the optimizer in this file to not set values for the floating
2673  *      and fixed string lengths, and thus avoids the optimizer code in
2674  *      regexec.c that makes the invalid assumption.  Thus, there is no
2675  *      optimization based on string lengths for EXACTF nodes that contain the
2676  *      sharp s.  This only happens for /id rules (which means the pattern
2677  *      isn't in UTF-8).
2678  */
2679
2680 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2681     if (PL_regkind[OP(scan)] == EXACT) \
2682         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2683
2684 STATIC U32
2685 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2686     /* Merge several consecutive EXACTish nodes into one. */
2687     regnode *n = regnext(scan);
2688     U32 stringok = 1;
2689     regnode *next = scan + NODE_SZ_STR(scan);
2690     U32 merged = 0;
2691     U32 stopnow = 0;
2692 #ifdef DEBUGGING
2693     regnode *stop = scan;
2694     GET_RE_DEBUG_FLAGS_DECL;
2695 #else
2696     PERL_UNUSED_ARG(depth);
2697 #endif
2698
2699     PERL_ARGS_ASSERT_JOIN_EXACT;
2700 #ifndef EXPERIMENTAL_INPLACESCAN
2701     PERL_UNUSED_ARG(flags);
2702     PERL_UNUSED_ARG(val);
2703 #endif
2704     DEBUG_PEEP("join",scan,depth);
2705
2706     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2707      * EXACT ones that are mergeable to the current one. */
2708     while (n
2709            && (PL_regkind[OP(n)] == NOTHING
2710                || (stringok && OP(n) == OP(scan)))
2711            && NEXT_OFF(n)
2712            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2713     {
2714
2715         if (OP(n) == TAIL || n > next)
2716             stringok = 0;
2717         if (PL_regkind[OP(n)] == NOTHING) {
2718             DEBUG_PEEP("skip:",n,depth);
2719             NEXT_OFF(scan) += NEXT_OFF(n);
2720             next = n + NODE_STEP_REGNODE;
2721 #ifdef DEBUGGING
2722             if (stringok)
2723                 stop = n;
2724 #endif
2725             n = regnext(n);
2726         }
2727         else if (stringok) {
2728             const unsigned int oldl = STR_LEN(scan);
2729             regnode * const nnext = regnext(n);
2730
2731             if (oldl + STR_LEN(n) > U8_MAX)
2732                 break;
2733
2734             DEBUG_PEEP("merg",n,depth);
2735             merged++;
2736
2737             NEXT_OFF(scan) += NEXT_OFF(n);
2738             STR_LEN(scan) += STR_LEN(n);
2739             next = n + NODE_SZ_STR(n);
2740             /* Now we can overwrite *n : */
2741             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2742 #ifdef DEBUGGING
2743             stop = next - 1;
2744 #endif
2745             n = nnext;
2746             if (stopnow) break;
2747         }
2748
2749 #ifdef EXPERIMENTAL_INPLACESCAN
2750         if (flags && !NEXT_OFF(n)) {
2751             DEBUG_PEEP("atch", val, depth);
2752             if (reg_off_by_arg[OP(n)]) {
2753                 ARG_SET(n, val - n);
2754             }
2755             else {
2756                 NEXT_OFF(n) = val - n;
2757             }
2758             stopnow = 1;
2759         }
2760 #endif
2761     }
2762
2763     *min_subtract = 0;
2764     *has_exactf_sharp_s = FALSE;
2765
2766     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2767      * can now analyze for sequences of problematic code points.  (Prior to
2768      * this final joining, sequences could have been split over boundaries, and
2769      * hence missed).  The sequences only happen in folding, hence for any
2770      * non-EXACT EXACTish node */
2771     if (OP(scan) != EXACT) {
2772         U8 *s;
2773         U8 * s0 = (U8*) STRING(scan);
2774         U8 * const s_end = s0 + STR_LEN(scan);
2775
2776         /* The below is perhaps overboard, but this allows us to save a test
2777          * each time through the loop at the expense of a mask.  This is
2778          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2779          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2780          * This uses an exclusive 'or' to find that bit and then inverts it to
2781          * form a mask, with just a single 0, in the bit position where 'S' and
2782          * 's' differ. */
2783         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2784         const U8 s_masked = 's' & S_or_s_mask;
2785
2786         /* One pass is made over the node's string looking for all the
2787          * possibilities.  to avoid some tests in the loop, there are two main
2788          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2789          * non-UTF-8 */
2790         if (UTF) {
2791
2792             /* There are two problematic Greek code points in Unicode
2793              * casefolding
2794              *
2795              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2796              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2797              *
2798              * which casefold to
2799              *
2800              * Unicode                      UTF-8
2801              *
2802              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2803              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2804              *
2805              * This means that in case-insensitive matching (or "loose
2806              * matching", as Unicode calls it), an EXACTF of length six (the
2807              * UTF-8 encoded byte length of the above casefolded versions) can
2808              * match a target string of length two (the byte length of UTF-8
2809              * encoded U+0390 or U+03B0).  This would rather mess up the
2810              * minimum length computation.  (there are other code points that
2811              * also fold to these two sequences, but the delta is smaller)
2812              *
2813              * If these sequences are found, the minimum length is decreased by
2814              * four (six minus two).
2815              *
2816              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2817              * LETTER SHARP S.  We decrease the min length by 1 for each
2818              * occurrence of 'ss' found */
2819
2820 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2821 #           define U390_first_byte 0xb4
2822             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2823 #           define U3B0_first_byte 0xb5
2824             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2825 #else
2826 #           define U390_first_byte 0xce
2827             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2828 #           define U3B0_first_byte 0xcf
2829             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2830 #endif
2831             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2832                                                  yields a net of 0 */
2833             /* Examine the string for one of the problematic sequences */
2834             for (s = s0;
2835                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2836                                  * sequence we are looking for is 2 */
2837                  s += UTF8SKIP(s))
2838             {
2839
2840                 /* Look for the first byte in each problematic sequence */
2841                 switch (*s) {
2842                     /* We don't have to worry about other things that fold to
2843                      * 's' (such as the long s, U+017F), as all above-latin1
2844                      * code points have been pre-folded */
2845                     case 's':
2846                     case 'S':
2847
2848                         /* Current character is an 's' or 'S'.  If next one is
2849                          * as well, we have the dreaded sequence */
2850                         if (((*(s+1) & S_or_s_mask) == s_masked)
2851                             /* These two node types don't have special handling
2852                              * for 'ss' */
2853                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2854                         {
2855                             *min_subtract += 1;
2856                             OP(scan) = EXACTFU_SS;
2857                             s++;    /* No need to look at this character again */
2858                         }
2859                         break;
2860
2861                     case U390_first_byte:
2862                         if (s_end - s >= len
2863
2864                             /* The 1's are because are skipping comparing the
2865                              * first byte */
2866                             && memEQ(s + 1, U390_tail, len - 1))
2867                         {
2868                             goto greek_sequence;
2869                         }
2870                         break;
2871
2872                     case U3B0_first_byte:
2873                         if (! (s_end - s >= len
2874                                && memEQ(s + 1, U3B0_tail, len - 1)))
2875                         {
2876                             break;
2877                         }
2878                       greek_sequence:
2879                         *min_subtract += 4;
2880
2881                         /* This can't currently be handled by trie's, so change
2882                          * the node type to indicate this.  If EXACTFA and
2883                          * EXACTFL were ever to be handled by trie's, this
2884                          * would have to be changed.  If this node has already
2885                          * been changed to EXACTFU_SS in this loop, leave it as
2886                          * is.  (I (khw) think it doesn't matter in regexec.c
2887                          * for UTF patterns, but no need to change it */
2888                         if (OP(scan) == EXACTFU) {
2889                             OP(scan) = EXACTFU_TRICKYFOLD;
2890                         }
2891                         s += 6; /* We already know what this sequence is.  Skip
2892                                    the rest of it */
2893                         break;
2894                 }
2895             }
2896         }
2897         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2898
2899             /* Here, the pattern is not UTF-8.  We need to look only for the
2900              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2901              * in the final position.  Otherwise we can stop looking 1 byte
2902              * earlier because have to find both the first and second 's' */
2903             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2904
2905             for (s = s0; s < upper; s++) {
2906                 switch (*s) {
2907                     case 'S':
2908                     case 's':
2909                         if (s_end - s > 1
2910                             && ((*(s+1) & S_or_s_mask) == s_masked))
2911                         {
2912                             *min_subtract += 1;
2913
2914                             /* EXACTF nodes need to know that the minimum
2915                              * length changed so that a sharp s in the string
2916                              * can match this ss in the pattern, but they
2917                              * remain EXACTF nodes, as they are not trie'able,
2918                              * so don't have to invent a new node type to
2919                              * exclude them from the trie code */
2920                             if (OP(scan) != EXACTF) {
2921                                 OP(scan) = EXACTFU_SS;
2922                             }
2923                             s++;
2924                         }
2925                         break;
2926                     case LATIN_SMALL_LETTER_SHARP_S:
2927                         if (OP(scan) == EXACTF) {
2928                             *has_exactf_sharp_s = TRUE;
2929                         }
2930                         break;
2931                 }
2932             }
2933         }
2934     }
2935
2936 #ifdef DEBUGGING
2937     /* Allow dumping but overwriting the collection of skipped
2938      * ops and/or strings with fake optimized ops */
2939     n = scan + NODE_SZ_STR(scan);
2940     while (n <= stop) {
2941         OP(n) = OPTIMIZED;
2942         FLAGS(n) = 0;
2943         NEXT_OFF(n) = 0;
2944         n++;
2945     }
2946 #endif
2947     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2948     return stopnow;
2949 }
2950
2951 /* REx optimizer.  Converts nodes into quicker variants "in place".
2952    Finds fixed substrings.  */
2953
2954 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2955    to the position after last scanned or to NULL. */
2956
2957 #define INIT_AND_WITHP \
2958     assert(!and_withp); \
2959     Newx(and_withp,1,struct regnode_charclass_class); \
2960     SAVEFREEPV(and_withp)
2961
2962 /* this is a chain of data about sub patterns we are processing that
2963    need to be handled separately/specially in study_chunk. Its so
2964    we can simulate recursion without losing state.  */
2965 struct scan_frame;
2966 typedef struct scan_frame {
2967     regnode *last;  /* last node to process in this frame */
2968     regnode *next;  /* next node to process when last is reached */
2969     struct scan_frame *prev; /*previous frame*/
2970     I32 stop; /* what stopparen do we use */
2971 } scan_frame;
2972
2973
2974 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2975
2976 #define CASE_SYNST_FNC(nAmE)                                       \
2977 case nAmE:                                                         \
2978     if (flags & SCF_DO_STCLASS_AND) {                              \
2979             for (value = 0; value < 256; value++)                  \
2980                 if (!is_ ## nAmE ## _cp(value))                       \
2981                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2982     }                                                              \
2983     else {                                                         \
2984             for (value = 0; value < 256; value++)                  \
2985                 if (is_ ## nAmE ## _cp(value))                        \
2986                     ANYOF_BITMAP_SET(data->start_class, value);    \
2987     }                                                              \
2988     break;                                                         \
2989 case N ## nAmE:                                                    \
2990     if (flags & SCF_DO_STCLASS_AND) {                              \
2991             for (value = 0; value < 256; value++)                   \
2992                 if (is_ ## nAmE ## _cp(value))                         \
2993                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2994     }                                                               \
2995     else {                                                          \
2996             for (value = 0; value < 256; value++)                   \
2997                 if (!is_ ## nAmE ## _cp(value))                        \
2998                     ANYOF_BITMAP_SET(data->start_class, value);     \
2999     }                                                               \
3000     break
3001
3002
3003
3004 STATIC I32
3005 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3006                         I32 *minlenp, I32 *deltap,
3007                         regnode *last,
3008                         scan_data_t *data,
3009                         I32 stopparen,
3010                         U8* recursed,
3011                         struct regnode_charclass_class *and_withp,
3012                         U32 flags, U32 depth)
3013                         /* scanp: Start here (read-write). */
3014                         /* deltap: Write maxlen-minlen here. */
3015                         /* last: Stop before this one. */
3016                         /* data: string data about the pattern */
3017                         /* stopparen: treat close N as END */
3018                         /* recursed: which subroutines have we recursed into */
3019                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3020 {
3021     dVAR;
3022     I32 min = 0, pars = 0, code;
3023     regnode *scan = *scanp, *next;
3024     I32 delta = 0;
3025     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3026     int is_inf_internal = 0;            /* The studied chunk is infinite */
3027     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3028     scan_data_t data_fake;
3029     SV *re_trie_maxbuff = NULL;
3030     regnode *first_non_open = scan;
3031     I32 stopmin = I32_MAX;
3032     scan_frame *frame = NULL;
3033     GET_RE_DEBUG_FLAGS_DECL;
3034
3035     PERL_ARGS_ASSERT_STUDY_CHUNK;
3036
3037 #ifdef DEBUGGING
3038     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3039 #endif
3040
3041     if ( depth == 0 ) {
3042         while (first_non_open && OP(first_non_open) == OPEN)
3043             first_non_open=regnext(first_non_open);
3044     }
3045
3046
3047   fake_study_recurse:
3048     while ( scan && OP(scan) != END && scan < last ){
3049         UV min_subtract = 0;    /* How much to subtract from the minimum node
3050                                    length to get a real minimum (because the
3051                                    folded version may be shorter) */
3052         bool has_exactf_sharp_s = FALSE;
3053         /* Peephole optimizer: */
3054         DEBUG_STUDYDATA("Peep:", data,depth);
3055         DEBUG_PEEP("Peep",scan,depth);
3056
3057         /* Its not clear to khw or hv why this is done here, and not in the
3058          * clauses that deal with EXACT nodes.  khw's guess is that it's
3059          * because of a previous design */
3060         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3061
3062         /* Follow the next-chain of the current node and optimize
3063            away all the NOTHINGs from it.  */
3064         if (OP(scan) != CURLYX) {
3065             const int max = (reg_off_by_arg[OP(scan)]
3066                        ? I32_MAX
3067                        /* I32 may be smaller than U16 on CRAYs! */
3068                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3069             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3070             int noff;
3071             regnode *n = scan;
3072
3073             /* Skip NOTHING and LONGJMP. */
3074             while ((n = regnext(n))
3075                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3076                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3077                    && off + noff < max)
3078                 off += noff;
3079             if (reg_off_by_arg[OP(scan)])
3080                 ARG(scan) = off;
3081             else
3082                 NEXT_OFF(scan) = off;
3083         }
3084
3085
3086
3087         /* The principal pseudo-switch.  Cannot be a switch, since we
3088            look into several different things.  */
3089         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3090                    || OP(scan) == IFTHEN) {
3091             next = regnext(scan);
3092             code = OP(scan);
3093             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3094
3095             if (OP(next) == code || code == IFTHEN) {
3096                 /* NOTE - There is similar code to this block below for handling
3097                    TRIE nodes on a re-study.  If you change stuff here check there
3098                    too. */
3099                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3100                 struct regnode_charclass_class accum;
3101                 regnode * const startbranch=scan;
3102
3103                 if (flags & SCF_DO_SUBSTR)
3104                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3105                 if (flags & SCF_DO_STCLASS)
3106                     cl_init_zero(pRExC_state, &accum);
3107
3108                 while (OP(scan) == code) {
3109                     I32 deltanext, minnext, f = 0, fake;
3110                     struct regnode_charclass_class this_class;
3111
3112                     num++;
3113                     data_fake.flags = 0;
3114                     if (data) {
3115                         data_fake.whilem_c = data->whilem_c;
3116                         data_fake.last_closep = data->last_closep;
3117                     }
3118                     else
3119                         data_fake.last_closep = &fake;
3120
3121                     data_fake.pos_delta = delta;
3122                     next = regnext(scan);
3123                     scan = NEXTOPER(scan);
3124                     if (code != BRANCH)
3125                         scan = NEXTOPER(scan);
3126                     if (flags & SCF_DO_STCLASS) {
3127                         cl_init(pRExC_state, &this_class);
3128                         data_fake.start_class = &this_class;
3129                         f = SCF_DO_STCLASS_AND;
3130                     }
3131                     if (flags & SCF_WHILEM_VISITED_POS)
3132                         f |= SCF_WHILEM_VISITED_POS;
3133
3134                     /* we suppose the run is continuous, last=next...*/
3135                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3136                                           next, &data_fake,
3137                                           stopparen, recursed, NULL, f,depth+1);
3138                     if (min1 > minnext)
3139                         min1 = minnext;
3140                     if (max1 < minnext + deltanext)
3141                         max1 = minnext + deltanext;
3142                     if (deltanext == I32_MAX)
3143                         is_inf = is_inf_internal = 1;
3144                     scan = next;
3145                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3146                         pars++;
3147                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3148                         if ( stopmin > minnext)
3149                             stopmin = min + min1;
3150                         flags &= ~SCF_DO_SUBSTR;
3151                         if (data)
3152                             data->flags |= SCF_SEEN_ACCEPT;
3153                     }
3154                     if (data) {
3155                         if (data_fake.flags & SF_HAS_EVAL)
3156                             data->flags |= SF_HAS_EVAL;
3157                         data->whilem_c = data_fake.whilem_c;
3158                     }
3159                     if (flags & SCF_DO_STCLASS)
3160                         cl_or(pRExC_state, &accum, &this_class);
3161                 }
3162                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3163                     min1 = 0;
3164                 if (flags & SCF_DO_SUBSTR) {
3165                     data->pos_min += min1;
3166                     data->pos_delta += max1 - min1;
3167                     if (max1 != min1 || is_inf)
3168                         data->longest = &(data->longest_float);
3169                 }
3170                 min += min1;
3171                 delta += max1 - min1;
3172                 if (flags & SCF_DO_STCLASS_OR) {
3173                     cl_or(pRExC_state, data->start_class, &accum);
3174                     if (min1) {
3175                         cl_and(data->start_class, and_withp);
3176                         flags &= ~SCF_DO_STCLASS;
3177                     }
3178                 }
3179                 else if (flags & SCF_DO_STCLASS_AND) {
3180                     if (min1) {
3181                         cl_and(data->start_class, &accum);
3182                         flags &= ~SCF_DO_STCLASS;
3183                     }
3184                     else {
3185                         /* Switch to OR mode: cache the old value of
3186                          * data->start_class */
3187                         INIT_AND_WITHP;
3188                         StructCopy(data->start_class, and_withp,
3189                                    struct regnode_charclass_class);
3190                         flags &= ~SCF_DO_STCLASS_AND;
3191                         StructCopy(&accum, data->start_class,
3192                                    struct regnode_charclass_class);
3193                         flags |= SCF_DO_STCLASS_OR;
3194                         data->start_class->flags |= ANYOF_EOS;
3195                     }
3196                 }
3197
3198                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3199                 /* demq.
3200
3201                    Assuming this was/is a branch we are dealing with: 'scan' now
3202                    points at the item that follows the branch sequence, whatever
3203                    it is. We now start at the beginning of the sequence and look
3204                    for subsequences of
3205
3206                    BRANCH->EXACT=>x1
3207                    BRANCH->EXACT=>x2
3208                    tail
3209
3210                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3211
3212                    If we can find such a subsequence we need to turn the first
3213                    element into a trie and then add the subsequent branch exact
3214                    strings to the trie.
3215
3216                    We have two cases
3217
3218                      1. patterns where the whole set of branches can be converted.
3219
3220                      2. patterns where only a subset can be converted.
3221
3222                    In case 1 we can replace the whole set with a single regop
3223                    for the trie. In case 2 we need to keep the start and end
3224                    branches so
3225
3226                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3227                      becomes BRANCH TRIE; BRANCH X;
3228
3229                   There is an additional case, that being where there is a
3230                   common prefix, which gets split out into an EXACT like node
3231                   preceding the TRIE node.
3232
3233                   If x(1..n)==tail then we can do a simple trie, if not we make
3234                   a "jump" trie, such that when we match the appropriate word
3235                   we "jump" to the appropriate tail node. Essentially we turn
3236                   a nested if into a case structure of sorts.
3237
3238                 */
3239
3240                     int made=0;
3241                     if (!re_trie_maxbuff) {
3242                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3243                         if (!SvIOK(re_trie_maxbuff))
3244                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3245                     }
3246                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3247                         regnode *cur;
3248                         regnode *first = (regnode *)NULL;
3249                         regnode *last = (regnode *)NULL;
3250                         regnode *tail = scan;
3251                         U8 trietype = 0;
3252                         U32 count=0;
3253
3254 #ifdef DEBUGGING
3255                         SV * const mysv = sv_newmortal();       /* for dumping */
3256 #endif
3257                         /* var tail is used because there may be a TAIL
3258                            regop in the way. Ie, the exacts will point to the
3259                            thing following the TAIL, but the last branch will
3260                            point at the TAIL. So we advance tail. If we
3261                            have nested (?:) we may have to move through several
3262                            tails.
3263                          */
3264
3265                         while ( OP( tail ) == TAIL ) {
3266                             /* this is the TAIL generated by (?:) */
3267                             tail = regnext( tail );
3268                         }
3269
3270
3271                         DEBUG_TRIE_COMPILE_r({
3272                             regprop(RExC_rx, mysv, tail );
3273                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3274                                 (int)depth * 2 + 2, "",
3275                                 "Looking for TRIE'able sequences. Tail node is: ",
3276                                 SvPV_nolen_const( mysv )
3277                             );
3278                         });
3279
3280                         /*
3281
3282                             Step through the branches
3283                                 cur represents each branch,
3284                                 noper is the first thing to be matched as part of that branch
3285                                 noper_next is the regnext() of that node.
3286
3287                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3288                             via a "jump trie" but we also support building with NOJUMPTRIE,
3289                             which restricts the trie logic to structures like /FOO|BAR/.
3290
3291                             If noper is a trieable nodetype then the branch is a possible optimization
3292                             target. If we are building under NOJUMPTRIE then we require that noper_next
3293                             is the same as scan (our current position in the regex program).
3294
3295                             Once we have two or more consecutive such branches we can create a
3296                             trie of the EXACT's contents and stitch it in place into the program.
3297
3298                             If the sequence represents all of the branches in the alternation we
3299                             replace the entire thing with a single TRIE node.
3300
3301                             Otherwise when it is a subsequence we need to stitch it in place and
3302                             replace only the relevant branches. This means the first branch has
3303                             to remain as it is used by the alternation logic, and its next pointer,
3304                             and needs to be repointed at the item on the branch chain following
3305                             the last branch we have optimized away.
3306
3307                             This could be either a BRANCH, in which case the subsequence is internal,
3308                             or it could be the item following the branch sequence in which case the
3309                             subsequence is at the end (which does not necessarily mean the first node
3310                             is the start of the alternation).
3311
3312                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3313
3314                                 optype          |  trietype
3315                                 ----------------+-----------
3316                                 NOTHING         | NOTHING
3317                                 EXACT           | EXACT
3318                                 EXACTFU         | EXACTFU
3319                                 EXACTFU_SS      | EXACTFU
3320                                 EXACTFU_TRICKYFOLD | EXACTFU
3321                                 EXACTFA         | 0
3322
3323
3324                         */
3325 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3326                        ( EXACT == (X) )   ? EXACT :        \
3327                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3328                        0 )
3329
3330                         /* dont use tail as the end marker for this traverse */
3331                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3332                             regnode * const noper = NEXTOPER( cur );
3333                             U8 noper_type = OP( noper );
3334                             U8 noper_trietype = TRIE_TYPE( noper_type );
3335 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3336                             regnode * const noper_next = regnext( noper );
3337                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3338                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3339 #endif
3340
3341                             DEBUG_TRIE_COMPILE_r({
3342                                 regprop(RExC_rx, mysv, cur);
3343                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3344                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3345
3346                                 regprop(RExC_rx, mysv, noper);
3347                                 PerlIO_printf( Perl_debug_log, " -> %s",
3348                                     SvPV_nolen_const(mysv));
3349
3350                                 if ( noper_next ) {
3351                                   regprop(RExC_rx, mysv, noper_next );
3352                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3353                                     SvPV_nolen_const(mysv));
3354                                 }
3355                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3356                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3357                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3358                                 );
3359                             });
3360
3361                             /* Is noper a trieable nodetype that can be merged with the
3362                              * current trie (if there is one)? */
3363                             if ( noper_trietype
3364                                   &&
3365                                   (
3366                                         ( noper_trietype == NOTHING)
3367                                         || ( trietype == NOTHING )
3368                                         || ( trietype == noper_trietype )
3369                                   )
3370 #ifdef NOJUMPTRIE
3371                                   && noper_next == tail
3372 #endif
3373                                   && count < U16_MAX)
3374                             {
3375                                 /* Handle mergable triable node
3376                                  * Either we are the first node in a new trieable sequence,
3377                                  * in which case we do some bookkeeping, otherwise we update
3378                                  * the end pointer. */
3379                                 if ( !first ) {
3380                                     first = cur;
3381                                     trietype = noper_trietype;
3382                                     if ( noper_trietype == NOTHING ) {
3383 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3384                                         regnode * const noper_next = regnext( noper );
3385                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3386                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3387 #endif
3388
3389                                         if ( noper_next_trietype )
3390                                             trietype = noper_next_trietype;
3391                                     }
3392                                 } else {
3393                                     if ( trietype == NOTHING )
3394                                         trietype = noper_trietype;
3395                                     last = cur;
3396                                 }
3397                                 if (first)
3398                                     count++;
3399                             } /* end handle mergable triable node */
3400                             else {
3401                                 /* handle unmergable node -
3402                                  * noper may either be a triable node which can not be tried
3403                                  * together with the current trie, or a non triable node */
3404                                 if ( last ) {
3405                                     /* If last is set and trietype is not NOTHING then we have found
3406                                      * at least two triable branch sequences in a row of a similar
3407                                      * trietype so we can turn them into a trie. If/when we
3408                                      * allow NOTHING to start a trie sequence this condition will be
3409                                      * required, and it isn't expensive so we leave it in for now. */
3410                                     if ( trietype != NOTHING )
3411                                         make_trie( pRExC_state,
3412                                                 startbranch, first, cur, tail, count,
3413                                                 trietype, depth+1 );
3414                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3415                                 }
3416                                 if ( noper_trietype
3417 #ifdef NOJUMPTRIE
3418                                      && noper_next == tail
3419 #endif
3420                                 ){
3421                                     /* noper is triable, so we can start a new trie sequence */
3422                                     count = 1;
3423                                     first = cur;
3424                                     trietype = noper_trietype;
3425                                 } else if (first) {
3426                                     /* if we already saw a first but the current node is not triable then we have
3427                                      * to reset the first information. */
3428                                     count = 0;
3429                                     first = NULL;
3430                                     trietype = 0;
3431                                 }
3432                             } /* end handle unmergable node */
3433                         } /* loop over branches */
3434                         DEBUG_TRIE_COMPILE_r({
3435                             regprop(RExC_rx, mysv, cur);
3436                             PerlIO_printf( Perl_debug_log,
3437                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3438                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3439
3440                         });
3441                         if ( last ) {
3442                             if ( trietype != NOTHING ) {
3443                                 /* the last branch of the sequence was part of a trie,
3444                                  * so we have to construct it here outside of the loop
3445                                  */
3446                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3447 #ifdef TRIE_STUDY_OPT
3448                                 if ( ((made == MADE_EXACT_TRIE &&
3449                                      startbranch == first)
3450                                      || ( first_non_open == first )) &&
3451                                      depth==0 ) {
3452                                     flags |= SCF_TRIE_RESTUDY;
3453                                     if ( startbranch == first
3454                                          && scan == tail )
3455                                     {
3456                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3457                                     }
3458                                 }
3459 #endif
3460                             } else {
3461                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3462                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3463                                  */
3464                                 if ( startbranch == first ) {
3465                                     regnode *opt;
3466                                     /* the entire thing is a NOTHING sequence, something like this:
3467                                      * (?:|) So we can turn it into a plain NOTHING op. */
3468                                     DEBUG_TRIE_COMPILE_r({
3469                                         regprop(RExC_rx, mysv, cur);
3470                                         PerlIO_printf( Perl_debug_log,
3471                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3472                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3473
3474                                     });
3475                                     OP(startbranch)= NOTHING;
3476                                     NEXT_OFF(startbranch)= tail - startbranch;
3477                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3478                                         OP(opt)= OPTIMIZED;
3479                                 }
3480                             }
3481                         } /* end if ( last) */
3482                     } /* TRIE_MAXBUF is non zero */
3483
3484                 } /* do trie */
3485
3486             }
3487             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3488                 scan = NEXTOPER(NEXTOPER(scan));
3489             } else                      /* single branch is optimized. */
3490                 scan = NEXTOPER(scan);
3491             continue;
3492         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3493             scan_frame *newframe = NULL;
3494             I32 paren;
3495             regnode *start;
3496             regnode *end;
3497
3498             if (OP(scan) != SUSPEND) {
3499             /* set the pointer */
3500                 if (OP(scan) == GOSUB) {
3501                     paren = ARG(scan);
3502                     RExC_recurse[ARG2L(scan)] = scan;
3503                     start = RExC_open_parens[paren-1];
3504                     end   = RExC_close_parens[paren-1];
3505                 } else {
3506                     paren = 0;
3507                     start = RExC_rxi->program + 1;
3508                     end   = RExC_opend;
3509                 }
3510                 if (!recursed) {
3511                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3512                     SAVEFREEPV(recursed);
3513                 }
3514                 if (!PAREN_TEST(recursed,paren+1)) {
3515                     PAREN_SET(recursed,paren+1);
3516                     Newx(newframe,1,scan_frame);
3517                 } else {
3518                     if (flags & SCF_DO_SUBSTR) {
3519                         SCAN_COMMIT(pRExC_state,data,minlenp);
3520                         data->longest = &(data->longest_float);
3521                     }
3522                     is_inf = is_inf_internal = 1;
3523                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3524                         cl_anything(pRExC_state, data->start_class);
3525                     flags &= ~SCF_DO_STCLASS;
3526                 }
3527             } else {
3528                 Newx(newframe,1,scan_frame);
3529                 paren = stopparen;
3530                 start = scan+2;
3531                 end = regnext(scan);
3532             }
3533             if (newframe) {
3534                 assert(start);
3535                 assert(end);
3536                 SAVEFREEPV(newframe);
3537                 newframe->next = regnext(scan);
3538                 newframe->last = last;
3539                 newframe->stop = stopparen;
3540                 newframe->prev = frame;
3541
3542                 frame = newframe;
3543                 scan =  start;
3544                 stopparen = paren;
3545                 last = end;
3546
3547                 continue;
3548             }
3549         }
3550         else if (OP(scan) == EXACT) {
3551             I32 l = STR_LEN(scan);
3552             UV uc;
3553             if (UTF) {
3554                 const U8 * const s = (U8*)STRING(scan);
3555                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3556                 l = utf8_length(s, s + l);
3557             } else {
3558                 uc = *((U8*)STRING(scan));
3559             }
3560             min += l;
3561             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3562                 /* The code below prefers earlier match for fixed
3563                    offset, later match for variable offset.  */
3564                 if (data->last_end == -1) { /* Update the start info. */
3565                     data->last_start_min = data->pos_min;
3566                     data->last_start_max = is_inf
3567                         ? I32_MAX : data->pos_min + data->pos_delta;
3568                 }
3569                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3570                 if (UTF)
3571                     SvUTF8_on(data->last_found);
3572                 {
3573                     SV * const sv = data->last_found;
3574                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3575                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3576                     if (mg && mg->mg_len >= 0)
3577                         mg->mg_len += utf8_length((U8*)STRING(scan),
3578                                                   (U8*)STRING(scan)+STR_LEN(scan));
3579                 }
3580                 data->last_end = data->pos_min + l;
3581                 data->pos_min += l; /* As in the first entry. */
3582                 data->flags &= ~SF_BEFORE_EOL;
3583             }
3584             if (flags & SCF_DO_STCLASS_AND) {
3585                 /* Check whether it is compatible with what we know already! */
3586                 int compat = 1;
3587
3588
3589                 /* If compatible, we or it in below.  It is compatible if is
3590                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3591                  * it's for a locale.  Even if there isn't unicode semantics
3592                  * here, at runtime there may be because of matching against a
3593                  * utf8 string, so accept a possible false positive for
3594                  * latin1-range folds */
3595                 if (uc >= 0x100 ||
3596                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3597                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3598                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3599                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3600                     )
3601                 {
3602                     compat = 0;
3603                 }
3604                 ANYOF_CLASS_ZERO(data->start_class);
3605                 ANYOF_BITMAP_ZERO(data->start_class);
3606                 if (compat)
3607                     ANYOF_BITMAP_SET(data->start_class, uc);
3608                 else if (uc >= 0x100) {
3609                     int i;
3610
3611                     /* Some Unicode code points fold to the Latin1 range; as
3612                      * XXX temporary code, instead of figuring out if this is
3613                      * one, just assume it is and set all the start class bits
3614                      * that could be some such above 255 code point's fold
3615                      * which will generate fals positives.  As the code
3616                      * elsewhere that does compute the fold settles down, it
3617                      * can be extracted out and re-used here */
3618                     for (i = 0; i < 256; i++){
3619                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3620                             ANYOF_BITMAP_SET(data->start_class, i);
3621                         }
3622                     }
3623                 }
3624                 data->start_class->flags &= ~ANYOF_EOS;
3625                 if (uc < 0x100)
3626                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3627             }
3628             else if (flags & SCF_DO_STCLASS_OR) {
3629                 /* false positive possible if the class is case-folded */
3630                 if (uc < 0x100)
3631                     ANYOF_BITMAP_SET(data->start_class, uc);
3632                 else
3633                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3634                 data->start_class->flags &= ~ANYOF_EOS;
3635                 cl_and(data->start_class, and_withp);
3636             }
3637             flags &= ~SCF_DO_STCLASS;
3638         }
3639         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3640             I32 l = STR_LEN(scan);
3641             UV uc = *((U8*)STRING(scan));
3642
3643             /* Search for fixed substrings supports EXACT only. */
3644             if (flags & SCF_DO_SUBSTR) {
3645                 assert(data);
3646                 SCAN_COMMIT(pRExC_state, data, minlenp);
3647             }
3648             if (UTF) {
3649                 const U8 * const s = (U8 *)STRING(scan);
3650                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3651                 l = utf8_length(s, s + l);
3652             }
3653             else if (has_exactf_sharp_s) {
3654                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3655             }
3656             min += l - min_subtract;
3657             if (min < 0) {
3658                 min = 0;
3659             }
3660             delta += min_subtract;
3661             if (flags & SCF_DO_SUBSTR) {
3662                 data->pos_min += l - min_subtract;
3663                 if (data->pos_min < 0) {
3664                     data->pos_min = 0;
3665                 }
3666                 data->pos_delta += min_subtract;
3667                 if (min_subtract) {
3668                     data->longest = &(data->longest_float);
3669                 }
3670             }
3671             if (flags & SCF_DO_STCLASS_AND) {
3672                 /* Check whether it is compatible with what we know already! */
3673                 int compat = 1;
3674                 if (uc >= 0x100 ||
3675                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3676                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3677                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3678                 {
3679                     compat = 0;
3680                 }
3681                 ANYOF_CLASS_ZERO(data->start_class);
3682                 ANYOF_BITMAP_ZERO(data->start_class);
3683                 if (compat) {
3684                     ANYOF_BITMAP_SET(data->start_class, uc);
3685                     data->start_class->flags &= ~ANYOF_EOS;
3686                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3687                     if (OP(scan) == EXACTFL) {
3688                         /* XXX This set is probably no longer necessary, and
3689                          * probably wrong as LOCALE now is on in the initial
3690                          * state */
3691                         data->start_class->flags |= ANYOF_LOCALE;
3692                     }
3693                     else {
3694
3695                         /* Also set the other member of the fold pair.  In case
3696                          * that unicode semantics is called for at runtime, use
3697                          * the full latin1 fold.  (Can't do this for locale,
3698                          * because not known until runtime) */
3699                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3700
3701                         /* All other (EXACTFL handled above) folds except under
3702                          * /iaa that include s, S, and sharp_s also may include
3703                          * the others */
3704                         if (OP(scan) != EXACTFA) {
3705                             if (uc == 's' || uc == 'S') {
3706                                 ANYOF_BITMAP_SET(data->start_class,
3707                                                  LATIN_SMALL_LETTER_SHARP_S);
3708                             }
3709                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3710                                 ANYOF_BITMAP_SET(data->start_class, 's');
3711                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3712                             }
3713                         }
3714                     }
3715                 }
3716                 else if (uc >= 0x100) {
3717                     int i;
3718                     for (i = 0; i < 256; i++){
3719                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3720                             ANYOF_BITMAP_SET(data->start_class, i);
3721                         }
3722                     }
3723                 }
3724             }
3725             else if (flags & SCF_DO_STCLASS_OR) {
3726                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3727                     /* false positive possible if the class is case-folded.
3728                        Assume that the locale settings are the same... */
3729                     if (uc < 0x100) {
3730                         ANYOF_BITMAP_SET(data->start_class, uc);
3731                         if (OP(scan) != EXACTFL) {
3732
3733                             /* And set the other member of the fold pair, but
3734                              * can't do that in locale because not known until
3735                              * run-time */
3736                             ANYOF_BITMAP_SET(data->start_class,
3737                                              PL_fold_latin1[uc]);
3738
3739                             /* All folds except under /iaa that include s, S,
3740                              * and sharp_s also may include the others */
3741                             if (OP(scan) != EXACTFA) {
3742                                 if (uc == 's' || uc == 'S') {
3743                                     ANYOF_BITMAP_SET(data->start_class,
3744                                                    LATIN_SMALL_LETTER_SHARP_S);
3745                                 }
3746                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3747                                     ANYOF_BITMAP_SET(data->start_class, 's');
3748                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3749                                 }
3750                             }
3751                         }
3752                     }
3753                     data->start_class->flags &= ~ANYOF_EOS;
3754                 }
3755                 cl_and(data->start_class, and_withp);
3756             }
3757             flags &= ~SCF_DO_STCLASS;
3758         }
3759         else if (REGNODE_VARIES(OP(scan))) {
3760             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3761             I32 f = flags, pos_before = 0;
3762             regnode * const oscan = scan;
3763             struct regnode_charclass_class this_class;
3764             struct regnode_charclass_class *oclass = NULL;
3765             I32 next_is_eval = 0;
3766
3767             switch (PL_regkind[OP(scan)]) {
3768             case WHILEM:                /* End of (?:...)* . */
3769                 scan = NEXTOPER(scan);
3770                 goto finish;
3771             case PLUS:
3772                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3773                     next = NEXTOPER(scan);
3774                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3775                         mincount = 1;
3776                         maxcount = REG_INFTY;
3777                         next = regnext(scan);
3778                         scan = NEXTOPER(scan);
3779                         goto do_curly;
3780                     }
3781                 }
3782                 if (flags & SCF_DO_SUBSTR)
3783                     data->pos_min++;
3784                 min++;
3785                 /* Fall through. */
3786             case STAR:
3787                 if (flags & SCF_DO_STCLASS) {
3788                     mincount = 0;
3789                     maxcount = REG_INFTY;
3790                     next = regnext(scan);
3791                     scan = NEXTOPER(scan);
3792                     goto do_curly;
3793                 }
3794                 is_inf = is_inf_internal = 1;
3795                 scan = regnext(scan);
3796                 if (flags & SCF_DO_SUBSTR) {
3797                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3798                     data->longest = &(data->longest_float);
3799                 }
3800                 goto optimize_curly_tail;
3801             case CURLY:
3802                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3803                     && (scan->flags == stopparen))
3804                 {
3805                     mincount = 1;
3806                     maxcount = 1;
3807                 } else {
3808                     mincount = ARG1(scan);
3809                     maxcount = ARG2(scan);
3810                 }
3811                 next = regnext(scan);
3812                 if (OP(scan) == CURLYX) {
3813                     I32 lp = (data ? *(data->last_closep) : 0);
3814                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3815                 }
3816                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3817                 next_is_eval = (OP(scan) == EVAL);
3818               do_curly:
3819                 if (flags & SCF_DO_SUBSTR) {
3820                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3821                     pos_before = data->pos_min;
3822                 }
3823                 if (data) {
3824                     fl = data->flags;
3825                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3826                     if (is_inf)
3827                         data->flags |= SF_IS_INF;
3828                 }
3829                 if (flags & SCF_DO_STCLASS) {
3830                     cl_init(pRExC_state, &this_class);
3831                     oclass = data->start_class;
3832                     data->start_class = &this_class;
3833                     f |= SCF_DO_STCLASS_AND;
3834                     f &= ~SCF_DO_STCLASS_OR;
3835                 }
3836                 /* Exclude from super-linear cache processing any {n,m}
3837                    regops for which the combination of input pos and regex
3838                    pos is not enough information to determine if a match
3839                    will be possible.
3840
3841                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3842                    regex pos at the \s*, the prospects for a match depend not
3843                    only on the input position but also on how many (bar\s*)
3844                    repeats into the {4,8} we are. */
3845                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3846                     f &= ~SCF_WHILEM_VISITED_POS;
3847
3848                 /* This will finish on WHILEM, setting scan, or on NULL: */
3849                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3850                                       last, data, stopparen, recursed, NULL,
3851                                       (mincount == 0
3852                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3853
3854                 if (flags & SCF_DO_STCLASS)
3855                     data->start_class = oclass;
3856                 if (mincount == 0 || minnext == 0) {
3857                     if (flags & SCF_DO_STCLASS_OR) {
3858                         cl_or(pRExC_state, data->start_class, &this_class);
3859                     }
3860                     else if (flags & SCF_DO_STCLASS_AND) {
3861                         /* Switch to OR mode: cache the old value of
3862                          * data->start_class */
3863                         INIT_AND_WITHP;
3864                         StructCopy(data->start_class, and_withp,
3865                                    struct regnode_charclass_class);
3866                         flags &= ~SCF_DO_STCLASS_AND;
3867                         StructCopy(&this_class, data->start_class,
3868                                    struct regnode_charclass_class);
3869                         flags |= SCF_DO_STCLASS_OR;
3870                         data->start_class->flags |= ANYOF_EOS;
3871                     }
3872                 } else {                /* Non-zero len */
3873                     if (flags & SCF_DO_STCLASS_OR) {
3874                         cl_or(pRExC_state, data->start_class, &this_class);
3875                         cl_and(data->start_class, and_withp);
3876                     }
3877                     else if (flags & SCF_DO_STCLASS_AND)
3878                         cl_and(data->start_class, &this_class);
3879                     flags &= ~SCF_DO_STCLASS;
3880                 }
3881                 if (!scan)              /* It was not CURLYX, but CURLY. */
3882                     scan = next;
3883                 if ( /* ? quantifier ok, except for (?{ ... }) */
3884                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3885                     && (minnext == 0) && (deltanext == 0)
3886                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3887                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3888                 {
3889                     ckWARNreg(RExC_parse,
3890                               "Quantifier unexpected on zero-length expression");
3891                 }
3892
3893                 min += minnext * mincount;
3894                 is_inf_internal |= ((maxcount == REG_INFTY
3895                                      && (minnext + deltanext) > 0)
3896                                     || deltanext == I32_MAX);
3897                 is_inf |= is_inf_internal;
3898                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3899
3900                 /* Try powerful optimization CURLYX => CURLYN. */
3901                 if (  OP(oscan) == CURLYX && data
3902                       && data->flags & SF_IN_PAR
3903                       && !(data->flags & SF_HAS_EVAL)
3904                       && !deltanext && minnext == 1 ) {
3905                     /* Try to optimize to CURLYN.  */
3906                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3907                     regnode * const nxt1 = nxt;
3908 #ifdef DEBUGGING
3909                     regnode *nxt2;
3910 #endif
3911
3912                     /* Skip open. */
3913                     nxt = regnext(nxt);
3914                     if (!REGNODE_SIMPLE(OP(nxt))
3915                         && !(PL_regkind[OP(nxt)] == EXACT
3916                              && STR_LEN(nxt) == 1))
3917                         goto nogo;
3918 #ifdef DEBUGGING
3919                     nxt2 = nxt;
3920 #endif
3921                     nxt = regnext(nxt);
3922                     if (OP(nxt) != CLOSE)
3923                         goto nogo;
3924                     if (RExC_open_parens) {
3925                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3926                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3927                     }
3928                     /* Now we know that nxt2 is the only contents: */
3929                     oscan->flags = (U8)ARG(nxt);
3930                     OP(oscan) = CURLYN;
3931                     OP(nxt1) = NOTHING; /* was OPEN. */
3932
3933 #ifdef DEBUGGING
3934                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3935                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3936                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3937                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3938                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3939                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3940 #endif
3941                 }
3942               nogo:
3943
3944                 /* Try optimization CURLYX => CURLYM. */
3945                 if (  OP(oscan) == CURLYX && data
3946                       && !(data->flags & SF_HAS_PAR)
3947                       && !(data->flags & SF_HAS_EVAL)
3948                       && !deltanext     /* atom is fixed width */
3949                       && minnext != 0   /* CURLYM can't handle zero width */
3950                 ) {
3951                     /* XXXX How to optimize if data == 0? */
3952                     /* Optimize to a simpler form.  */
3953                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3954                     regnode *nxt2;
3955
3956                     OP(oscan) = CURLYM;
3957                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3958                             && (OP(nxt2) != WHILEM))
3959                         nxt = nxt2;
3960                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3961                     /* Need to optimize away parenths. */
3962                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3963                         /* Set the parenth number.  */
3964                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3965
3966                         oscan->flags = (U8)ARG(nxt);
3967                         if (RExC_open_parens) {
3968                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3969                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3970                         }
3971                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3972                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3973
3974 #ifdef DEBUGGING
3975                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3976                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3977                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3978                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3979 #endif
3980 #if 0
3981                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3982                             regnode *nnxt = regnext(nxt1);
3983                             if (nnxt == nxt) {
3984                                 if (reg_off_by_arg[OP(nxt1)])
3985                                     ARG_SET(nxt1, nxt2 - nxt1);
3986                                 else if (nxt2 - nxt1 < U16_MAX)
3987                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3988                                 else
3989                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3990                             }
3991                             nxt1 = nnxt;
3992                         }
3993 #endif
3994                         /* Optimize again: */
3995                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3996                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3997                     }
3998                     else
3999                         oscan->flags = 0;
4000                 }
4001                 else if ((OP(oscan) == CURLYX)
4002                          && (flags & SCF_WHILEM_VISITED_POS)
4003                          /* See the comment on a similar expression above.
4004                             However, this time it's not a subexpression
4005                             we care about, but the expression itself. */
4006                          && (maxcount == REG_INFTY)
4007                          && data && ++data->whilem_c < 16) {
4008                     /* This stays as CURLYX, we can put the count/of pair. */
4009                     /* Find WHILEM (as in regexec.c) */
4010                     regnode *nxt = oscan + NEXT_OFF(oscan);
4011
4012                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4013                         nxt += ARG(nxt);
4014                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4015                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4016                 }
4017                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4018                     pars++;
4019                 if (flags & SCF_DO_SUBSTR) {
4020                     SV *last_str = NULL;
4021                     int counted = mincount != 0;
4022
4023                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4024 #if defined(SPARC64_GCC_WORKAROUND)
4025                         I32 b = 0;
4026                         STRLEN l = 0;
4027                         const char *s = NULL;
4028                         I32 old = 0;
4029
4030                         if (pos_before >= data->last_start_min)
4031                             b = pos_before;
4032                         else
4033                             b = data->last_start_min;
4034
4035                         l = 0;
4036                         s = SvPV_const(data->last_found, l);
4037                         old = b - data->last_start_min;
4038
4039 #else
4040                         I32 b = pos_before >= data->last_start_min
4041                             ? pos_before : data->last_start_min;
4042                         STRLEN l;
4043                         const char * const s = SvPV_const(data->last_found, l);
4044                         I32 old = b - data->last_start_min;
4045 #endif
4046
4047                         if (UTF)
4048                             old = utf8_hop((U8*)s, old) - (U8*)s;
4049                         l -= old;
4050                         /* Get the added string: */
4051                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4052                         if (deltanext == 0 && pos_before == b) {
4053                             /* What was added is a constant string */
4054                             if (mincount > 1) {
4055                                 SvGROW(last_str, (mincount * l) + 1);
4056                                 repeatcpy(SvPVX(last_str) + l,
4057                                           SvPVX_const(last_str), l, mincount - 1);
4058                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4059                                 /* Add additional parts. */
4060                                 SvCUR_set(data->last_found,
4061                                           SvCUR(data->last_found) - l);
4062                                 sv_catsv(data->last_found, last_str);
4063                                 {
4064                                     SV * sv = data->last_found;
4065                                     MAGIC *mg =
4066                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4067                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4068                                     if (mg && mg->mg_len >= 0)
4069                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4070                                 }
4071                                 data->last_end += l * (mincount - 1);
4072                             }
4073                         } else {
4074                             /* start offset must point into the last copy */
4075                             data->last_start_min += minnext * (mincount - 1);
4076                             data->last_start_max += is_inf ? I32_MAX
4077                                 : (maxcount - 1) * (minnext + data->pos_delta);
4078                         }
4079                     }
4080                     /* It is counted once already... */
4081                     data->pos_min += minnext * (mincount - counted);
4082                     data->pos_delta += - counted * deltanext +
4083                         (minnext + deltanext) * maxcount - minnext * mincount;
4084                     if (mincount != maxcount) {
4085                          /* Cannot extend fixed substrings found inside
4086                             the group.  */
4087                         SCAN_COMMIT(pRExC_state,data,minlenp);
4088                         if (mincount && last_str) {
4089                             SV * const sv = data->last_found;
4090                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4091                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4092
4093                             if (mg)
4094                                 mg->mg_len = -1;
4095                             sv_setsv(sv, last_str);
4096                             data->last_end = data->pos_min;
4097                             data->last_start_min =
4098                                 data->pos_min - CHR_SVLEN(last_str);
4099                             data->last_start_max = is_inf
4100                                 ? I32_MAX
4101                                 : data->pos_min + data->pos_delta
4102                                 - CHR_SVLEN(last_str);
4103                         }
4104                         data->longest = &(data->longest_float);
4105                     }
4106                     SvREFCNT_dec(last_str);
4107                 }
4108                 if (data && (fl & SF_HAS_EVAL))
4109                     data->flags |= SF_HAS_EVAL;
4110               optimize_curly_tail:
4111                 if (OP(oscan) != CURLYX) {
4112                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4113                            && NEXT_OFF(next))
4114                         NEXT_OFF(oscan) += NEXT_OFF(next);
4115                 }
4116                 continue;
4117             default:                    /* REF, ANYOFV, and CLUMP only? */
4118                 if (flags & SCF_DO_SUBSTR) {
4119                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4120                     data->longest = &(data->longest_float);
4121                 }
4122                 is_inf = is_inf_internal = 1;
4123                 if (flags & SCF_DO_STCLASS_OR)
4124                     cl_anything(pRExC_state, data->start_class);
4125                 flags &= ~SCF_DO_STCLASS;
4126                 break;
4127             }
4128         }
4129         else if (OP(scan) == LNBREAK) {
4130             if (flags & SCF_DO_STCLASS) {
4131                 int value = 0;
4132                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4133                 if (flags & SCF_DO_STCLASS_AND) {
4134                     for (value = 0; value < 256; value++)
4135                         if (!is_VERTWS_cp(value))
4136                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4137                 }
4138                 else {
4139                     for (value = 0; value < 256; value++)
4140                         if (is_VERTWS_cp(value))
4141                             ANYOF_BITMAP_SET(data->start_class, value);
4142                 }
4143                 if (flags & SCF_DO_STCLASS_OR)
4144                     cl_and(data->start_class, and_withp);
4145                 flags &= ~SCF_DO_STCLASS;
4146             }
4147             min += 1;
4148             delta += 1;
4149             if (flags & SCF_DO_SUBSTR) {
4150                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4151                 data->pos_min += 1;
4152                 data->pos_delta += 1;
4153                 data->longest = &(data->longest_float);
4154             }
4155         }
4156         else if (REGNODE_SIMPLE(OP(scan))) {
4157             int value = 0;
4158
4159             if (flags & SCF_DO_SUBSTR) {
4160                 SCAN_COMMIT(pRExC_state,data,minlenp);
4161                 data->pos_min++;
4162             }
4163             min++;
4164             if (flags & SCF_DO_STCLASS) {
4165                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4166
4167                 /* Some of the logic below assumes that switching
4168                    locale on will only add false positives. */
4169                 switch (PL_regkind[OP(scan)]) {
4170                 case SANY:
4171                 default:
4172                   do_default:
4173                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4174                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4175                         cl_anything(pRExC_state, data->start_class);
4176                     break;
4177                 case REG_ANY:
4178                     if (OP(scan) == SANY)
4179                         goto do_default;
4180                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4181                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4182                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4183                         cl_anything(pRExC_state, data->start_class);
4184                     }
4185                     if (flags & SCF_DO_STCLASS_AND || !value)
4186                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4187                     break;
4188                 case ANYOF:
4189                     if (flags & SCF_DO_STCLASS_AND)
4190                         cl_and(data->start_class,
4191                                (struct regnode_charclass_class*)scan);
4192                     else
4193                         cl_or(pRExC_state, data->start_class,
4194                               (struct regnode_charclass_class*)scan);
4195                     break;
4196                 case ALNUM:
4197                     if (flags & SCF_DO_STCLASS_AND) {
4198                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4199                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4200                             if (OP(scan) == ALNUMU) {
4201                                 for (value = 0; value < 256; value++) {
4202                                     if (!isWORDCHAR_L1(value)) {
4203                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4204                                     }
4205                                 }
4206                             } else {
4207                                 for (value = 0; value < 256; value++) {
4208                                     if (!isALNUM(value)) {
4209                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4210                                     }
4211                                 }
4212                             }
4213                         }
4214                     }
4215                     else {
4216                         if (data->start_class->flags & ANYOF_LOCALE)
4217                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4218
4219                         /* Even if under locale, set the bits for non-locale
4220                          * in case it isn't a true locale-node.  This will
4221                          * create false positives if it truly is locale */
4222                         if (OP(scan) == ALNUMU) {
4223                             for (value = 0; value < 256; value++) {
4224                                 if (isWORDCHAR_L1(value)) {
4225                                     ANYOF_BITMAP_SET(data->start_class, value);
4226                                 }
4227                             }
4228                         } else {
4229                             for (value = 0; value < 256; value++) {
4230                                 if (isALNUM(value)) {
4231                                     ANYOF_BITMAP_SET(data->start_class, value);
4232                                 }
4233                             }
4234                         }
4235                     }
4236                     break;
4237                 case NALNUM:
4238                     if (flags & SCF_DO_STCLASS_AND) {
4239                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4240                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4241                             if (OP(scan) == NALNUMU) {
4242                                 for (value = 0; value < 256; value++) {
4243                                     if (isWORDCHAR_L1(value)) {
4244                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4245                                     }
4246                                 }
4247                             } else {
4248                                 for (value = 0; value < 256; value++) {
4249                                     if (isALNUM(value)) {
4250                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4251                                     }
4252                                 }
4253                             }
4254                         }
4255                     }
4256                     else {
4257                         if (data->start_class->flags & ANYOF_LOCALE)
4258                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4259
4260                         /* Even if under locale, set the bits for non-locale in
4261                          * case it isn't a true locale-node.  This will create
4262                          * false positives if it truly is locale */
4263                         if (OP(scan) == NALNUMU) {
4264                             for (value = 0; value < 256; value++) {
4265                                 if (! isWORDCHAR_L1(value)) {
4266                                     ANYOF_BITMAP_SET(data->start_class, value);
4267                                 }
4268                             }
4269                         } else {
4270                             for (value = 0; value < 256; value++) {
4271                                 if (! isALNUM(value)) {
4272                                     ANYOF_BITMAP_SET(data->start_class, value);
4273                                 }
4274                             }
4275                         }
4276                     }
4277                     break;
4278                 case SPACE:
4279                     if (flags & SCF_DO_STCLASS_AND) {
4280                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4281                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4282                             if (OP(scan) == SPACEU) {
4283                                 for (value = 0; value < 256; value++) {
4284                                     if (!isSPACE_L1(value)) {
4285                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4286                                     }
4287                                 }
4288                             } else {
4289                                 for (value = 0; value < 256; value++) {
4290                                     if (!isSPACE(value)) {
4291                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4292                                     }
4293                                 }
4294                             }
4295                         }
4296                     }
4297                     else {
4298                         if (data->start_class->flags & ANYOF_LOCALE) {
4299                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4300                         }
4301                         if (OP(scan) == SPACEU) {
4302                             for (value = 0; value < 256; value++) {
4303                                 if (isSPACE_L1(value)) {
4304                                     ANYOF_BITMAP_SET(data->start_class, value);
4305                                 }
4306                             }
4307                         } else {
4308                             for (value = 0; value < 256; value++) {
4309                                 if (isSPACE(value)) {
4310                                     ANYOF_BITMAP_SET(data->start_class, value);
4311                                 }
4312                             }
4313                         }
4314                     }
4315                     break;
4316                 case NSPACE:
4317                     if (flags & SCF_DO_STCLASS_AND) {
4318                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4319                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4320                             if (OP(scan) == NSPACEU) {
4321                                 for (value = 0; value < 256; value++) {
4322                                     if (isSPACE_L1(value)) {
4323                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4324                                     }
4325                                 }
4326                             } else {
4327                                 for (value = 0; value < 256; value++) {
4328                                     if (isSPACE(value)) {
4329                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4330                                     }
4331                                 }
4332                             }
4333                         }
4334                     }
4335                     else {
4336                         if (data->start_class->flags & ANYOF_LOCALE)
4337                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4338                         if (OP(scan) == NSPACEU) {
4339                             for (value = 0; value < 256; value++) {
4340                                 if (!isSPACE_L1(value)) {
4341                                     ANYOF_BITMAP_SET(data->start_class, value);
4342                                 }
4343                             }
4344                         }
4345                         else {
4346                             for (value = 0; value < 256; value++) {
4347                                 if (!isSPACE(value)) {
4348                                     ANYOF_BITMAP_SET(data->start_class, value);
4349                                 }
4350                             }
4351                         }
4352                     }
4353                     break;
4354                 case DIGIT:
4355                     if (flags & SCF_DO_STCLASS_AND) {
4356                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4357                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4358                             for (value = 0; value < 256; value++)
4359                                 if (!isDIGIT(value))
4360                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4361                         }
4362                     }
4363                     else {
4364                         if (data->start_class->flags & ANYOF_LOCALE)
4365                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4366                         for (value = 0; value < 256; value++)
4367                             if (isDIGIT(value))
4368                                 ANYOF_BITMAP_SET(data->start_class, value);
4369                     }
4370                     break;
4371                 case NDIGIT:
4372                     if (flags & SCF_DO_STCLASS_AND) {
4373                         if (!(data->start_class->flags & ANYOF_LOCALE))
4374                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4375                         for (value = 0; value < 256; value++)
4376                             if (isDIGIT(value))
4377                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4378                     }
4379                     else {
4380                         if (data->start_class->flags & ANYOF_LOCALE)
4381                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4382                         for (value = 0; value < 256; value++)
4383                             if (!isDIGIT(value))
4384                                 ANYOF_BITMAP_SET(data->start_class, value);
4385                     }
4386                     break;
4387                 CASE_SYNST_FNC(VERTWS);
4388                 CASE_SYNST_FNC(HORIZWS);
4389
4390                 }
4391                 if (flags & SCF_DO_STCLASS_OR)
4392                     cl_and(data->start_class, and_withp);
4393                 flags &= ~SCF_DO_STCLASS;
4394             }
4395         }
4396         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4397             data->flags |= (OP(scan) == MEOL
4398                             ? SF_BEFORE_MEOL
4399                             : SF_BEFORE_SEOL);
4400         }
4401         else if (  PL_regkind[OP(scan)] == BRANCHJ
4402                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4403                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4404                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4405             if ( OP(scan) == UNLESSM &&
4406                  scan->flags == 0 &&
4407                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4408                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4409             ) {
4410                 regnode *opt;
4411                 regnode *upto= regnext(scan);
4412                 DEBUG_PARSE_r({
4413                     SV * const mysv_val=sv_newmortal();
4414                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4415
4416                     /*DEBUG_PARSE_MSG("opfail");*/
4417                     regprop(RExC_rx, mysv_val, upto);
4418                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4419                                   SvPV_nolen_const(mysv_val),
4420                                   (IV)REG_NODE_NUM(upto),
4421                                   (IV)(upto - scan)
4422                     );
4423                 });
4424                 OP(scan) = OPFAIL;
4425                 NEXT_OFF(scan) = upto - scan;
4426                 for (opt= scan + 1; opt < upto ; opt++)
4427                     OP(opt) = OPTIMIZED;
4428                 scan= upto;
4429                 continue;
4430             }
4431             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4432                 || OP(scan) == UNLESSM )
4433             {
4434                 /* Negative Lookahead/lookbehind
4435                    In this case we can't do fixed string optimisation.
4436                 */
4437
4438                 I32 deltanext, minnext, fake = 0;
4439                 regnode *nscan;
4440                 struct regnode_charclass_class intrnl;
4441                 int f = 0;
4442
4443                 data_fake.flags = 0;
4444                 if (data) {
4445                     data_fake.whilem_c = data->whilem_c;
4446                     data_fake.last_closep = data->last_closep;
4447                 }
4448                 else
4449                     data_fake.last_closep = &fake;
4450                 data_fake.pos_delta = delta;
4451                 if ( flags & SCF_DO_STCLASS && !scan->flags
4452                      && OP(scan) == IFMATCH ) { /* Lookahead */
4453                     cl_init(pRExC_state, &intrnl);
4454                     data_fake.start_class = &intrnl;
4455                     f |= SCF_DO_STCLASS_AND;
4456                 }
4457                 if (flags & SCF_WHILEM_VISITED_POS)
4458                     f |= SCF_WHILEM_VISITED_POS;
4459                 next = regnext(scan);
4460                 nscan = NEXTOPER(NEXTOPER(scan));
4461                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4462                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4463                 if (scan->flags) {
4464                     if (deltanext) {
4465                         FAIL("Variable length lookbehind not implemented");
4466                     }
4467                     else if (minnext > (I32)U8_MAX) {
4468                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4469                     }
4470                     scan->flags = (U8)minnext;
4471                 }
4472                 if (data) {
4473                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4474                         pars++;
4475                     if (data_fake.flags & SF_HAS_EVAL)
4476                         data->flags |= SF_HAS_EVAL;
4477                     data->whilem_c = data_fake.whilem_c;
4478                 }
4479                 if (f & SCF_DO_STCLASS_AND) {
4480                     if (flags & SCF_DO_STCLASS_OR) {
4481                         /* OR before, AND after: ideally we would recurse with
4482                          * data_fake to get the AND applied by study of the
4483                          * remainder of the pattern, and then derecurse;
4484                          * *** HACK *** for now just treat as "no information".
4485                          * See [perl #56690].
4486                          */
4487                         cl_init(pRExC_state, data->start_class);
4488                     }  else {
4489                         /* AND before and after: combine and continue */
4490                         const int was = (data->start_class->flags & ANYOF_EOS);
4491
4492                         cl_and(data->start_class, &intrnl);
4493                         if (was)
4494                             data->start_class->flags |= ANYOF_EOS;
4495                     }
4496                 }
4497             }
4498 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4499             else {
4500                 /* Positive Lookahead/lookbehind
4501                    In this case we can do fixed string optimisation,
4502                    but we must be careful about it. Note in the case of
4503                    lookbehind the positions will be offset by the minimum
4504                    length of the pattern, something we won't know about
4505                    until after the recurse.
4506                 */
4507                 I32 deltanext, fake = 0;
4508                 regnode *nscan;
4509                 struct regnode_charclass_class intrnl;
4510                 int f = 0;
4511                 /* We use SAVEFREEPV so that when the full compile
4512                     is finished perl will clean up the allocated
4513                     minlens when it's all done. This way we don't
4514                     have to worry about freeing them when we know
4515                     they wont be used, which would be a pain.
4516                  */
4517                 I32 *minnextp;
4518                 Newx( minnextp, 1, I32 );
4519                 SAVEFREEPV(minnextp);
4520
4521                 if (data) {
4522                     StructCopy(data, &data_fake, scan_data_t);
4523                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4524                         f |= SCF_DO_SUBSTR;
4525                         if (scan->flags)
4526                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4527                         data_fake.last_found=newSVsv(data->last_found);
4528                     }
4529                 }
4530                 else
4531                     data_fake.last_closep = &fake;
4532                 data_fake.flags = 0;
4533                 data_fake.pos_delta = delta;
4534                 if (is_inf)
4535                     data_fake.flags |= SF_IS_INF;
4536                 if ( flags & SCF_DO_STCLASS && !scan->flags
4537                      && OP(scan) == IFMATCH ) { /* Lookahead */
4538                     cl_init(pRExC_state, &intrnl);
4539                     data_fake.start_class = &intrnl;
4540                     f |= SCF_DO_STCLASS_AND;
4541                 }
4542                 if (flags & SCF_WHILEM_VISITED_POS)
4543                     f |= SCF_WHILEM_VISITED_POS;
4544                 next = regnext(scan);
4545                 nscan = NEXTOPER(NEXTOPER(scan));
4546
4547                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4548                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4549                 if (scan->flags) {
4550                     if (deltanext) {
4551                         FAIL("Variable length lookbehind not implemented");
4552                     }
4553                     else if (*minnextp > (I32)U8_MAX) {
4554                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4555                     }
4556                     scan->flags = (U8)*minnextp;
4557                 }
4558
4559                 *minnextp += min;
4560
4561                 if (f & SCF_DO_STCLASS_AND) {
4562                     const int was = (data->start_class->flags & ANYOF_EOS);
4563
4564                     cl_and(data->start_class, &intrnl);
4565                     if (was)
4566                         data->start_class->flags |= ANYOF_EOS;
4567                 }
4568                 if (data) {
4569                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4570                         pars++;
4571                     if (data_fake.flags & SF_HAS_EVAL)
4572                         data->flags |= SF_HAS_EVAL;
4573                     data->whilem_c = data_fake.whilem_c;
4574                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4575                         if (RExC_rx->minlen<*minnextp)
4576                             RExC_rx->minlen=*minnextp;
4577                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4578                         SvREFCNT_dec(data_fake.last_found);
4579
4580                         if ( data_fake.minlen_fixed != minlenp )
4581                         {
4582                             data->offset_fixed= data_fake.offset_fixed;
4583                             data->minlen_fixed= data_fake.minlen_fixed;
4584                             data->lookbehind_fixed+= scan->flags;
4585                         }
4586                         if ( data_fake.minlen_float != minlenp )
4587                         {
4588                             data->minlen_float= data_fake.minlen_float;
4589                             data->offset_float_min=data_fake.offset_float_min;
4590                             data->offset_float_max=data_fake.offset_float_max;
4591                             data->lookbehind_float+= scan->flags;
4592                         }
4593                     }
4594                 }
4595             }
4596 #endif
4597         }
4598         else if (OP(scan) == OPEN) {
4599             if (stopparen != (I32)ARG(scan))
4600                 pars++;
4601         }
4602         else if (OP(scan) == CLOSE) {
4603             if (stopparen == (I32)ARG(scan)) {
4604                 break;
4605             }
4606             if ((I32)ARG(scan) == is_par) {
4607                 next = regnext(scan);
4608
4609                 if ( next && (OP(next) != WHILEM) && next < last)
4610                     is_par = 0;         /* Disable optimization */
4611             }
4612             if (data)
4613                 *(data->last_closep) = ARG(scan);
4614         }
4615         else if (OP(scan) == EVAL) {
4616                 if (data)
4617                     data->flags |= SF_HAS_EVAL;
4618         }
4619         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4620             if (flags & SCF_DO_SUBSTR) {
4621                 SCAN_COMMIT(pRExC_state,data,minlenp);
4622                 flags &= ~SCF_DO_SUBSTR;
4623             }
4624             if (data && OP(scan)==ACCEPT) {
4625                 data->flags |= SCF_SEEN_ACCEPT;
4626                 if (stopmin > min)
4627                     stopmin = min;
4628             }
4629         }
4630         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4631         {
4632                 if (flags & SCF_DO_SUBSTR) {
4633                     SCAN_COMMIT(pRExC_state,data,minlenp);
4634                     data->longest = &(data->longest_float);
4635                 }
4636                 is_inf = is_inf_internal = 1;
4637                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4638                     cl_anything(pRExC_state, data->start_class);
4639                 flags &= ~SCF_DO_STCLASS;
4640         }
4641         else if (OP(scan) == GPOS) {
4642             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4643                 !(delta || is_inf || (data && data->pos_delta)))
4644             {
4645                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4646                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4647                 if (RExC_rx->gofs < (U32)min)
4648                     RExC_rx->gofs = min;
4649             } else {
4650                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4651                 RExC_rx->gofs = 0;
4652             }
4653         }
4654 #ifdef TRIE_STUDY_OPT
4655 #ifdef FULL_TRIE_STUDY
4656         else if (PL_regkind[OP(scan)] == TRIE) {
4657             /* NOTE - There is similar code to this block above for handling
4658                BRANCH nodes on the initial study.  If you change stuff here
4659                check there too. */
4660             regnode *trie_node= scan;
4661             regnode *tail= regnext(scan);
4662             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4663             I32 max1 = 0, min1 = I32_MAX;
4664             struct regnode_charclass_class accum;
4665
4666             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4667                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4668             if (flags & SCF_DO_STCLASS)
4669                 cl_init_zero(pRExC_state, &accum);
4670
4671             if (!trie->jump) {
4672                 min1= trie->minlen;
4673                 max1= trie->maxlen;
4674             } else {
4675                 const regnode *nextbranch= NULL;
4676                 U32 word;
4677
4678                 for ( word=1 ; word <= trie->wordcount ; word++)
4679                 {
4680                     I32 deltanext=0, minnext=0, f = 0, fake;
4681                     struct regnode_charclass_class this_class;
4682
4683                     data_fake.flags = 0;
4684                     if (data) {
4685                         data_fake.whilem_c = data->whilem_c;
4686                         data_fake.last_closep = data->last_closep;
4687                     }
4688                     else
4689                         data_fake.last_closep = &fake;
4690                     data_fake.pos_delta = delta;
4691                     if (flags & SCF_DO_STCLASS) {
4692                         cl_init(pRExC_state, &this_class);
4693                         data_fake.start_class = &this_class;
4694                         f = SCF_DO_STCLASS_AND;
4695                     }
4696                     if (flags & SCF_WHILEM_VISITED_POS)
4697                         f |= SCF_WHILEM_VISITED_POS;
4698
4699                     if (trie->jump[word]) {
4700                         if (!nextbranch)
4701                             nextbranch = trie_node + trie->jump[0];
4702                         scan= trie_node + trie->jump[word];
4703                         /* We go from the jump point to the branch that follows
4704                            it. Note this means we need the vestigal unused branches
4705                            even though they arent otherwise used.
4706                          */
4707                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4708                             &deltanext, (regnode *)nextbranch, &data_fake,
4709                             stopparen, recursed, NULL, f,depth+1);
4710                     }
4711                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4712                         nextbranch= regnext((regnode*)nextbranch);
4713
4714                     if (min1 > (I32)(minnext + trie->minlen))
4715                         min1 = minnext + trie->minlen;
4716                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4717                         max1 = minnext + deltanext + trie->maxlen;
4718                     if (deltanext == I32_MAX)
4719                         is_inf = is_inf_internal = 1;
4720
4721                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4722                         pars++;
4723                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4724                         if ( stopmin > min + min1)
4725                             stopmin = min + min1;
4726                         flags &= ~SCF_DO_SUBSTR;
4727                         if (data)
4728                             data->flags |= SCF_SEEN_ACCEPT;
4729                     }
4730                     if (data) {
4731                         if (data_fake.flags & SF_HAS_EVAL)
4732                             data->flags |= SF_HAS_EVAL;
4733                         data->whilem_c = data_fake.whilem_c;
4734                     }
4735                     if (flags & SCF_DO_STCLASS)
4736                         cl_or(pRExC_state, &accum, &this_class);
4737                 }
4738             }
4739             if (flags & SCF_DO_SUBSTR) {
4740                 data->pos_min += min1;
4741                 data->pos_delta += max1 - min1;
4742                 if (max1 != min1 || is_inf)
4743                     data->longest = &(data->longest_float);
4744             }
4745             min += min1;
4746             delta += max1 - min1;
4747             if (flags & SCF_DO_STCLASS_OR) {
4748                 cl_or(pRExC_state, data->start_class, &accum);
4749                 if (min1) {
4750                     cl_and(data->start_class, and_withp);
4751                     flags &= ~SCF_DO_STCLASS;
4752                 }
4753             }
4754             else if (flags & SCF_DO_STCLASS_AND) {
4755                 if (min1) {
4756                     cl_and(data->start_class, &accum);
4757                     flags &= ~SCF_DO_STCLASS;
4758                 }
4759                 else {
4760                     /* Switch to OR mode: cache the old value of
4761                      * data->start_class */
4762                     INIT_AND_WITHP;
4763                     StructCopy(data->start_class, and_withp,
4764                                struct regnode_charclass_class);
4765                     flags &= ~SCF_DO_STCLASS_AND;
4766                     StructCopy(&accum, data->start_class,
4767                                struct regnode_charclass_class);
4768                     flags |= SCF_DO_STCLASS_OR;
4769                     data->start_class->flags |= ANYOF_EOS;
4770                 }
4771             }
4772             scan= tail;
4773             continue;
4774         }
4775 #else
4776         else if (PL_regkind[OP(scan)] == TRIE) {
4777             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4778             U8*bang=NULL;
4779
4780             min += trie->minlen;
4781             delta += (trie->maxlen - trie->minlen);
4782             flags &= ~SCF_DO_STCLASS; /* xxx */
4783             if (flags & SCF_DO_SUBSTR) {
4784                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4785                 data->pos_min += trie->minlen;
4786                 data->pos_delta += (trie->maxlen - trie->minlen);
4787                 if (trie->maxlen != trie->minlen)
4788                     data->longest = &(data->longest_float);
4789             }
4790             if (trie->jump) /* no more substrings -- for now /grr*/
4791                 flags &= ~SCF_DO_SUBSTR;
4792         }
4793 #endif /* old or new */
4794 #endif /* TRIE_STUDY_OPT */
4795
4796         /* Else: zero-length, ignore. */
4797         scan = regnext(scan);
4798     }
4799     if (frame) {
4800         last = frame->last;
4801         scan = frame->next;
4802         stopparen = frame->stop;
4803         frame = frame->prev;
4804         goto fake_study_recurse;
4805     }
4806
4807   finish:
4808     assert(!frame);
4809     DEBUG_STUDYDATA("pre-fin:",data,depth);
4810
4811     *scanp = scan;
4812     *deltap = is_inf_internal ? I32_MAX : delta;
4813     if (flags & SCF_DO_SUBSTR && is_inf)
4814         data->pos_delta = I32_MAX - data->pos_min;
4815     if (is_par > (I32)U8_MAX)
4816         is_par = 0;
4817     if (is_par && pars==1 && data) {
4818         data->flags |= SF_IN_PAR;
4819         data->flags &= ~SF_HAS_PAR;
4820     }
4821     else if (pars && data) {
4822         data->flags |= SF_HAS_PAR;
4823         data->flags &= ~SF_IN_PAR;
4824     }
4825     if (flags & SCF_DO_STCLASS_OR)
4826         cl_and(data->start_class, and_withp);
4827     if (flags & SCF_TRIE_RESTUDY)
4828         data->flags |=  SCF_TRIE_RESTUDY;
4829
4830     DEBUG_STUDYDATA("post-fin:",data,depth);
4831
4832     return min < stopmin ? min : stopmin;
4833 }
4834
4835 STATIC U32
4836 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4837 {
4838     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4839
4840     PERL_ARGS_ASSERT_ADD_DATA;
4841
4842     Renewc(RExC_rxi->data,
4843            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4844            char, struct reg_data);
4845     if(count)
4846         Renew(RExC_rxi->data->what, count + n, U8);
4847     else
4848         Newx(RExC_rxi->data->what, n, U8);
4849     RExC_rxi->data->count = count + n;
4850     Copy(s, RExC_rxi->data->what + count, n, U8);
4851     return count;
4852 }
4853
4854 /*XXX: todo make this not included in a non debugging perl */
4855 #ifndef PERL_IN_XSUB_RE
4856 void
4857 Perl_reginitcolors(pTHX)
4858 {
4859     dVAR;
4860     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4861     if (s) {
4862         char *t = savepv(s);
4863         int i = 0;
4864         PL_colors[0] = t;
4865         while (++i < 6) {
4866             t = strchr(t, '\t');
4867             if (t) {
4868                 *t = '\0';
4869                 PL_colors[i] = ++t;
4870             }
4871             else
4872                 PL_colors[i] = t = (char *)"";
4873         }
4874     } else {
4875         int i = 0;
4876         while (i < 6)
4877             PL_colors[i++] = (char *)"";
4878     }
4879     PL_colorset = 1;
4880 }
4881 #endif
4882
4883
4884 #ifdef TRIE_STUDY_OPT
4885 #define CHECK_RESTUDY_GOTO                                  \
4886         if (                                                \
4887               (data.flags & SCF_TRIE_RESTUDY)               \
4888               && ! restudied++                              \
4889         )     goto reStudy
4890 #else
4891 #define CHECK_RESTUDY_GOTO
4892 #endif
4893
4894 /*
4895  * pregcomp - compile a regular expression into internal code
4896  *
4897  * Decides which engine's compiler to call based on the hint currently in
4898  * scope
4899  */
4900
4901 #ifndef PERL_IN_XSUB_RE
4902 #define RE_ENGINE_PTR &PL_core_reg_engine
4903 #else
4904 extern const struct regexp_engine my_reg_engine;
4905 #define RE_ENGINE_PTR &my_reg_engine
4906 #endif
4907
4908 #ifndef PERL_IN_XSUB_RE
4909
4910 /* return the currently in-scope regex engine (or NULL if none)  */
4911
4912 regexp_engine *
4913 Perl_current_re_engine(pTHX)
4914 {
4915     dVAR;
4916
4917     if (IN_PERL_COMPILETIME) {
4918         HV * const table = GvHV(PL_hintgv);
4919         SV **ptr;
4920
4921         if (!table)
4922             return NULL;
4923         ptr = hv_fetchs(table, "regcomp", FALSE);
4924         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4925             return NULL;
4926         return INT2PTR(regexp_engine*,SvIV(*ptr));
4927     }
4928     else {
4929         SV *ptr;
4930         if (!PL_curcop->cop_hints_hash)
4931             return NULL;
4932         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4933         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4934             return NULL;
4935         return INT2PTR(regexp_engine*,SvIV(ptr));
4936     }
4937 }
4938
4939
4940 REGEXP *
4941 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4942 {
4943     dVAR;
4944     regexp_engine *eng = current_re_engine();
4945
4946     PERL_ARGS_ASSERT_PREGCOMP;
4947
4948     /* Dispatch a request to compile a regexp to correct regexp engine. */
4949     if (eng) {
4950         GET_RE_DEBUG_FLAGS_DECL;
4951         DEBUG_COMPILE_r({
4952             PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4953                             PTR2UV(eng));
4954         });
4955         return CALLREGCOMP_ENG(eng, pattern, flags);
4956     }
4957     return Perl_re_compile(aTHX_ pattern, flags);
4958 }
4959 #endif
4960
4961 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4962  * pattern rather than a list of OPs */
4963
4964 REGEXP *
4965 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4966 {
4967     SV *pat = pattern; /* defeat constness! */
4968     PERL_ARGS_ASSERT_RE_COMPILE;
4969     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4970                                     NULL, NULL, NULL, orig_pm_flags);
4971 }
4972
4973
4974 /*
4975  * Perl_re_op_compile - the perl internal RE engine's function to compile a
4976  * regular expression into internal code.
4977  * The pattern may be passed either as:
4978  *    a list of SVs (patternp plus pat_count)
4979  *    a list of OPs (expr)
4980  * If both are passed, the SV list is used, but the OP list indicates
4981  * which SVs are actually pre-compiled code blocks
4982  *
4983  * The SVs in the list have magic and qr overloading applied to them (and
4984  * the list may be modified in-place with replacement SVs in the latter
4985  * case).
4986  *
4987  * If the pattern hasn't changed from old_re, then old_re will be
4988  * returned.
4989  *
4990  * If eng is set (and not equal to PL_core_reg_engine), then just do the
4991  * initial concatenation of arguments, then pass on to the external
4992  * engine.
4993  *
4994  * If is_bare_re is not null, set it to a boolean indicating whether the
4995  * arg list reduced (after overloading) to a single bare regex which has
4996  * been returned (i.e. /$qr/).
4997  *
4998  * We can't allocate space until we know how big the compiled form will be,
4999  * but we can't compile it (and thus know how big it is) until we've got a
5000  * place to put the code.  So we cheat:  we compile it twice, once with code
5001  * generation turned off and size counting turned on, and once "for real".
5002  * This also means that we don't allocate space until we are sure that the
5003  * thing really will compile successfully, and we never have to move the
5004  * code and thus invalidate pointers into it.  (Note that it has to be in
5005  * one piece because free() must be able to free it all.) [NB: not true in perl]
5006  *
5007  * Beware that the optimization-preparation code in here knows about some
5008  * of the structure of the compiled regexp.  [I'll say.]
5009  */
5010
5011 REGEXP *
5012 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5013                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5014                      int *is_bare_re, U32 orig_pm_flags)
5015 {
5016     dVAR;
5017     REGEXP *rx;
5018     struct regexp *r;
5019     register regexp_internal *ri;
5020     STRLEN plen;
5021     char  * VOL exp;
5022     char* xend;
5023     regnode *scan;
5024     I32 flags;
5025     I32 minlen = 0;
5026     U32 pm_flags;
5027     SV * VOL pat;
5028
5029     /* these are all flags - maybe they should be turned
5030      * into a single int with different bit masks */
5031     I32 sawlookahead = 0;
5032     I32 sawplus = 0;
5033     I32 sawopen = 0;
5034     bool used_setjump = FALSE;
5035     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
5036     bool code_is_utf8 = 0;
5037
5038     U8 jump_ret = 0;
5039     dJMPENV;
5040     scan_data_t data;
5041     RExC_state_t RExC_state;
5042     RExC_state_t * const pRExC_state = &RExC_state;
5043 #ifdef TRIE_STUDY_OPT
5044     int restudied;
5045     RExC_state_t copyRExC_state;
5046 #endif
5047     GET_RE_DEBUG_FLAGS_DECL;
5048
5049     DEBUG_r(if (!PL_colorset) reginitcolors());
5050
5051 #ifndef PERL_IN_XSUB_RE
5052     /* Initialize these here instead of as-needed, as is quick and avoids
5053      * having to test them each time otherwise */
5054     if (! PL_AboveLatin1) {
5055         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5056         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5057         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5058
5059         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5060         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5061
5062         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5063         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5064
5065         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5066         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5067
5068         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5069
5070         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5071         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5072
5073         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5074
5075         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5076         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5077
5078         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5079         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5080
5081         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5082         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5083
5084         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5085         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5086
5087         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5088         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5089
5090         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5091         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5092
5093         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5094         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5095
5096         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5097         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5098
5099         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5100
5101         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5102         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5103
5104         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5105         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5106     }
5107 #endif
5108
5109     pRExC_state->code_blocks = NULL;
5110     pRExC_state->num_code_blocks = 0;
5111
5112     if (is_bare_re)
5113         *is_bare_re = 0;
5114
5115     if (expr && (expr->op_type == OP_LIST ||
5116                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5117
5118         /* is the source UTF8, and how many code blocks are there? */
5119         OP *o;
5120         int ncode = 0;
5121
5122         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5123             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5124                 code_is_utf8 = 1;
5125             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5126                 /* count of DO blocks */
5127                 ncode++;
5128         }
5129         if (ncode) {
5130             pRExC_state->num_code_blocks = ncode;
5131             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5132         }
5133     }
5134
5135     if (pat_count) {
5136         /* handle a list of SVs */
5137
5138         SV **svp;
5139
5140         /* apply magic and RE overloading to each arg */
5141         for (svp = patternp; svp < patternp + pat_count; svp++) {
5142             SV *rx = *svp;
5143             SvGETMAGIC(rx);
5144             if (SvROK(rx) && SvAMAGIC(rx)) {
5145                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5146                 if (sv) {
5147                     if (SvROK(sv))
5148                         sv = SvRV(sv);
5149                     if (SvTYPE(sv) != SVt_REGEXP)
5150                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5151                     *svp = sv;
5152                 }
5153             }
5154         }
5155
5156         if (pat_count > 1) {
5157             /* concat multiple args and find any code block indexes */
5158
5159             OP *o = NULL;
5160             int n = 0;
5161             bool utf8 = 0;
5162
5163             if (pRExC_state->num_code_blocks) {
5164                 o = cLISTOPx(expr)->op_first;
5165                 assert(o->op_type == OP_PUSHMARK);
5166                 o = o->op_sibling;
5167             }
5168
5169             pat = newSVpvn("", 0);
5170             SAVEFREESV(pat);
5171
5172             /* determine if the pattern is going to be utf8 (needed
5173              * in advance to align code block indices correctly).
5174              * XXX This could fail to be detected for an arg with
5175              * overloading but not concat overloading; but the main effect
5176              * in this obscure case is to need a 'use re eval' for a
5177              * literal code block */
5178             for (svp = patternp; svp < patternp + pat_count; svp++) {
5179                 if (SvUTF8(*svp))
5180                     utf8 = 1;
5181             }
5182             if (utf8)
5183                 SvUTF8_on(pat);
5184
5185             for (svp = patternp; svp < patternp + pat_count; svp++) {
5186                 SV *sv, *msv = *svp;
5187                 SV *rx;
5188                 bool code = 0;
5189                 if (o) {
5190                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5191                         assert(n < pRExC_state->num_code_blocks);
5192                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5193                         pRExC_state->code_blocks[n].block = o;
5194                         pRExC_state->code_blocks[n].src_regex = NULL;
5195                         n++;
5196                         code = 1;
5197                         o = o->op_sibling; /* skip CONST */
5198                         assert(o);
5199                     }
5200                     o = o->op_sibling;;
5201                 }
5202
5203                 /* extract any code blocks within any embedded qr//'s */
5204                 rx = msv;
5205                 if (SvROK(rx))
5206                     rx = SvRV(rx);
5207                 if (SvTYPE(rx) == SVt_REGEXP
5208                     && RX_ENGINE((REGEXP*)rx) == RE_ENGINE_PTR)
5209                 {
5210
5211                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5212                     if (ri->num_code_blocks) {
5213                         int i;
5214                         Renew(pRExC_state->code_blocks,
5215                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5216                             struct reg_code_block);
5217                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5218                         for (i=0; i < ri->num_code_blocks; i++) {
5219                             struct reg_code_block *src, *dst;
5220                             STRLEN offset =  SvCUR(pat)
5221                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5222                             assert(n < pRExC_state->num_code_blocks);
5223                             src = &ri->code_blocks[i];
5224                             dst = &pRExC_state->code_blocks[n];
5225                             dst->start      = src->start + offset;
5226                             dst->end        = src->end   + offset;
5227                             dst->block      = src->block;
5228                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5229                                                     src->src_regex
5230                                                         ? src->src_regex
5231                                                         : (REGEXP*)rx);
5232                             n++;
5233                         }
5234                     }
5235                 }
5236
5237                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5238                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5239                 {
5240                     sv_setsv(pat, sv);
5241                     /* overloading involved: all bets are off over literal
5242                      * code. Pretend we haven't seen it */
5243                     pRExC_state->num_code_blocks -= n;
5244                     n = 0;
5245
5246                 }
5247                 else {
5248                     sv_catsv_nomg(pat, msv);
5249                     if (code)
5250                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5251                 }
5252             }
5253             SvSETMAGIC(pat);
5254         }
5255         else
5256             pat = *patternp;
5257
5258         /* handle bare regex: foo =~ $re */
5259         {
5260             SV *re = pat;
5261             if (SvROK(re))
5262                 re = SvRV(re);
5263             if (SvTYPE(re) == SVt_REGEXP) {
5264                 if (is_bare_re)
5265                     *is_bare_re = 1;
5266                 SvREFCNT_inc(re);
5267                 Safefree(pRExC_state->code_blocks);
5268                 return (REGEXP*)re;
5269             }
5270         }
5271     }
5272     else {
5273         /* not a list of SVs, so must be a list of OPs */
5274         assert(expr);
5275         if (expr->op_type == OP_LIST) {
5276             int i = -1;
5277             bool is_code = 0;
5278             OP *o;
5279
5280             pat = newSVpvn("", 0);
5281             SAVEFREESV(pat);
5282             if (code_is_utf8)
5283                 SvUTF8_on(pat);
5284
5285             /* given a list of CONSTs and DO blocks in expr, append all
5286              * the CONSTs to pat, and record the start and end of each
5287              * code block in code_blocks[] (each DO{} op is followed by an
5288              * OP_CONST containing the corresponding literal '(?{...})
5289              * text)
5290              */
5291             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5292                 if (o->op_type == OP_CONST) {
5293                     sv_catsv(pat, cSVOPo_sv);
5294                     if (is_code) {
5295                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5296                         is_code = 0;
5297                     }
5298                 }
5299                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5300                     assert(i+1 < pRExC_state->num_code_blocks);
5301                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5302                     pRExC_state->code_blocks[i].block = o;
5303                     pRExC_state->code_blocks[i].src_regex = NULL;
5304                     is_code = 1;
5305                 }
5306             }
5307         }
5308         else {
5309             assert(expr->op_type == OP_CONST);
5310             pat = cSVOPx_sv(expr);
5311         }
5312     }
5313
5314     exp = SvPV_nomg(pat, plen);
5315
5316     if (eng && eng != RE_ENGINE_PTR) {
5317         if ((SvUTF8(pat) && IN_BYTES)
5318                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5319         {
5320             /* make a temporary copy; either to convert to bytes,
5321              * or to avoid repeating get-magic / overloaded stringify */
5322             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5323                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5324         }
5325         Safefree(pRExC_state->code_blocks);
5326         return CALLREGCOMP_ENG(eng, pat, orig_pm_flags);
5327     }
5328
5329     if (   old_re
5330         && !!RX_UTF8(old_re) == !!SvUTF8(pat)
5331         && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
5332         && memEQ(RX_PRECOMP(old_re), exp, plen))
5333     {
5334         ReREFCNT_inc(old_re);
5335         Safefree(pRExC_state->code_blocks);
5336         return old_re;
5337     }
5338
5339     /* ignore the utf8ness if the pattern is 0 length */
5340     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5341     RExC_uni_semantics = 0;
5342     RExC_contains_locale = 0;
5343
5344     /****************** LONG JUMP TARGET HERE***********************/
5345     /* Longjmp back to here if have to switch in midstream to utf8 */
5346     if (! RExC_orig_utf8) {
5347         JMPENV_PUSH(jump_ret);
5348         used_setjump = TRUE;
5349     }
5350
5351     if (jump_ret == 0) {    /* First time through */
5352         xend = exp + plen;
5353
5354         DEBUG_COMPILE_r({
5355             SV *dsv= sv_newmortal();
5356             RE_PV_QUOTED_DECL(s, RExC_utf8,
5357                 dsv, exp, plen, 60);
5358             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5359                            PL_colors[4],PL_colors[5],s);
5360         });
5361     }
5362     else {  /* longjumped back */
5363         U8 *src, *dst;
5364         int n=0;
5365         STRLEN s = 0, d = 0;
5366         bool do_end = 0;
5367
5368         /* If the cause for the longjmp was other than changing to utf8, pop
5369          * our own setjmp, and longjmp to the correct handler */
5370         if (jump_ret != UTF8_LONGJMP) {
5371             JMPENV_POP;
5372             JMPENV_JUMP(jump_ret);
5373         }
5374
5375         GET_RE_DEBUG_FLAGS;
5376
5377         /* It's possible to write a regexp in ascii that represents Unicode
5378         codepoints outside of the byte range, such as via \x{100}. If we
5379         detect such a sequence we have to convert the entire pattern to utf8
5380         and then recompile, as our sizing calculation will have been based
5381         on 1 byte == 1 character, but we will need to use utf8 to encode
5382         at least some part of the pattern, and therefore must convert the whole
5383         thing.
5384         -- dmq */
5385         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5386             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5387
5388         /* upgrade pattern to UTF8, and if there are code blocks,
5389          * recalculate the indices.
5390          * This is essentially an unrolled Perl_bytes_to_utf8() */
5391
5392         src = (U8*)SvPV_nomg(pat, plen);
5393         Newx(dst, plen * 2 + 1, U8);
5394
5395         while (s < plen) {
5396             const UV uv = NATIVE_TO_ASCII(src[s]);
5397             if (UNI_IS_INVARIANT(uv))
5398                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5399             else {
5400                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5401                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5402             }
5403             if (n < pRExC_state->num_code_blocks) {
5404                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5405                     pRExC_state->code_blocks[n].start = d;
5406                     assert(dst[d] == '(');
5407                     do_end = 1;
5408                 }
5409                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5410                     pRExC_state->code_blocks[n].end = d;
5411                     assert(dst[d] == ')');
5412                     do_end = 0;
5413                     n++;
5414                 }
5415             }
5416             s++;
5417             d++;
5418         }
5419         dst[d] = '\0';
5420         plen = d;
5421         exp = (char*) dst;
5422         xend = exp + plen;
5423         SAVEFREEPV(exp);
5424         RExC_orig_utf8 = RExC_utf8 = 1;
5425
5426         /* we've changed the string; check again whether it matches
5427          * the old pattern, to avoid recompilation */
5428         if (   old_re
5429             && RX_UTF8(old_re)
5430             && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
5431             && memEQ(RX_PRECOMP(old_re), exp, plen))
5432         {
5433             ReREFCNT_inc(old_re);
5434             if (used_setjump) {
5435                 JMPENV_POP;
5436             }
5437             Safefree(pRExC_state->code_blocks);
5438             return old_re;
5439         }
5440
5441     }
5442
5443 #ifdef TRIE_STUDY_OPT
5444     restudied = 0;
5445 #endif
5446
5447     pm_flags = orig_pm_flags;
5448
5449     if (initial_charset == REGEX_LOCALE_CHARSET) {
5450         RExC_contains_locale = 1;
5451     }
5452     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5453
5454         /* Set to use unicode semantics if the pattern is in utf8 and has the
5455          * 'depends' charset specified, as it means unicode when utf8  */
5456         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5457     }
5458
5459     RExC_precomp = exp;
5460     RExC_flags = pm_flags;
5461     RExC_sawback = 0;
5462
5463     RExC_seen = 0;
5464     RExC_in_lookbehind = 0;
5465     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5466     RExC_seen_evals = 0;
5467     RExC_extralen = 0;
5468     RExC_override_recoding = 0;
5469
5470     /* First pass: determine size, legality. */
5471     RExC_parse = exp;
5472     RExC_start = exp;
5473     RExC_end = xend;
5474     RExC_naughty = 0;
5475     RExC_npar = 1;
5476     RExC_nestroot = 0;
5477     RExC_size = 0L;
5478     RExC_emit = &PL_regdummy;
5479     RExC_whilem_seen = 0;
5480     RExC_open_parens = NULL;
5481     RExC_close_parens = NULL;
5482     RExC_opend = NULL;
5483     RExC_paren_names = NULL;
5484 #ifdef DEBUGGING
5485     RExC_paren_name_list = NULL;
5486 #endif
5487     RExC_recurse = NULL;
5488     RExC_recurse_count = 0;
5489     pRExC_state->code_index = 0;
5490
5491 #if 0 /* REGC() is (currently) a NOP at the first pass.
5492        * Clever compilers notice this and complain. --jhi */
5493     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5494 #endif
5495     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
5496     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5497         RExC_precomp = NULL;
5498         Safefree(pRExC_state->code_blocks);
5499         return(NULL);
5500     }
5501
5502     /* Here, finished first pass.  Get rid of any added setjmp */
5503     if (used_setjump) {
5504         JMPENV_POP;
5505     }
5506
5507     DEBUG_PARSE_r({
5508         PerlIO_printf(Perl_debug_log,
5509             "Required size %"IVdf" nodes\n"
5510             "Starting second pass (creation)\n",
5511             (IV)RExC_size);
5512         RExC_lastnum=0;
5513         RExC_lastparse=NULL;
5514     });
5515
5516     /* The first pass could have found things that force Unicode semantics */
5517     if ((RExC_utf8 || RExC_uni_semantics)
5518          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5519     {
5520         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5521     }
5522
5523     /* Small enough for pointer-storage convention?
5524        If extralen==0, this means that we will not need long jumps. */
5525     if (RExC_size >= 0x10000L && RExC_extralen)
5526         RExC_size += RExC_extralen;
5527     else
5528         RExC_extralen = 0;
5529     if (RExC_whilem_seen > 15)
5530         RExC_whilem_seen = 15;
5531
5532     /* Allocate space and zero-initialize. Note, the two step process
5533        of zeroing when in debug mode, thus anything assigned has to
5534        happen after that */
5535     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5536     r = (struct regexp*)SvANY(rx);
5537     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5538          char, regexp_internal);
5539     if ( r == NULL || ri == NULL )
5540         FAIL("Regexp out of space");
5541 #ifdef DEBUGGING
5542     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5543     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5544 #else
5545     /* bulk initialize base fields with 0. */
5546     Zero(ri, sizeof(regexp_internal), char);
5547 #endif
5548
5549     /* non-zero initialization begins here */
5550     RXi_SET( r, ri );
5551     r->engine= RE_ENGINE_PTR;
5552     r->extflags = pm_flags;
5553     if (orig_pm_flags & PMf_HAS_CV) {
5554         ri->code_blocks = pRExC_state->code_blocks;
5555         ri->num_code_blocks = pRExC_state->num_code_blocks;
5556     }
5557     else
5558         SAVEFREEPV(pRExC_state->code_blocks);
5559
5560     {
5561         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5562         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5563
5564         /* The caret is output if there are any defaults: if not all the STD
5565          * flags are set, or if no character set specifier is needed */
5566         bool has_default =
5567                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5568                     || ! has_charset);
5569         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5570         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5571                             >> RXf_PMf_STD_PMMOD_SHIFT);
5572         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5573         char *p;
5574         /* Allocate for the worst case, which is all the std flags are turned
5575          * on.  If more precision is desired, we could do a population count of
5576          * the flags set.  This could be done with a small lookup table, or by
5577          * shifting, masking and adding, or even, when available, assembly
5578          * language for a machine-language population count.
5579          * We never output a minus, as all those are defaults, so are
5580          * covered by the caret */
5581         const STRLEN wraplen = plen + has_p + has_runon
5582             + has_default       /* If needs a caret */
5583
5584                 /* If needs a character set specifier */
5585             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5586             + (sizeof(STD_PAT_MODS) - 1)
5587             + (sizeof("(?:)") - 1);
5588
5589         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5590         SvPOK_on(rx);
5591         if (RExC_utf8)
5592             SvFLAGS(rx) |= SVf_UTF8;
5593         *p++='('; *p++='?';
5594
5595         /* If a default, cover it using the caret */
5596         if (has_default) {
5597             *p++= DEFAULT_PAT_MOD;
5598         }
5599         if (has_charset) {
5600             STRLEN len;
5601             const char* const name = get_regex_charset_name(r->extflags, &len);
5602             Copy(name, p, len, char);
5603             p += len;
5604         }
5605         if (has_p)
5606             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5607         {
5608             char ch;
5609             while((ch = *fptr++)) {
5610                 if(reganch & 1)
5611                     *p++ = ch;
5612                 reganch >>= 1;
5613             }
5614         }
5615
5616         *p++ = ':';
5617         Copy(RExC_precomp, p, plen, char);
5618         assert ((RX_WRAPPED(rx) - p) < 16);
5619         r->pre_prefix = p - RX_WRAPPED(rx);
5620         p += plen;
5621         if (has_runon)
5622             *p++ = '\n';
5623         *p++ = ')';
5624         *p = 0;
5625         SvCUR_set(rx, p - SvPVX_const(rx));
5626     }
5627
5628     r->intflags = 0;
5629     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5630
5631     if (RExC_seen & REG_SEEN_RECURSE) {
5632         Newxz(RExC_open_parens, RExC_npar,regnode *);
5633         SAVEFREEPV(RExC_open_parens);
5634         Newxz(RExC_close_parens,RExC_npar,regnode *);
5635         SAVEFREEPV(RExC_close_parens);
5636     }
5637
5638     /* Useful during FAIL. */
5639 #ifdef RE_TRACK_PATTERN_OFFSETS
5640     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5641     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5642                           "%s %"UVuf" bytes for offset annotations.\n",
5643                           ri->u.offsets ? "Got" : "Couldn't get",
5644                           (UV)((2*RExC_size+1) * sizeof(U32))));
5645 #endif
5646     SetProgLen(ri,RExC_size);
5647     RExC_rx_sv = rx;
5648     RExC_rx = r;
5649     RExC_rxi = ri;
5650
5651     /* Second pass: emit code. */
5652     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5653     RExC_parse = exp;
5654     RExC_end = xend;
5655     RExC_naughty = 0;
5656     RExC_npar = 1;
5657     RExC_emit_start = ri->program;
5658     RExC_emit = ri->program;
5659     RExC_emit_bound = ri->program + RExC_size + 1;
5660     pRExC_state->code_index = 0;
5661
5662     /* Store the count of eval-groups for security checks: */
5663     RExC_rx->seen_evals = RExC_seen_evals;
5664     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5665     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5666         ReREFCNT_dec(rx);
5667         return(NULL);
5668     }
5669     /* XXXX To minimize changes to RE engine we always allocate
5670        3-units-long substrs field. */
5671     Newx(r->substrs, 1, struct reg_substr_data);
5672     if (RExC_recurse_count) {
5673         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5674         SAVEFREEPV(RExC_recurse);
5675     }
5676
5677 reStudy:
5678     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5679     Zero(r->substrs, 1, struct reg_substr_data);
5680
5681 #ifdef TRIE_STUDY_OPT
5682     if (!restudied) {
5683         StructCopy(&zero_scan_data, &data, scan_data_t);
5684         copyRExC_state = RExC_state;
5685     } else {
5686         U32 seen=RExC_seen;
5687         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5688
5689         RExC_state = copyRExC_state;
5690         if (seen & REG_TOP_LEVEL_BRANCHES)
5691             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5692         else
5693             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5694         if (data.last_found) {
5695             SvREFCNT_dec(data.longest_fixed);
5696             SvREFCNT_dec(data.longest_float);
5697             SvREFCNT_dec(data.last_found);
5698         }
5699         StructCopy(&zero_scan_data, &data, scan_data_t);
5700     }
5701 #else
5702     StructCopy(&zero_scan_data, &data, scan_data_t);
5703 #endif
5704
5705     /* Dig out information for optimizations. */
5706     r->extflags = RExC_flags; /* was pm_op */
5707     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5708
5709     if (UTF)
5710         SvUTF8_on(rx);  /* Unicode in it? */
5711     ri->regstclass = NULL;
5712     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5713         r->intflags |= PREGf_NAUGHTY;
5714     scan = ri->program + 1;             /* First BRANCH. */
5715
5716     /* testing for BRANCH here tells us whether there is "must appear"
5717        data in the pattern. If there is then we can use it for optimisations */
5718     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5719         I32 fake;
5720         STRLEN longest_float_length, longest_fixed_length;
5721         struct regnode_charclass_class ch_class; /* pointed to by data */
5722         int stclass_flag;
5723         I32 last_close = 0; /* pointed to by data */
5724         regnode *first= scan;
5725         regnode *first_next= regnext(first);
5726         /*
5727          * Skip introductions and multiplicators >= 1
5728          * so that we can extract the 'meat' of the pattern that must
5729          * match in the large if() sequence following.
5730          * NOTE that EXACT is NOT covered here, as it is normally
5731          * picked up by the optimiser separately.
5732          *
5733          * This is unfortunate as the optimiser isnt handling lookahead
5734          * properly currently.
5735          *
5736          */
5737         while ((OP(first) == OPEN && (sawopen = 1)) ||
5738                /* An OR of *one* alternative - should not happen now. */
5739             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5740             /* for now we can't handle lookbehind IFMATCH*/
5741             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5742             (OP(first) == PLUS) ||
5743             (OP(first) == MINMOD) ||
5744                /* An {n,m} with n>0 */
5745             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5746             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5747         {
5748                 /*
5749                  * the only op that could be a regnode is PLUS, all the rest
5750                  * will be regnode_1 or regnode_2.
5751                  *
5752                  */
5753                 if (OP(first) == PLUS)
5754                     sawplus = 1;
5755                 else
5756                     first += regarglen[OP(first)];
5757
5758                 first = NEXTOPER(first);
5759                 first_next= regnext(first);
5760         }
5761
5762         /* Starting-point info. */
5763       again:
5764         DEBUG_PEEP("first:",first,0);
5765         /* Ignore EXACT as we deal with it later. */
5766         if (PL_regkind[OP(first)] == EXACT) {
5767             if (OP(first) == EXACT)
5768                 NOOP;   /* Empty, get anchored substr later. */
5769             else
5770                 ri->regstclass = first;
5771         }
5772 #ifdef TRIE_STCLASS
5773         else if (PL_regkind[OP(first)] == TRIE &&
5774                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5775         {
5776             regnode *trie_op;
5777             /* this can happen only on restudy */
5778             if ( OP(first) == TRIE ) {
5779                 struct regnode_1 *trieop = (struct regnode_1 *)
5780                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5781                 StructCopy(first,trieop,struct regnode_1);
5782                 trie_op=(regnode *)trieop;
5783             } else {
5784                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5785                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5786                 StructCopy(first,trieop,struct regnode_charclass);
5787                 trie_op=(regnode *)trieop;
5788             }
5789             OP(trie_op)+=2;
5790             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5791             ri->regstclass = trie_op;
5792         }
5793 #endif
5794         else if (REGNODE_SIMPLE(OP(first)))
5795             ri->regstclass = first;
5796         else if (PL_regkind[OP(first)] == BOUND ||
5797                  PL_regkind[OP(first)] == NBOUND)
5798             ri->regstclass = first;
5799         else if (PL_regkind[OP(first)] == BOL) {
5800             r->extflags |= (OP(first) == MBOL
5801                            ? RXf_ANCH_MBOL
5802                            : (OP(first) == SBOL
5803                               ? RXf_ANCH_SBOL
5804                               : RXf_ANCH_BOL));
5805             first = NEXTOPER(first);
5806             goto again;
5807         }
5808         else if (OP(first) == GPOS) {
5809             r->extflags |= RXf_ANCH_GPOS;
5810             first = NEXTOPER(first);
5811             goto again;
5812         }
5813         else if ((!sawopen || !RExC_sawback) &&
5814             (OP(first) == STAR &&
5815             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5816             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5817         {
5818             /* turn .* into ^.* with an implied $*=1 */
5819             const int type =
5820                 (OP(NEXTOPER(first)) == REG_ANY)
5821                     ? RXf_ANCH_MBOL
5822                     : RXf_ANCH_SBOL;
5823             r->extflags |= type;
5824             r->intflags |= PREGf_IMPLICIT;
5825             first = NEXTOPER(first);
5826             goto again;
5827         }
5828         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5829             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5830             /* x+ must match at the 1st pos of run of x's */
5831             r->intflags |= PREGf_SKIP;
5832
5833         /* Scan is after the zeroth branch, first is atomic matcher. */
5834 #ifdef TRIE_STUDY_OPT
5835         DEBUG_PARSE_r(
5836             if (!restudied)
5837                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5838                               (IV)(first - scan + 1))
5839         );
5840 #else
5841         DEBUG_PARSE_r(
5842             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5843                 (IV)(first - scan + 1))
5844         );
5845 #endif
5846
5847
5848         /*
5849         * If there's something expensive in the r.e., find the
5850         * longest literal string that must appear and make it the
5851         * regmust.  Resolve ties in favor of later strings, since
5852         * the regstart check works with the beginning of the r.e.
5853         * and avoiding duplication strengthens checking.  Not a
5854         * strong reason, but sufficient in the absence of others.
5855         * [Now we resolve ties in favor of the earlier string if
5856         * it happens that c_offset_min has been invalidated, since the
5857         * earlier string may buy us something the later one won't.]
5858         */
5859
5860         data.longest_fixed = newSVpvs("");
5861         data.longest_float = newSVpvs("");
5862         data.last_found = newSVpvs("");
5863         data.longest = &(data.longest_fixed);
5864         first = scan;
5865         if (!ri->regstclass) {
5866             cl_init(pRExC_state, &ch_class);
5867             data.start_class = &ch_class;
5868             stclass_flag = SCF_DO_STCLASS_AND;
5869         } else                          /* XXXX Check for BOUND? */
5870             stclass_flag = 0;
5871         data.last_closep = &last_close;
5872
5873         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5874             &data, -1, NULL, NULL,
5875             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5876
5877
5878         CHECK_RESTUDY_GOTO;
5879
5880
5881         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5882              && data.last_start_min == 0 && data.last_end > 0
5883              && !RExC_seen_zerolen
5884              && !(RExC_seen & REG_SEEN_VERBARG)
5885              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5886             r->extflags |= RXf_CHECK_ALL;
5887         scan_commit(pRExC_state, &data,&minlen,0);
5888         SvREFCNT_dec(data.last_found);
5889
5890         /* Note that code very similar to this but for anchored string
5891            follows immediately below, changes may need to be made to both.
5892            Be careful.
5893          */
5894         longest_float_length = CHR_SVLEN(data.longest_float);
5895         if (longest_float_length
5896             || (data.flags & SF_FL_BEFORE_EOL
5897                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5898                     || (RExC_flags & RXf_PMf_MULTILINE))))
5899         {
5900             I32 t,ml;
5901
5902             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5903             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5904                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5905                     && data.offset_fixed == data.offset_float_min
5906                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5907                     goto remove_float;          /* As in (a)+. */
5908
5909             /* copy the information about the longest float from the reg_scan_data
5910                over to the program. */
5911             if (SvUTF8(data.longest_float)) {
5912                 r->float_utf8 = data.longest_float;
5913                 r->float_substr = NULL;
5914             } else {
5915                 r->float_substr = data.longest_float;
5916                 r->float_utf8 = NULL;
5917             }
5918             /* float_end_shift is how many chars that must be matched that
5919                follow this item. We calculate it ahead of time as once the
5920                lookbehind offset is added in we lose the ability to correctly
5921                calculate it.*/
5922             ml = data.minlen_float ? *(data.minlen_float)
5923                                    : (I32)longest_float_length;
5924             r->float_end_shift = ml - data.offset_float_min
5925                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5926                 + data.lookbehind_float;
5927             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5928             r->float_max_offset = data.offset_float_max;
5929             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5930                 r->float_max_offset -= data.lookbehind_float;
5931
5932             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5933                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5934                            || (RExC_flags & RXf_PMf_MULTILINE)));
5935             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5936         }
5937         else {
5938           remove_float:
5939             r->float_substr = r->float_utf8 = NULL;
5940             SvREFCNT_dec(data.longest_float);
5941             longest_float_length = 0;
5942         }
5943
5944         /* Note that code very similar to this but for floating string
5945            is immediately above, changes may need to be made to both.
5946            Be careful.
5947          */
5948         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5949
5950         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5951         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5952             && (longest_fixed_length
5953                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5954                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5955                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5956         {
5957             I32 t,ml;
5958
5959             /* copy the information about the longest fixed
5960                from the reg_scan_data over to the program. */
5961             if (SvUTF8(data.longest_fixed)) {
5962                 r->anchored_utf8 = data.longest_fixed;
5963                 r->anchored_substr = NULL;
5964             } else {
5965                 r->anchored_substr = data.longest_fixed;
5966                 r->anchored_utf8 = NULL;
5967             }
5968             /* fixed_end_shift is how many chars that must be matched that
5969                follow this item. We calculate it ahead of time as once the
5970                lookbehind offset is added in we lose the ability to correctly
5971                calculate it.*/
5972             ml = data.minlen_fixed ? *(data.minlen_fixed)
5973                                    : (I32)longest_fixed_length;
5974             r->anchored_end_shift = ml - data.offset_fixed
5975                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5976                 + data.lookbehind_fixed;
5977             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5978
5979             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5980                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5981                      || (RExC_flags & RXf_PMf_MULTILINE)));
5982             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5983         }
5984         else {
5985             r->anchored_substr = r->anchored_utf8 = NULL;
5986             SvREFCNT_dec(data.longest_fixed);
5987             longest_fixed_length = 0;
5988         }
5989         if (ri->regstclass
5990             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5991             ri->regstclass = NULL;
5992
5993         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5994             && stclass_flag
5995             && !(data.start_class->flags & ANYOF_EOS)
5996             && !cl_is_anything(data.start_class))
5997         {
5998             const U32 n = add_data(pRExC_state, 1, "f");
5999             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6000
6001             Newx(RExC_rxi->data->data[n], 1,
6002                 struct regnode_charclass_class);
6003             StructCopy(data.start_class,
6004                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6005                        struct regnode_charclass_class);
6006             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6007             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6008             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6009                       regprop(r, sv, (regnode*)data.start_class);
6010                       PerlIO_printf(Perl_debug_log,
6011                                     "synthetic stclass \"%s\".\n",
6012                                     SvPVX_const(sv));});
6013         }
6014
6015         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6016         if (longest_fixed_length > longest_float_length) {
6017             r->check_end_shift = r->anchored_end_shift;
6018             r->check_substr = r->anchored_substr;
6019             r->check_utf8 = r->anchored_utf8;
6020             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6021             if (r->extflags & RXf_ANCH_SINGLE)
6022                 r->extflags |= RXf_NOSCAN;
6023         }
6024         else {
6025             r->check_end_shift = r->float_end_shift;
6026             r->check_substr = r->float_substr;
6027             r->check_utf8 = r->float_utf8;
6028             r->check_offset_min = r->float_min_offset;
6029             r->check_offset_max = r->float_max_offset;
6030         }
6031         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6032            This should be changed ASAP!  */
6033         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6034             r->extflags |= RXf_USE_INTUIT;
6035             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6036                 r->extflags |= RXf_INTUIT_TAIL;
6037         }
6038         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6039         if ( (STRLEN)minlen < longest_float_length )
6040             minlen= longest_float_length;
6041         if ( (STRLEN)minlen < longest_fixed_length )
6042             minlen= longest_fixed_length;
6043         */
6044     }
6045     else {
6046         /* Several toplevels. Best we can is to set minlen. */
6047         I32 fake;
6048         struct regnode_charclass_class ch_class;
6049         I32 last_close = 0;
6050
6051         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6052
6053         scan = ri->program + 1;
6054         cl_init(pRExC_state, &ch_class);
6055         data.start_class = &ch_class;
6056         data.last_closep = &last_close;
6057
6058
6059         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6060             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6061
6062         CHECK_RESTUDY_GOTO;
6063
6064         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6065                 = r->float_substr = r->float_utf8 = NULL;
6066
6067         if (!(data.start_class->flags & ANYOF_EOS)
6068             && !cl_is_anything(data.start_class))
6069         {
6070             const U32 n = add_data(pRExC_state, 1, "f");
6071             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6072
6073             Newx(RExC_rxi->data->data[n], 1,
6074                 struct regnode_charclass_class);
6075             StructCopy(data.start_class,
6076                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6077                        struct regnode_charclass_class);
6078             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6079             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6080             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6081                       regprop(r, sv, (regnode*)data.start_class);
6082                       PerlIO_printf(Perl_debug_log,
6083                                     "synthetic stclass \"%s\".\n",
6084                                     SvPVX_const(sv));});
6085         }
6086     }
6087
6088     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6089        the "real" pattern. */
6090     DEBUG_OPTIMISE_r({
6091         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6092                       (IV)minlen, (IV)r->minlen);
6093     });
6094     r->minlenret = minlen;
6095     if (r->minlen < minlen)
6096         r->minlen = minlen;
6097
6098     if (RExC_seen & REG_SEEN_GPOS)
6099         r->extflags |= RXf_GPOS_SEEN;
6100     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6101         r->extflags |= RXf_LOOKBEHIND_SEEN;
6102     if (RExC_seen & REG_SEEN_EVAL)
6103         r->extflags |= RXf_EVAL_SEEN;
6104     if (RExC_seen & REG_SEEN_CANY)
6105         r->extflags |= RXf_CANY_SEEN;
6106     if (RExC_seen & REG_SEEN_VERBARG)
6107         r->intflags |= PREGf_VERBARG_SEEN;
6108     if (RExC_seen & REG_SEEN_CUTGROUP)
6109         r->intflags |= PREGf_CUTGROUP_SEEN;
6110     if (RExC_paren_names)
6111         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6112     else
6113         RXp_PAREN_NAMES(r) = NULL;
6114
6115 #ifdef STUPID_PATTERN_CHECKS
6116     if (RX_PRELEN(rx) == 0)
6117         r->extflags |= RXf_NULL;
6118     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6119         /* XXX: this should happen BEFORE we compile */
6120         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6121     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6122         r->extflags |= RXf_WHITE;
6123     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6124         r->extflags |= RXf_START_ONLY;
6125 #else
6126     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6127             /* XXX: this should happen BEFORE we compile */
6128             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6129     else {
6130         regnode *first = ri->program + 1;
6131         U8 fop = OP(first);
6132
6133         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6134             r->extflags |= RXf_NULL;
6135         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6136             r->extflags |= RXf_START_ONLY;
6137         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6138                              && OP(regnext(first)) == END)
6139             r->extflags |= RXf_WHITE;
6140     }
6141 #endif
6142 #ifdef DEBUGGING
6143     if (RExC_paren_names) {
6144         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6145         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6146     } else
6147 #endif
6148         ri->name_list_idx = 0;
6149
6150     if (RExC_recurse_count) {
6151         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6152             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6153             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6154         }
6155     }
6156     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6157     /* assume we don't need to swap parens around before we match */
6158
6159     DEBUG_DUMP_r({
6160         PerlIO_printf(Perl_debug_log,"Final program:\n");
6161         regdump(r);
6162     });
6163 #ifdef RE_TRACK_PATTERN_OFFSETS
6164     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6165         const U32 len = ri->u.offsets[0];
6166         U32 i;
6167         GET_RE_DEBUG_FLAGS_DECL;
6168         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6169         for (i = 1; i <= len; i++) {
6170             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6171                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6172                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6173             }
6174         PerlIO_printf(Perl_debug_log, "\n");
6175     });
6176 #endif
6177     return rx;
6178 }
6179
6180 #undef RE_ENGINE_PTR
6181
6182
6183 SV*
6184 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6185                     const U32 flags)
6186 {
6187     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6188
6189     PERL_UNUSED_ARG(value);
6190
6191     if (flags & RXapif_FETCH) {
6192         return reg_named_buff_fetch(rx, key, flags);
6193     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6194         Perl_croak_no_modify(aTHX);
6195         return NULL;
6196     } else if (flags & RXapif_EXISTS) {
6197         return reg_named_buff_exists(rx, key, flags)
6198             ? &PL_sv_yes
6199             : &PL_sv_no;
6200     } else if (flags & RXapif_REGNAMES) {
6201         return reg_named_buff_all(rx, flags);
6202     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6203         return reg_named_buff_scalar(rx, flags);
6204     } else {
6205         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6206         return NULL;
6207     }
6208 }
6209
6210 SV*
6211 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6212                          const U32 flags)
6213 {
6214     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6215     PERL_UNUSED_ARG(lastkey);
6216
6217     if (flags & RXapif_FIRSTKEY)
6218         return reg_named_buff_firstkey(rx, flags);
6219     else if (flags & RXapif_NEXTKEY)
6220         return reg_named_buff_nextkey(rx, flags);
6221     else {
6222         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6223         return NULL;
6224     }
6225 }
6226
6227 SV*
6228 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6229                           const U32 flags)
6230 {
6231     AV *retarray = NULL;
6232     SV *ret;
6233     struct regexp *const rx = (struct regexp *)SvANY(r);
6234
6235     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6236
6237     if (flags & RXapif_ALL)
6238         retarray=newAV();
6239
6240     if (rx && RXp_PAREN_NAMES(rx)) {
6241         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6242         if (he_str) {
6243             IV i;
6244             SV* sv_dat=HeVAL(he_str);
6245             I32 *nums=(I32*)SvPVX(sv_dat);
6246             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6247                 if ((I32)(rx->nparens) >= nums[i]
6248                     && rx->offs[nums[i]].start != -1
6249                     && rx->offs[nums[i]].end != -1)
6250                 {
6251                     ret = newSVpvs("");
6252                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6253                     if (!retarray)
6254                         return ret;
6255                 } else {
6256                     if (retarray)
6257                         ret = newSVsv(&PL_sv_undef);
6258                 }
6259                 if (retarray)
6260                     av_push(retarray, ret);
6261             }
6262             if (retarray)
6263                 return newRV_noinc(MUTABLE_SV(retarray));
6264         }
6265     }
6266     return NULL;
6267 }
6268
6269 bool
6270 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6271                            const U32 flags)
6272 {
6273     struct regexp *const rx = (struct regexp *)SvANY(r);
6274
6275     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6276
6277     if (rx && RXp_PAREN_NAMES(rx)) {
6278         if (flags & RXapif_ALL) {
6279             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6280         } else {
6281             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6282             if (sv) {
6283                 SvREFCNT_dec(sv);
6284                 return TRUE;
6285             } else {
6286                 return FALSE;
6287             }
6288         }
6289     } else {
6290         return FALSE;
6291     }
6292 }
6293
6294 SV*
6295 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6296 {
6297     struct regexp *const rx = (struct regexp *)SvANY(r);
6298
6299     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6300
6301     if ( rx && RXp_PAREN_NAMES(rx) ) {
6302         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6303
6304         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6305     } else {
6306         return FALSE;
6307     }
6308 }
6309
6310 SV*
6311 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6312 {
6313     struct regexp *const rx = (struct regexp *)SvANY(r);
6314     GET_RE_DEBUG_FLAGS_DECL;
6315
6316     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6317
6318     if (rx && RXp_PAREN_NAMES(rx)) {
6319         HV *hv = RXp_PAREN_NAMES(rx);
6320         HE *temphe;
6321         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6322             IV i;
6323             IV parno = 0;
6324             SV* sv_dat = HeVAL(temphe);
6325             I32 *nums = (I32*)SvPVX(sv_dat);
6326             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6327                 if ((I32)(rx->lastparen) >= nums[i] &&
6328                     rx->offs[nums[i]].start != -1 &&
6329                     rx->offs[nums[i]].end != -1)
6330                 {
6331                     parno = nums[i];
6332                     break;
6333                 }
6334             }
6335             if (parno || flags & RXapif_ALL) {
6336                 return newSVhek(HeKEY_hek(temphe));
6337             }
6338         }
6339     }
6340     return NULL;
6341 }
6342
6343 SV*
6344 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6345 {
6346     SV *ret;
6347     AV *av;
6348     I32 length;
6349     struct regexp *const rx = (struct regexp *)SvANY(r);
6350
6351     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6352
6353     if (rx && RXp_PAREN_NAMES(rx)) {
6354         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6355             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6356         } else if (flags & RXapif_ONE) {
6357             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6358             av = MUTABLE_AV(SvRV(ret));
6359             length = av_len(av);
6360             SvREFCNT_dec(ret);
6361             return newSViv(length + 1);
6362         } else {
6363             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6364             return NULL;
6365         }
6366     }
6367     return &PL_sv_undef;
6368 }
6369
6370 SV*
6371 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6372 {
6373     struct regexp *const rx = (struct regexp *)SvANY(r);
6374     AV *av = newAV();
6375
6376     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6377
6378     if (rx && RXp_PAREN_NAMES(rx)) {
6379         HV *hv= RXp_PAREN_NAMES(rx);
6380         HE *temphe;
6381         (void)hv_iterinit(hv);
6382         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6383             IV i;
6384             IV parno = 0;
6385             SV* sv_dat = HeVAL(temphe);
6386             I32 *nums = (I32*)SvPVX(sv_dat);
6387             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6388                 if ((I32)(rx->lastparen) >= nums[i] &&
6389                     rx->offs[nums[i]].start != -1 &&
6390                     rx->offs[nums[i]].end != -1)
6391                 {
6392                     parno = nums[i];
6393                     break;
6394                 }
6395             }
6396             if (parno || flags & RXapif_ALL) {
6397                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6398             }
6399         }
6400     }
6401
6402     return newRV_noinc(MUTABLE_SV(av));
6403 }
6404
6405 void
6406 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6407                              SV * const sv)
6408 {
6409     struct regexp *const rx = (struct regexp *)SvANY(r);
6410     char *s = NULL;
6411     I32 i = 0;
6412     I32 s1, t1;
6413
6414     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6415
6416     if (!rx->subbeg) {
6417         sv_setsv(sv,&PL_sv_undef);
6418         return;
6419     }
6420     else
6421     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6422         /* $` */
6423         i = rx->offs[0].start;
6424         s = rx->subbeg;
6425     }
6426     else
6427     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6428         /* $' */
6429         s = rx->subbeg + rx->offs[0].end;
6430         i = rx->sublen - rx->offs[0].end;
6431     }
6432     else
6433     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6434         (s1 = rx->offs[paren].start) != -1 &&
6435         (t1 = rx->offs[paren].end) != -1)
6436     {
6437         /* $& $1 ... */
6438         i = t1 - s1;
6439         s = rx->subbeg + s1;
6440     } else {
6441         sv_setsv(sv,&PL_sv_undef);
6442         return;
6443     }
6444     assert(rx->sublen >= (s - rx->subbeg) + i );
6445     if (i >= 0) {
6446         const int oldtainted = PL_tainted;
6447         TAINT_NOT;
6448         sv_setpvn(sv, s, i);
6449         PL_tainted = oldtainted;
6450         if ( (rx->extflags & RXf_CANY_SEEN)
6451             ? (RXp_MATCH_UTF8(rx)
6452                         && (!i || is_utf8_string((U8*)s, i)))
6453             : (RXp_MATCH_UTF8(rx)) )
6454         {
6455             SvUTF8_on(sv);
6456         }
6457         else
6458             SvUTF8_off(sv);
6459         if (PL_tainting) {
6460             if (RXp_MATCH_TAINTED(rx)) {
6461                 if (SvTYPE(sv) >= SVt_PVMG) {
6462                     MAGIC* const mg = SvMAGIC(sv);
6463                     MAGIC* mgt;
6464                     PL_tainted = 1;
6465                     SvMAGIC_set(sv, mg->mg_moremagic);
6466                     SvTAINT(sv);
6467                     if ((mgt = SvMAGIC(sv))) {
6468                         mg->mg_moremagic = mgt;
6469                         SvMAGIC_set(sv, mg);
6470                     }
6471                 } else {
6472                     PL_tainted = 1;
6473                     SvTAINT(sv);
6474                 }
6475             } else
6476                 SvTAINTED_off(sv);
6477         }
6478     } else {
6479         sv_setsv(sv,&PL_sv_undef);
6480         return;
6481     }
6482 }
6483
6484 void
6485 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6486                                                          SV const * const value)
6487 {
6488     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6489
6490     PERL_UNUSED_ARG(rx);
6491     PERL_UNUSED_ARG(paren);
6492     PERL_UNUSED_ARG(value);
6493
6494     if (!PL_localizing)
6495         Perl_croak_no_modify(aTHX);
6496 }
6497
6498 I32
6499 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6500                               const I32 paren)
6501 {
6502     struct regexp *const rx = (struct regexp *)SvANY(r);
6503     I32 i;
6504     I32 s1, t1;
6505
6506     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6507
6508     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6509         switch (paren) {
6510       /* $` / ${^PREMATCH} */
6511       case RX_BUFF_IDX_PREMATCH:
6512         if (rx->offs[0].start != -1) {
6513                         i = rx->offs[0].start;
6514                         if (i > 0) {
6515                                 s1 = 0;
6516                                 t1 = i;
6517                                 goto getlen;
6518                         }
6519             }
6520         return 0;
6521       /* $' / ${^POSTMATCH} */
6522       case RX_BUFF_IDX_POSTMATCH:
6523             if (rx->offs[0].end != -1) {
6524                         i = rx->sublen - rx->offs[0].end;
6525                         if (i > 0) {
6526                                 s1 = rx->offs[0].end;
6527                                 t1 = rx->sublen;
6528                                 goto getlen;
6529                         }
6530             }
6531         return 0;
6532       /* $& / ${^MATCH}, $1, $2, ... */
6533       default:
6534             if (paren <= (I32)rx->nparens &&
6535             (s1 = rx->offs[paren].start) != -1 &&
6536             (t1 = rx->offs[paren].end) != -1)
6537             {
6538             i = t1 - s1;
6539             goto getlen;
6540         } else {
6541             if (ckWARN(WARN_UNINITIALIZED))
6542                 report_uninit((const SV *)sv);
6543             return 0;
6544         }
6545     }
6546   getlen:
6547     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6548         const char * const s = rx->subbeg + s1;
6549         const U8 *ep;
6550         STRLEN el;
6551
6552         i = t1 - s1;
6553         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6554                         i = el;
6555     }
6556     return i;
6557 }
6558
6559 SV*
6560 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6561 {
6562     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6563         PERL_UNUSED_ARG(rx);
6564         if (0)
6565             return NULL;
6566         else
6567             return newSVpvs("Regexp");
6568 }
6569
6570 /* Scans the name of a named buffer from the pattern.
6571  * If flags is REG_RSN_RETURN_NULL returns null.
6572  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6573  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6574  * to the parsed name as looked up in the RExC_paren_names hash.
6575  * If there is an error throws a vFAIL().. type exception.
6576  */
6577
6578 #define REG_RSN_RETURN_NULL    0
6579 #define REG_RSN_RETURN_NAME    1
6580 #define REG_RSN_RETURN_DATA    2
6581
6582 STATIC SV*
6583 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6584 {
6585     char *name_start = RExC_parse;
6586
6587     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6588
6589     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6590          /* skip IDFIRST by using do...while */
6591         if (UTF)
6592             do {
6593                 RExC_parse += UTF8SKIP(RExC_parse);
6594             } while (isALNUM_utf8((U8*)RExC_parse));
6595         else
6596             do {
6597                 RExC_parse++;
6598             } while (isALNUM(*RExC_parse));
6599     }
6600
6601     if ( flags ) {
6602         SV* sv_name
6603             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6604                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6605         if ( flags == REG_RSN_RETURN_NAME)
6606             return sv_name;
6607         else if (flags==REG_RSN_RETURN_DATA) {
6608             HE *he_str = NULL;
6609             SV *sv_dat = NULL;
6610             if ( ! sv_name )      /* should not happen*/
6611                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6612             if (RExC_paren_names)
6613                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6614             if ( he_str )
6615                 sv_dat = HeVAL(he_str);
6616             if ( ! sv_dat )
6617                 vFAIL("Reference to nonexistent named group");
6618             return sv_dat;
6619         }
6620         else {
6621             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6622                        (unsigned long) flags);
6623         }
6624         /* NOT REACHED */
6625     }
6626     return NULL;
6627 }
6628
6629 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6630     int rem=(int)(RExC_end - RExC_parse);                       \
6631     int cut;                                                    \
6632     int num;                                                    \
6633     int iscut=0;                                                \
6634     if (rem>10) {                                               \
6635         rem=10;                                                 \
6636         iscut=1;                                                \
6637     }                                                           \
6638     cut=10-rem;                                                 \
6639     if (RExC_lastparse!=RExC_parse)                             \
6640         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6641             rem, RExC_parse,                                    \
6642             cut + 4,                                            \
6643             iscut ? "..." : "<"                                 \
6644         );                                                      \
6645     else                                                        \
6646         PerlIO_printf(Perl_debug_log,"%16s","");                \
6647                                                                 \
6648     if (SIZE_ONLY)                                              \
6649        num = RExC_size + 1;                                     \
6650     else                                                        \
6651        num=REG_NODE_NUM(RExC_emit);                             \
6652     if (RExC_lastnum!=num)                                      \
6653        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6654     else                                                        \
6655        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6656     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6657         (int)((depth*2)), "",                                   \
6658         (funcname)                                              \
6659     );                                                          \
6660     RExC_lastnum=num;                                           \
6661     RExC_lastparse=RExC_parse;                                  \
6662 })
6663
6664
6665
6666 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6667     DEBUG_PARSE_MSG((funcname));                            \
6668     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6669 })
6670 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6671     DEBUG_PARSE_MSG((funcname));                            \
6672     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6673 })
6674
6675 /* This section of code defines the inversion list object and its methods.  The
6676  * interfaces are highly subject to change, so as much as possible is static to
6677  * this file.  An inversion list is here implemented as a malloc'd C UV array
6678  * with some added info that is placed as UVs at the beginning in a header
6679  * portion.  An inversion list for Unicode is an array of code points, sorted
6680  * by ordinal number.  The zeroth element is the first code point in the list.
6681  * The 1th element is the first element beyond that not in the list.  In other
6682  * words, the first range is
6683  *  invlist[0]..(invlist[1]-1)
6684  * The other ranges follow.  Thus every element whose index is divisible by two
6685  * marks the beginning of a range that is in the list, and every element not
6686  * divisible by two marks the beginning of a range not in the list.  A single
6687  * element inversion list that contains the single code point N generally
6688  * consists of two elements
6689  *  invlist[0] == N
6690  *  invlist[1] == N+1
6691  * (The exception is when N is the highest representable value on the
6692  * machine, in which case the list containing just it would be a single
6693  * element, itself.  By extension, if the last range in the list extends to
6694  * infinity, then the first element of that range will be in the inversion list
6695  * at a position that is divisible by two, and is the final element in the
6696  * list.)
6697  * Taking the complement (inverting) an inversion list is quite simple, if the
6698  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6699  * This implementation reserves an element at the beginning of each inversion list
6700  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6701  * beginning of the list is either that element if 0, or the next one if 1.
6702  *
6703  * More about inversion lists can be found in "Unicode Demystified"
6704  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6705  * More will be coming when functionality is added later.
6706  *
6707  * The inversion list data structure is currently implemented as an SV pointing
6708  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6709  * array of UV whose memory management is automatically handled by the existing
6710  * facilities for SV's.
6711  *
6712  * Some of the methods should always be private to the implementation, and some
6713  * should eventually be made public */
6714
6715 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6716 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6717
6718 /* This is a combination of a version and data structure type, so that one
6719  * being passed in can be validated to be an inversion list of the correct
6720  * vintage.  When the structure of the header is changed, a new random number
6721  * in the range 2**31-1 should be generated and the new() method changed to
6722  * insert that at this location.  Then, if an auxiliary program doesn't change
6723  * correspondingly, it will be discovered immediately */
6724 #define INVLIST_VERSION_ID_OFFSET 2
6725 #define INVLIST_VERSION_ID 1064334010
6726
6727 /* For safety, when adding new elements, remember to #undef them at the end of
6728  * the inversion list code section */
6729
6730 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
6731 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6732  * contains the code point U+00000, and begins here.  If 1, the inversion list
6733  * doesn't contain U+0000, and it begins at the next UV in the array.
6734  * Inverting an inversion list consists of adding or removing the 0 at the
6735  * beginning of it.  By reserving a space for that 0, inversion can be made
6736  * very fast */
6737
6738 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6739
6740 /* Internally things are UVs */
6741 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6742 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6743
6744 #define INVLIST_INITIAL_LEN 10
6745
6746 PERL_STATIC_INLINE UV*
6747 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6748 {
6749     /* Returns a pointer to the first element in the inversion list's array.
6750      * This is called upon initialization of an inversion list.  Where the
6751      * array begins depends on whether the list has the code point U+0000
6752      * in it or not.  The other parameter tells it whether the code that
6753      * follows this call is about to put a 0 in the inversion list or not.
6754      * The first element is either the element with 0, if 0, or the next one,
6755      * if 1 */
6756
6757     UV* zero = get_invlist_zero_addr(invlist);
6758
6759     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6760
6761     /* Must be empty */
6762     assert(! *get_invlist_len_addr(invlist));
6763
6764     /* 1^1 = 0; 1^0 = 1 */
6765     *zero = 1 ^ will_have_0;
6766     return zero + *zero;
6767 }
6768
6769 PERL_STATIC_INLINE UV*
6770 S_invlist_array(pTHX_ SV* const invlist)
6771 {
6772     /* Returns the pointer to the inversion list's array.  Every time the
6773      * length changes, this needs to be called in case malloc or realloc moved
6774      * it */
6775
6776     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6777
6778     /* Must not be empty.  If these fail, you probably didn't check for <len>
6779      * being non-zero before trying to get the array */
6780     assert(*get_invlist_len_addr(invlist));
6781     assert(*get_invlist_zero_addr(invlist) == 0
6782            || *get_invlist_zero_addr(invlist) == 1);
6783
6784     /* The array begins either at the element reserved for zero if the
6785      * list contains 0 (that element will be set to 0), or otherwise the next
6786      * element (in which case the reserved element will be set to 1). */
6787     return (UV *) (get_invlist_zero_addr(invlist)
6788                    + *get_invlist_zero_addr(invlist));
6789 }
6790
6791 PERL_STATIC_INLINE UV*
6792 S_get_invlist_len_addr(pTHX_ SV* invlist)
6793 {
6794     /* Return the address of the UV that contains the current number
6795      * of used elements in the inversion list */
6796
6797     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6798
6799     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6800 }
6801
6802 PERL_STATIC_INLINE UV
6803 S_invlist_len(pTHX_ SV* const invlist)
6804 {
6805     /* Returns the current number of elements stored in the inversion list's
6806      * array */
6807
6808     PERL_ARGS_ASSERT_INVLIST_LEN;
6809
6810     return *get_invlist_len_addr(invlist);
6811 }
6812
6813 PERL_STATIC_INLINE void
6814 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6815 {
6816     /* Sets the current number of elements stored in the inversion list */
6817
6818     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6819
6820     *get_invlist_len_addr(invlist) = len;
6821
6822     assert(len <= SvLEN(invlist));
6823
6824     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6825     /* If the list contains U+0000, that element is part of the header,
6826      * and should not be counted as part of the array.  It will contain
6827      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6828      * subtract:
6829      *  SvCUR_set(invlist,
6830      *            TO_INTERNAL_SIZE(len
6831      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6832      * But, this is only valid if len is not 0.  The consequences of not doing
6833      * this is that the memory allocation code may think that 1 more UV is
6834      * being used than actually is, and so might do an unnecessary grow.  That
6835      * seems worth not bothering to make this the precise amount.
6836      *
6837      * Note that when inverting, SvCUR shouldn't change */
6838 }
6839
6840 PERL_STATIC_INLINE UV
6841 S_invlist_max(pTHX_ SV* const invlist)
6842 {
6843     /* Returns the maximum number of elements storable in the inversion list's
6844      * array, without having to realloc() */
6845
6846     PERL_ARGS_ASSERT_INVLIST_MAX;
6847
6848     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6849 }
6850
6851 PERL_STATIC_INLINE UV*
6852 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6853 {
6854     /* Return the address of the UV that is reserved to hold 0 if the inversion
6855      * list contains 0.  This has to be the last element of the heading, as the
6856      * list proper starts with either it if 0, or the next element if not.
6857      * (But we force it to contain either 0 or 1) */
6858
6859     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6860
6861     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6862 }
6863
6864 #ifndef PERL_IN_XSUB_RE
6865 SV*
6866 Perl__new_invlist(pTHX_ IV initial_size)
6867 {
6868
6869     /* Return a pointer to a newly constructed inversion list, with enough
6870      * space to store 'initial_size' elements.  If that number is negative, a
6871      * system default is used instead */
6872
6873     SV* new_list;
6874
6875     if (initial_size < 0) {
6876         initial_size = INVLIST_INITIAL_LEN;
6877     }
6878
6879     /* Allocate the initial space */
6880     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6881     invlist_set_len(new_list, 0);
6882
6883     /* Force iterinit() to be used to get iteration to work */
6884     *get_invlist_iter_addr(new_list) = UV_MAX;
6885
6886     /* This should force a segfault if a method doesn't initialize this
6887      * properly */
6888     *get_invlist_zero_addr(new_list) = UV_MAX;
6889
6890     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6891 #if HEADER_LENGTH != 4
6892 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6893 #endif
6894
6895     return new_list;
6896 }
6897 #endif
6898
6899 STATIC SV*
6900 S__new_invlist_C_array(pTHX_ UV* list)
6901 {
6902     /* Return a pointer to a newly constructed inversion list, initialized to
6903      * point to <list>, which has to be in the exact correct inversion list
6904      * form, including internal fields.  Thus this is a dangerous routine that
6905      * should not be used in the wrong hands */
6906
6907     SV* invlist = newSV_type(SVt_PV);
6908
6909     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6910
6911     SvPV_set(invlist, (char *) list);
6912     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6913                                shouldn't touch it */
6914     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6915
6916     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6917         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6918     }
6919
6920     return invlist;
6921 }
6922
6923 STATIC void
6924 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6925 {
6926     /* Grow the maximum size of an inversion list */
6927
6928     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6929
6930     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6931 }
6932
6933 PERL_STATIC_INLINE void
6934 S_invlist_trim(pTHX_ SV* const invlist)
6935 {
6936     PERL_ARGS_ASSERT_INVLIST_TRIM;
6937
6938     /* Change the length of the inversion list to how many entries it currently
6939      * has */
6940
6941     SvPV_shrink_to_cur((SV *) invlist);
6942 }
6943
6944 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6945  * etc */
6946 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6947 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6948
6949 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6950
6951 STATIC void
6952 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6953 {
6954    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6955     * the end of the inversion list.  The range must be above any existing
6956     * ones. */
6957
6958     UV* array;
6959     UV max = invlist_max(invlist);
6960     UV len = invlist_len(invlist);
6961
6962     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6963
6964     if (len == 0) { /* Empty lists must be initialized */
6965         array = _invlist_array_init(invlist, start == 0);
6966     }
6967     else {
6968         /* Here, the existing list is non-empty. The current max entry in the
6969          * list is generally the first value not in the set, except when the
6970          * set extends to the end of permissible values, in which case it is
6971          * the first entry in that final set, and so this call is an attempt to
6972          * append out-of-order */
6973
6974         UV final_element = len - 1;
6975         array = invlist_array(invlist);
6976         if (array[final_element] > start
6977             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6978         {
6979             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6980                        array[final_element], start,
6981                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6982         }
6983
6984         /* Here, it is a legal append.  If the new range begins with the first
6985          * value not in the set, it is extending the set, so the new first
6986          * value not in the set is one greater than the newly extended range.
6987          * */
6988         if (array[final_element] == start) {
6989             if (end != UV_MAX) {
6990                 array[final_element] = end + 1;
6991             }
6992             else {
6993                 /* But if the end is the maximum representable on the machine,
6994                  * just let the range that this would extend to have no end */
6995                 invlist_set_len(invlist, len - 1);
6996             }
6997             return;
6998         }
6999     }
7000
7001     /* Here the new range doesn't extend any existing set.  Add it */
7002
7003     len += 2;   /* Includes an element each for the start and end of range */
7004
7005     /* If overflows the existing space, extend, which may cause the array to be
7006      * moved */
7007     if (max < len) {
7008         invlist_extend(invlist, len);
7009         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7010                                            failure in invlist_array() */
7011         array = invlist_array(invlist);
7012     }
7013     else {
7014         invlist_set_len(invlist, len);
7015     }
7016
7017     /* The next item on the list starts the range, the one after that is
7018      * one past the new range.  */
7019     array[len - 2] = start;
7020     if (end != UV_MAX) {
7021         array[len - 1] = end + 1;
7022     }
7023     else {
7024         /* But if the end is the maximum representable on the machine, just let
7025          * the range have no end */
7026         invlist_set_len(invlist, len - 1);
7027     }
7028 }
7029
7030 #ifndef PERL_IN_XSUB_RE
7031
7032 STATIC IV
7033 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7034 {
7035     /* Searches the inversion list for the entry that contains the input code
7036      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7037      * return value is the index into the list's array of the range that
7038      * contains <cp> */
7039
7040     IV low = 0;
7041     IV high = invlist_len(invlist);
7042     const UV * const array = invlist_array(invlist);
7043
7044     PERL_ARGS_ASSERT_INVLIST_SEARCH;
7045
7046     /* If list is empty or the code point is before the first element, return
7047      * failure. */
7048     if (high == 0 || cp < array[0]) {
7049         return -1;
7050     }
7051
7052     /* Binary search.  What we are looking for is <i> such that
7053      *  array[i] <= cp < array[i+1]
7054      * The loop below converges on the i+1. */
7055     while (low < high) {
7056         IV mid = (low + high) / 2;
7057         if (array[mid] <= cp) {
7058             low = mid + 1;
7059
7060             /* We could do this extra test to exit the loop early.
7061             if (cp < array[low]) {
7062                 return mid;
7063             }
7064             */
7065         }
7066         else { /* cp < array[mid] */
7067             high = mid;
7068         }
7069     }
7070
7071     return high - 1;
7072 }
7073
7074 void
7075 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7076 {
7077     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7078      * but is used when the swash has an inversion list.  This makes this much
7079      * faster, as it uses a binary search instead of a linear one.  This is
7080      * intimately tied to that function, and perhaps should be in utf8.c,
7081      * except it is intimately tied to inversion lists as well.  It assumes
7082      * that <swatch> is all 0's on input */
7083
7084     UV current = start;
7085     const IV len = invlist_len(invlist);
7086     IV i;
7087     const UV * array;
7088
7089     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7090
7091     if (len == 0) { /* Empty inversion list */
7092         return;
7093     }
7094
7095     array = invlist_array(invlist);
7096
7097     /* Find which element it is */
7098     i = invlist_search(invlist, start);
7099
7100     /* We populate from <start> to <end> */
7101     while (current < end) {
7102         UV upper;
7103
7104         /* The inversion list gives the results for every possible code point
7105          * after the first one in the list.  Only those ranges whose index is
7106          * even are ones that the inversion list matches.  For the odd ones,
7107          * and if the initial code point is not in the list, we have to skip
7108          * forward to the next element */
7109         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7110             i++;
7111             if (i >= len) { /* Finished if beyond the end of the array */
7112                 return;
7113             }
7114             current = array[i];
7115             if (current >= end) {   /* Finished if beyond the end of what we
7116                                        are populating */
7117                 return;
7118             }
7119         }
7120         assert(current >= start);
7121
7122         /* The current range ends one below the next one, except don't go past
7123          * <end> */
7124         i++;
7125         upper = (i < len && array[i] < end) ? array[i] : end;
7126
7127         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7128          * for each code point in it */
7129         for (; current < upper; current++) {
7130             const STRLEN offset = (STRLEN)(current - start);
7131             swatch[offset >> 3] |= 1 << (offset & 7);
7132         }
7133
7134         /* Quit if at the end of the list */
7135         if (i >= len) {
7136
7137             /* But first, have to deal with the highest possible code point on
7138              * the platform.  The previous code assumes that <end> is one
7139              * beyond where we want to populate, but that is impossible at the
7140              * platform's infinity, so have to handle it specially */
7141             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7142             {
7143                 const STRLEN offset = (STRLEN)(end - start);
7144                 swatch[offset >> 3] |= 1 << (offset & 7);
7145             }
7146             return;
7147         }
7148
7149         /* Advance to the next range, which will be for code points not in the
7150          * inversion list */
7151         current = array[i];
7152     }
7153
7154     return;
7155 }
7156
7157
7158 void
7159 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7160 {
7161     /* Take the union of two inversion lists and point <output> to it.  *output
7162      * should be defined upon input, and if it points to one of the two lists,
7163      * the reference count to that list will be decremented.  The first list,
7164      * <a>, may be NULL, in which case a copy of the second list is returned.
7165      * If <complement_b> is TRUE, the union is taken of the complement
7166      * (inversion) of <b> instead of b itself.
7167      *
7168      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7169      * Richard Gillam, published by Addison-Wesley, and explained at some
7170      * length there.  The preface says to incorporate its examples into your
7171      * code at your own risk.
7172      *
7173      * The algorithm is like a merge sort.
7174      *
7175      * XXX A potential performance improvement is to keep track as we go along
7176      * if only one of the inputs contributes to the result, meaning the other
7177      * is a subset of that one.  In that case, we can skip the final copy and
7178      * return the larger of the input lists, but then outside code might need
7179      * to keep track of whether to free the input list or not */
7180
7181     UV* array_a;    /* a's array */
7182     UV* array_b;
7183     UV len_a;       /* length of a's array */
7184     UV len_b;
7185
7186     SV* u;                      /* the resulting union */
7187     UV* array_u;
7188     UV len_u;
7189
7190     UV i_a = 0;             /* current index into a's array */
7191     UV i_b = 0;
7192     UV i_u = 0;
7193
7194     /* running count, as explained in the algorithm source book; items are
7195      * stopped accumulating and are output when the count changes to/from 0.
7196      * The count is incremented when we start a range that's in the set, and
7197      * decremented when we start a range that's not in the set.  So its range
7198      * is 0 to 2.  Only when the count is zero is something not in the set.
7199      */
7200     UV count = 0;
7201
7202     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7203     assert(a != b);
7204
7205     /* If either one is empty, the union is the other one */
7206     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7207         if (*output == a) {
7208             if (a != NULL) {
7209                 SvREFCNT_dec(a);
7210             }
7211         }
7212         if (*output != b) {
7213             *output = invlist_clone(b);
7214             if (complement_b) {
7215                 _invlist_invert(*output);
7216             }
7217         } /* else *output already = b; */
7218         return;
7219     }
7220     else if ((len_b = invlist_len(b)) == 0) {
7221         if (*output == b) {
7222             SvREFCNT_dec(b);
7223         }
7224
7225         /* The complement of an empty list is a list that has everything in it,
7226          * so the union with <a> includes everything too */
7227         if (complement_b) {
7228             if (a == *output) {
7229                 SvREFCNT_dec(a);
7230             }
7231             *output = _new_invlist(1);
7232             _append_range_to_invlist(*output, 0, UV_MAX);
7233         }
7234         else if (*output != a) {
7235             *output = invlist_clone(a);
7236         }
7237         /* else *output already = a; */
7238         return;
7239     }
7240
7241     /* Here both lists exist and are non-empty */
7242     array_a = invlist_array(a);
7243     array_b = invlist_array(b);
7244
7245     /* If are to take the union of 'a' with the complement of b, set it
7246      * up so are looking at b's complement. */
7247     if (complement_b) {
7248
7249         /* To complement, we invert: if the first element is 0, remove it.  To
7250          * do this, we just pretend the array starts one later, and clear the
7251          * flag as we don't have to do anything else later */
7252         if (array_b[0] == 0) {
7253             array_b++;
7254             len_b--;
7255             complement_b = FALSE;
7256         }
7257         else {
7258
7259             /* But if the first element is not zero, we unshift a 0 before the
7260              * array.  The data structure reserves a space for that 0 (which
7261              * should be a '1' right now), so physical shifting is unneeded,
7262              * but temporarily change that element to 0.  Before exiting the
7263              * routine, we must restore the element to '1' */
7264             array_b--;
7265             len_b++;
7266             array_b[0] = 0;
7267         }
7268     }
7269
7270     /* Size the union for the worst case: that the sets are completely
7271      * disjoint */
7272     u = _new_invlist(len_a + len_b);
7273
7274     /* Will contain U+0000 if either component does */
7275     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7276                                       || (len_b > 0 && array_b[0] == 0));
7277
7278     /* Go through each list item by item, stopping when exhausted one of
7279      * them */
7280     while (i_a < len_a && i_b < len_b) {
7281         UV cp;      /* The element to potentially add to the union's array */
7282         bool cp_in_set;   /* is it in the the input list's set or not */
7283
7284         /* We need to take one or the other of the two inputs for the union.
7285          * Since we are merging two sorted lists, we take the smaller of the
7286          * next items.  In case of a tie, we take the one that is in its set
7287          * first.  If we took one not in the set first, it would decrement the
7288          * count, possibly to 0 which would cause it to be output as ending the
7289          * range, and the next time through we would take the same number, and
7290          * output it again as beginning the next range.  By doing it the
7291          * opposite way, there is no possibility that the count will be
7292          * momentarily decremented to 0, and thus the two adjoining ranges will
7293          * be seamlessly merged.  (In a tie and both are in the set or both not
7294          * in the set, it doesn't matter which we take first.) */
7295         if (array_a[i_a] < array_b[i_b]
7296             || (array_a[i_a] == array_b[i_b]
7297                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7298         {
7299             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7300             cp= array_a[i_a++];
7301         }
7302         else {
7303             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7304             cp= array_b[i_b++];
7305         }
7306
7307         /* Here, have chosen which of the two inputs to look at.  Only output
7308          * if the running count changes to/from 0, which marks the
7309          * beginning/end of a range in that's in the set */
7310         if (cp_in_set) {
7311             if (count == 0) {
7312                 array_u[i_u++] = cp;
7313             }
7314             count++;
7315         }
7316         else {
7317             count--;
7318             if (count == 0) {
7319                 array_u[i_u++] = cp;
7320             }
7321         }
7322     }
7323
7324     /* Here, we are finished going through at least one of the lists, which
7325      * means there is something remaining in at most one.  We check if the list
7326      * that hasn't been exhausted is positioned such that we are in the middle
7327      * of a range in its set or not.  (i_a and i_b point to the element beyond
7328      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7329      * is potentially more to output.
7330      * There are four cases:
7331      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7332      *     in the union is entirely from the non-exhausted set.
7333      *  2) Both were in their sets, count is 2.  Nothing further should
7334      *     be output, as everything that remains will be in the exhausted
7335      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7336      *     that
7337      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7338      *     Nothing further should be output because the union includes
7339      *     everything from the exhausted set.  Not decrementing ensures that.
7340      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7341      *     decrementing to 0 insures that we look at the remainder of the
7342      *     non-exhausted set */
7343     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7344         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7345     {
7346         count--;
7347     }
7348
7349     /* The final length is what we've output so far, plus what else is about to
7350      * be output.  (If 'count' is non-zero, then the input list we exhausted
7351      * has everything remaining up to the machine's limit in its set, and hence
7352      * in the union, so there will be no further output. */
7353     len_u = i_u;
7354     if (count == 0) {
7355         /* At most one of the subexpressions will be non-zero */
7356         len_u += (len_a - i_a) + (len_b - i_b);
7357     }
7358
7359     /* Set result to final length, which can change the pointer to array_u, so
7360      * re-find it */
7361     if (len_u != invlist_len(u)) {
7362         invlist_set_len(u, len_u);
7363         invlist_trim(u);
7364         array_u = invlist_array(u);
7365     }
7366
7367     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7368      * the other) ended with everything above it not in its set.  That means
7369      * that the remaining part of the union is precisely the same as the
7370      * non-exhausted list, so can just copy it unchanged.  (If both list were
7371      * exhausted at the same time, then the operations below will be both 0.)
7372      */
7373     if (count == 0) {
7374         IV copy_count; /* At most one will have a non-zero copy count */
7375         if ((copy_count = len_a - i_a) > 0) {
7376             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7377         }
7378         else if ((copy_count = len_b - i_b) > 0) {
7379             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7380         }
7381     }
7382
7383     /*  We may be removing a reference to one of the inputs */
7384     if (a == *output || b == *output) {
7385         SvREFCNT_dec(*output);
7386     }
7387
7388     /* If we've changed b, restore it */
7389     if (complement_b) {
7390         array_b[0] = 1;
7391     }
7392
7393     *output = u;
7394     return;
7395 }
7396
7397 void
7398 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7399 {
7400     /* Take the intersection of two inversion lists and point <i> to it.  *i
7401      * should be defined upon input, and if it points to one of the two lists,
7402      * the reference count to that list will be decremented.
7403      * If <complement_b> is TRUE, the result will be the intersection of <a>
7404      * and the complement (or inversion) of <b> instead of <b> directly.
7405      *
7406      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7407      * Richard Gillam, published by Addison-Wesley, and explained at some
7408      * length there.  The preface says to incorporate its examples into your
7409      * code at your own risk.  In fact, it had bugs
7410      *
7411      * The algorithm is like a merge sort, and is essentially the same as the
7412      * union above
7413      */
7414
7415     UV* array_a;                /* a's array */
7416     UV* array_b;
7417     UV len_a;   /* length of a's array */
7418     UV len_b;
7419
7420     SV* r;                   /* the resulting intersection */
7421     UV* array_r;
7422     UV len_r;
7423
7424     UV i_a = 0;             /* current index into a's array */
7425     UV i_b = 0;
7426     UV i_r = 0;
7427
7428     /* running count, as explained in the algorithm source book; items are
7429      * stopped accumulating and are output when the count changes to/from 2.
7430      * The count is incremented when we start a range that's in the set, and
7431      * decremented when we start a range that's not in the set.  So its range
7432      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7433      */
7434     UV count = 0;
7435
7436     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7437     assert(a != b);
7438
7439     /* Special case if either one is empty */
7440     len_a = invlist_len(a);
7441     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7442
7443         if (len_a != 0 && complement_b) {
7444
7445             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7446              * be empty.  Here, also we are using 'b's complement, which hence
7447              * must be every possible code point.  Thus the intersection is
7448              * simply 'a'. */
7449             if (*i != a) {
7450                 *i = invlist_clone(a);
7451
7452                 if (*i == b) {
7453                     SvREFCNT_dec(b);
7454                 }
7455             }
7456             /* else *i is already 'a' */
7457             return;
7458         }
7459
7460         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7461          * intersection must be empty */
7462         if (*i == a) {
7463             SvREFCNT_dec(a);
7464         }
7465         else if (*i == b) {
7466             SvREFCNT_dec(b);
7467         }
7468         *i = _new_invlist(0);
7469         return;
7470     }
7471
7472     /* Here both lists exist and are non-empty */
7473     array_a = invlist_array(a);
7474     array_b = invlist_array(b);
7475
7476     /* If are to take the intersection of 'a' with the complement of b, set it
7477      * up so are looking at b's complement. */
7478     if (complement_b) {
7479
7480         /* To complement, we invert: if the first element is 0, remove it.  To
7481          * do this, we just pretend the array starts one later, and clear the
7482          * flag as we don't have to do anything else later */
7483         if (array_b[0] == 0) {
7484             array_b++;
7485             len_b--;
7486             complement_b = FALSE;
7487         }
7488         else {
7489
7490             /* But if the first element is not zero, we unshift a 0 before the
7491              * array.  The data structure reserves a space for that 0 (which
7492              * should be a '1' right now), so physical shifting is unneeded,
7493              * but temporarily change that element to 0.  Before exiting the
7494              * routine, we must restore the element to '1' */
7495             array_b--;
7496             len_b++;
7497             array_b[0] = 0;
7498         }
7499     }
7500
7501     /* Size the intersection for the worst case: that the intersection ends up
7502      * fragmenting everything to be completely disjoint */
7503     r= _new_invlist(len_a + len_b);
7504
7505     /* Will contain U+0000 iff both components do */
7506     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7507                                      && len_b > 0 && array_b[0] == 0);
7508
7509     /* Go through each list item by item, stopping when exhausted one of
7510      * them */
7511     while (i_a < len_a && i_b < len_b) {
7512         UV cp;      /* The element to potentially add to the intersection's
7513                        array */
7514         bool cp_in_set; /* Is it in the input list's set or not */
7515
7516         /* We need to take one or the other of the two inputs for the
7517          * intersection.  Since we are merging two sorted lists, we take the
7518          * smaller of the next items.  In case of a tie, we take the one that
7519          * is not in its set first (a difference from the union algorithm).  If
7520          * we took one in the set first, it would increment the count, possibly
7521          * to 2 which would cause it to be output as starting a range in the
7522          * intersection, and the next time through we would take that same
7523          * number, and output it again as ending the set.  By doing it the
7524          * opposite of this, there is no possibility that the count will be
7525          * momentarily incremented to 2.  (In a tie and both are in the set or
7526          * both not in the set, it doesn't matter which we take first.) */
7527         if (array_a[i_a] < array_b[i_b]
7528             || (array_a[i_a] == array_b[i_b]
7529                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7530         {
7531             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7532             cp= array_a[i_a++];
7533         }
7534         else {
7535             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7536             cp= array_b[i_b++];
7537         }
7538
7539         /* Here, have chosen which of the two inputs to look at.  Only output
7540          * if the running count changes to/from 2, which marks the
7541          * beginning/end of a range that's in the intersection */
7542         if (cp_in_set) {
7543             count++;
7544             if (count == 2) {
7545                 array_r[i_r++] = cp;
7546             }
7547         }
7548         else {
7549             if (count == 2) {
7550                 array_r[i_r++] = cp;
7551             }
7552             count--;
7553         }
7554     }
7555
7556     /* Here, we are finished going through at least one of the lists, which
7557      * means there is something remaining in at most one.  We check if the list
7558      * that has been exhausted is positioned such that we are in the middle
7559      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7560      * the ones we care about.)  There are four cases:
7561      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7562      *     nothing left in the intersection.
7563      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7564      *     above 2.  What should be output is exactly that which is in the
7565      *     non-exhausted set, as everything it has is also in the intersection
7566      *     set, and everything it doesn't have can't be in the intersection
7567      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7568      *     gets incremented to 2.  Like the previous case, the intersection is
7569      *     everything that remains in the non-exhausted set.
7570      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7571      *     remains 1.  And the intersection has nothing more. */
7572     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7573         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7574     {
7575         count++;
7576     }
7577
7578     /* The final length is what we've output so far plus what else is in the
7579      * intersection.  At most one of the subexpressions below will be non-zero */
7580     len_r = i_r;
7581     if (count >= 2) {
7582         len_r += (len_a - i_a) + (len_b - i_b);
7583     }
7584
7585     /* Set result to final length, which can change the pointer to array_r, so
7586      * re-find it */
7587     if (len_r != invlist_len(r)) {
7588         invlist_set_len(r, len_r);
7589         invlist_trim(r);
7590         array_r = invlist_array(r);
7591     }
7592
7593     /* Finish outputting any remaining */
7594     if (count >= 2) { /* At most one will have a non-zero copy count */
7595         IV copy_count;
7596         if ((copy_count = len_a - i_a) > 0) {
7597             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7598         }
7599         else if ((copy_count = len_b - i_b) > 0) {
7600             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7601         }
7602     }
7603
7604     /*  We may be removing a reference to one of the inputs */
7605     if (a == *i || b == *i) {
7606         SvREFCNT_dec(*i);
7607     }
7608
7609     /* If we've changed b, restore it */
7610     if (complement_b) {
7611         array_b[0] = 1;
7612     }
7613
7614     *i = r;
7615     return;
7616 }
7617
7618 SV*
7619 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7620 {
7621     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7622      * set.  A pointer to the inversion list is returned.  This may actually be
7623      * a new list, in which case the passed in one has been destroyed.  The
7624      * passed in inversion list can be NULL, in which case a new one is created
7625      * with just the one range in it */
7626
7627     SV* range_invlist;
7628     UV len;
7629
7630     if (invlist == NULL) {
7631         invlist = _new_invlist(2);
7632         len = 0;
7633     }
7634     else {
7635         len = invlist_len(invlist);
7636     }
7637
7638     /* If comes after the final entry, can just append it to the end */
7639     if (len == 0
7640         || start >= invlist_array(invlist)
7641                                     [invlist_len(invlist) - 1])
7642     {
7643         _append_range_to_invlist(invlist, start, end);
7644         return invlist;
7645     }
7646
7647     /* Here, can't just append things, create and return a new inversion list
7648      * which is the union of this range and the existing inversion list */
7649     range_invlist = _new_invlist(2);
7650     _append_range_to_invlist(range_invlist, start, end);
7651
7652     _invlist_union(invlist, range_invlist, &invlist);
7653
7654     /* The temporary can be freed */
7655     SvREFCNT_dec(range_invlist);
7656
7657     return invlist;
7658 }
7659
7660 #endif
7661
7662 PERL_STATIC_INLINE SV*
7663 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7664     return _add_range_to_invlist(invlist, cp, cp);
7665 }
7666
7667 #ifndef PERL_IN_XSUB_RE
7668 void
7669 Perl__invlist_invert(pTHX_ SV* const invlist)
7670 {
7671     /* Complement the input inversion list.  This adds a 0 if the list didn't
7672      * have a zero; removes it otherwise.  As described above, the data
7673      * structure is set up so that this is very efficient */
7674
7675     UV* len_pos = get_invlist_len_addr(invlist);
7676
7677     PERL_ARGS_ASSERT__INVLIST_INVERT;
7678
7679     /* The inverse of matching nothing is matching everything */
7680     if (*len_pos == 0) {
7681         _append_range_to_invlist(invlist, 0, UV_MAX);
7682         return;
7683     }
7684
7685     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7686      * zero element was a 0, so it is being removed, so the length decrements
7687      * by 1; and vice-versa.  SvCUR is unaffected */
7688     if (*get_invlist_zero_addr(invlist) ^= 1) {
7689         (*len_pos)--;
7690     }
7691     else {
7692         (*len_pos)++;
7693     }
7694 }
7695
7696 void
7697 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7698 {
7699     /* Complement the input inversion list (which must be a Unicode property,
7700      * all of which don't match above the Unicode maximum code point.)  And
7701      * Perl has chosen to not have the inversion match above that either.  This
7702      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7703      */
7704
7705     UV len;
7706     UV* array;
7707
7708     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7709
7710     _invlist_invert(invlist);
7711
7712     len = invlist_len(invlist);
7713
7714     if (len != 0) { /* If empty do nothing */
7715         array = invlist_array(invlist);
7716         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7717             /* Add 0x110000.  First, grow if necessary */
7718             len++;
7719             if (invlist_max(invlist) < len) {
7720                 invlist_extend(invlist, len);
7721                 array = invlist_array(invlist);
7722             }
7723             invlist_set_len(invlist, len);
7724             array[len - 1] = PERL_UNICODE_MAX + 1;
7725         }
7726         else {  /* Remove the 0x110000 */
7727             invlist_set_len(invlist, len - 1);
7728         }
7729     }
7730
7731     return;
7732 }
7733 #endif
7734
7735 PERL_STATIC_INLINE SV*
7736 S_invlist_clone(pTHX_ SV* const invlist)
7737 {
7738
7739     /* Return a new inversion list that is a copy of the input one, which is
7740      * unchanged */
7741
7742     /* Need to allocate extra space to accommodate Perl's addition of a
7743      * trailing NUL to SvPV's, since it thinks they are always strings */
7744     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7745     STRLEN length = SvCUR(invlist);
7746
7747     PERL_ARGS_ASSERT_INVLIST_CLONE;
7748
7749     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7750     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7751
7752     return new_invlist;
7753 }
7754
7755 PERL_STATIC_INLINE UV*
7756 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7757 {
7758     /* Return the address of the UV that contains the current iteration
7759      * position */
7760
7761     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7762
7763     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7764 }
7765
7766 PERL_STATIC_INLINE UV*
7767 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7768 {
7769     /* Return the address of the UV that contains the version id. */
7770
7771     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7772
7773     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7774 }
7775
7776 PERL_STATIC_INLINE void
7777 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7778 {
7779     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7780
7781     *get_invlist_iter_addr(invlist) = 0;
7782 }
7783
7784 STATIC bool
7785 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7786 {
7787     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7788      * This call sets in <*start> and <*end>, the next range in <invlist>.
7789      * Returns <TRUE> if successful and the next call will return the next
7790      * range; <FALSE> if was already at the end of the list.  If the latter,
7791      * <*start> and <*end> are unchanged, and the next call to this function
7792      * will start over at the beginning of the list */
7793
7794     UV* pos = get_invlist_iter_addr(invlist);
7795     UV len = invlist_len(invlist);
7796     UV *array;
7797
7798     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7799
7800     if (*pos >= len) {
7801         *pos = UV_MAX;  /* Force iternit() to be required next time */
7802         return FALSE;
7803     }
7804
7805     array = invlist_array(invlist);
7806
7807     *start = array[(*pos)++];
7808
7809     if (*pos >= len) {
7810         *end = UV_MAX;
7811     }
7812     else {
7813         *end = array[(*pos)++] - 1;
7814     }
7815
7816     return TRUE;
7817 }
7818
7819 #ifndef PERL_IN_XSUB_RE
7820 SV *
7821 Perl__invlist_contents(pTHX_ SV* const invlist)
7822 {
7823     /* Get the contents of an inversion list into a string SV so that they can
7824      * be printed out.  It uses the format traditionally done for debug tracing
7825      */
7826
7827     UV start, end;
7828     SV* output = newSVpvs("\n");
7829
7830     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7831
7832     invlist_iterinit(invlist);
7833     while (invlist_iternext(invlist, &start, &end)) {
7834         if (end == UV_MAX) {
7835             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7836         }
7837         else if (end != start) {
7838             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7839                     start,       end);
7840         }
7841         else {
7842             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7843         }
7844     }
7845
7846     return output;
7847 }
7848 #endif
7849
7850 #if 0
7851 void
7852 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7853 {
7854     /* Dumps out the ranges in an inversion list.  The string 'header'
7855      * if present is output on a line before the first range */
7856
7857     UV start, end;
7858
7859     if (header && strlen(header)) {
7860         PerlIO_printf(Perl_debug_log, "%s\n", header);
7861     }
7862     invlist_iterinit(invlist);
7863     while (invlist_iternext(invlist, &start, &end)) {
7864         if (end == UV_MAX) {
7865             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7866         }
7867         else {
7868             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7869         }
7870     }
7871 }
7872 #endif
7873
7874 #undef HEADER_LENGTH
7875 #undef INVLIST_INITIAL_LENGTH
7876 #undef TO_INTERNAL_SIZE
7877 #undef FROM_INTERNAL_SIZE
7878 #undef INVLIST_LEN_OFFSET
7879 #undef INVLIST_ZERO_OFFSET
7880 #undef INVLIST_ITER_OFFSET
7881 #undef INVLIST_VERSION_ID
7882
7883 /* End of inversion list object */
7884
7885 /*
7886  - reg - regular expression, i.e. main body or parenthesized thing
7887  *
7888  * Caller must absorb opening parenthesis.
7889  *
7890  * Combining parenthesis handling with the base level of regular expression
7891  * is a trifle forced, but the need to tie the tails of the branches to what
7892  * follows makes it hard to avoid.
7893  */
7894 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7895 #ifdef DEBUGGING
7896 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7897 #else
7898 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7899 #endif
7900
7901 STATIC regnode *
7902 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7903     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7904 {
7905     dVAR;
7906     register regnode *ret;              /* Will be the head of the group. */
7907     register regnode *br;
7908     register regnode *lastbr;
7909     register regnode *ender = NULL;
7910     register I32 parno = 0;
7911     I32 flags;
7912     U32 oregflags = RExC_flags;
7913     bool have_branch = 0;
7914     bool is_open = 0;
7915     I32 freeze_paren = 0;
7916     I32 after_freeze = 0;
7917
7918     /* for (?g), (?gc), and (?o) warnings; warning
7919        about (?c) will warn about (?g) -- japhy    */
7920
7921 #define WASTED_O  0x01
7922 #define WASTED_G  0x02
7923 #define WASTED_C  0x04
7924 #define WASTED_GC (0x02|0x04)
7925     I32 wastedflags = 0x00;
7926
7927     char * parse_start = RExC_parse; /* MJD */
7928     char * const oregcomp_parse = RExC_parse;
7929
7930     GET_RE_DEBUG_FLAGS_DECL;
7931
7932     PERL_ARGS_ASSERT_REG;
7933     DEBUG_PARSE("reg ");
7934
7935     *flagp = 0;                         /* Tentatively. */
7936
7937
7938     /* Make an OPEN node, if parenthesized. */
7939     if (paren) {
7940         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7941             char *start_verb = RExC_parse;
7942             STRLEN verb_len = 0;
7943             char *start_arg = NULL;
7944             unsigned char op = 0;
7945             int argok = 1;
7946             int internal_argval = 0; /* internal_argval is only useful if !argok */
7947             while ( *RExC_parse && *RExC_parse != ')' ) {
7948                 if ( *RExC_parse == ':' ) {
7949                     start_arg = RExC_parse + 1;
7950                     break;
7951                 }
7952                 RExC_parse++;
7953             }
7954             ++start_verb;
7955             verb_len = RExC_parse - start_verb;
7956             if ( start_arg ) {
7957                 RExC_parse++;
7958                 while ( *RExC_parse && *RExC_parse != ')' )
7959                     RExC_parse++;
7960                 if ( *RExC_parse != ')' )
7961                     vFAIL("Unterminated verb pattern argument");
7962                 if ( RExC_parse == start_arg )
7963                     start_arg = NULL;
7964             } else {
7965                 if ( *RExC_parse != ')' )
7966                     vFAIL("Unterminated verb pattern");
7967             }
7968
7969             switch ( *start_verb ) {
7970             case 'A':  /* (*ACCEPT) */
7971                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7972                     op = ACCEPT;
7973                     internal_argval = RExC_nestroot;
7974                 }
7975                 break;
7976             case 'C':  /* (*COMMIT) */
7977                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7978                     op = COMMIT;
7979                 break;
7980             case 'F':  /* (*FAIL) */
7981                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7982                     op = OPFAIL;
7983                     argok = 0;
7984                 }
7985                 break;
7986             case ':':  /* (*:NAME) */
7987             case 'M':  /* (*MARK:NAME) */
7988                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7989                     op = MARKPOINT;
7990                     argok = -1;
7991                 }
7992                 break;
7993             case 'P':  /* (*PRUNE) */
7994                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7995                     op = PRUNE;
7996                 break;
7997             case 'S':   /* (*SKIP) */
7998                 if ( memEQs(start_verb,verb_len,"SKIP") )
7999                     op = SKIP;
8000                 break;
8001             case 'T':  /* (*THEN) */
8002                 /* [19:06] <TimToady> :: is then */
8003                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8004                     op = CUTGROUP;
8005                     RExC_seen |= REG_SEEN_CUTGROUP;
8006                 }
8007                 break;
8008             }
8009             if ( ! op ) {
8010                 RExC_parse++;
8011                 vFAIL3("Unknown verb pattern '%.*s'",
8012                     verb_len, start_verb);
8013             }
8014             if ( argok ) {
8015                 if ( start_arg && internal_argval ) {
8016                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8017                         verb_len, start_verb);
8018                 } else if ( argok < 0 && !start_arg ) {
8019                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8020                         verb_len, start_verb);
8021                 } else {
8022                     ret = reganode(pRExC_state, op, internal_argval);
8023                     if ( ! internal_argval && ! SIZE_ONLY ) {
8024                         if (start_arg) {
8025                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8026                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8027                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8028                             ret->flags = 0;
8029                         } else {
8030                             ret->flags = 1;
8031                         }
8032                     }
8033                 }
8034                 if (!internal_argval)
8035                     RExC_seen |= REG_SEEN_VERBARG;
8036             } else if ( start_arg ) {
8037                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8038                         verb_len, start_verb);
8039             } else {
8040                 ret = reg_node(pRExC_state, op);
8041             }
8042             nextchar(pRExC_state);
8043             return ret;
8044         } else
8045         if (*RExC_parse == '?') { /* (?...) */
8046             bool is_logical = 0;
8047             const char * const seqstart = RExC_parse;
8048             bool has_use_defaults = FALSE;
8049
8050             RExC_parse++;
8051             paren = *RExC_parse++;
8052             ret = NULL;                 /* For look-ahead/behind. */
8053             switch (paren) {
8054
8055             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8056                 paren = *RExC_parse++;
8057                 if ( paren == '<')         /* (?P<...>) named capture */
8058                     goto named_capture;
8059                 else if (paren == '>') {   /* (?P>name) named recursion */
8060                     goto named_recursion;
8061                 }
8062                 else if (paren == '=') {   /* (?P=...)  named backref */
8063                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8064                        you change this make sure you change that */
8065                     char* name_start = RExC_parse;
8066                     U32 num = 0;
8067                     SV *sv_dat = reg_scan_name(pRExC_state,
8068                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8069                     if (RExC_parse == name_start || *RExC_parse != ')')
8070                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8071
8072                     if (!SIZE_ONLY) {
8073                         num = add_data( pRExC_state, 1, "S" );
8074                         RExC_rxi->data->data[num]=(void*)sv_dat;
8075                         SvREFCNT_inc_simple_void(sv_dat);
8076                     }
8077                     RExC_sawback = 1;
8078                     ret = reganode(pRExC_state,
8079                                    ((! FOLD)
8080                                      ? NREF
8081                                      : (MORE_ASCII_RESTRICTED)
8082                                        ? NREFFA
8083                                        : (AT_LEAST_UNI_SEMANTICS)
8084                                          ? NREFFU
8085                                          : (LOC)
8086                                            ? NREFFL
8087                                            : NREFF),
8088                                     num);
8089                     *flagp |= HASWIDTH;
8090
8091                     Set_Node_Offset(ret, parse_start+1);
8092                     Set_Node_Cur_Length(ret); /* MJD */
8093
8094                     nextchar(pRExC_state);
8095                     return ret;
8096                 }
8097                 RExC_parse++;
8098                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8099                 /*NOTREACHED*/
8100             case '<':           /* (?<...) */
8101                 if (*RExC_parse == '!')
8102                     paren = ',';
8103                 else if (*RExC_parse != '=')
8104               named_capture:
8105                 {               /* (?<...>) */
8106                     char *name_start;
8107                     SV *svname;
8108                     paren= '>';
8109             case '\'':          /* (?'...') */
8110                     name_start= RExC_parse;
8111                     svname = reg_scan_name(pRExC_state,
8112                         SIZE_ONLY ?  /* reverse test from the others */
8113                         REG_RSN_RETURN_NAME :
8114                         REG_RSN_RETURN_NULL);
8115                     if (RExC_parse == name_start) {
8116                         RExC_parse++;
8117                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8118                         /*NOTREACHED*/
8119                     }
8120                     if (*RExC_parse != paren)
8121                         vFAIL2("Sequence (?%c... not terminated",
8122                             paren=='>' ? '<' : paren);
8123                     if (SIZE_ONLY) {
8124                         HE *he_str;
8125                         SV *sv_dat = NULL;
8126                         if (!svname) /* shouldn't happen */
8127                             Perl_croak(aTHX_
8128                                 "panic: reg_scan_name returned NULL");
8129                         if (!RExC_paren_names) {
8130                             RExC_paren_names= newHV();
8131                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8132 #ifdef DEBUGGING
8133                             RExC_paren_name_list= newAV();
8134                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8135 #endif
8136                         }
8137                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8138                         if ( he_str )
8139                             sv_dat = HeVAL(he_str);
8140                         if ( ! sv_dat ) {
8141                             /* croak baby croak */
8142                             Perl_croak(aTHX_
8143                                 "panic: paren_name hash element allocation failed");
8144                         } else if ( SvPOK(sv_dat) ) {
8145                             /* (?|...) can mean we have dupes so scan to check
8146                                its already been stored. Maybe a flag indicating
8147                                we are inside such a construct would be useful,
8148                                but the arrays are likely to be quite small, so
8149                                for now we punt -- dmq */
8150                             IV count = SvIV(sv_dat);
8151                             I32 *pv = (I32*)SvPVX(sv_dat);
8152                             IV i;
8153                             for ( i = 0 ; i < count ; i++ ) {
8154                                 if ( pv[i] == RExC_npar ) {
8155                                     count = 0;
8156                                     break;
8157                                 }
8158                             }
8159                             if ( count ) {
8160                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8161                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8162                                 pv[count] = RExC_npar;
8163                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8164                             }
8165                         } else {
8166                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8167                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8168                             SvIOK_on(sv_dat);
8169                             SvIV_set(sv_dat, 1);
8170                         }
8171 #ifdef DEBUGGING
8172                         /* Yes this does cause a memory leak in debugging Perls */
8173                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8174                             SvREFCNT_dec(svname);
8175 #endif
8176
8177                         /*sv_dump(sv_dat);*/
8178                     }
8179                     nextchar(pRExC_state);
8180                     paren = 1;
8181                     goto capturing_parens;
8182                 }
8183                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8184                 RExC_in_lookbehind++;
8185                 RExC_parse++;
8186             case '=':           /* (?=...) */
8187                 RExC_seen_zerolen++;
8188                 break;
8189             case '!':           /* (?!...) */
8190                 RExC_seen_zerolen++;
8191                 if (*RExC_parse == ')') {
8192                     ret=reg_node(pRExC_state, OPFAIL);
8193                     nextchar(pRExC_state);
8194                     return ret;
8195                 }
8196                 break;
8197             case '|':           /* (?|...) */
8198                 /* branch reset, behave like a (?:...) except that
8199                    buffers in alternations share the same numbers */
8200                 paren = ':';
8201                 after_freeze = freeze_paren = RExC_npar;
8202                 break;
8203             case ':':           /* (?:...) */
8204             case '>':           /* (?>...) */
8205                 break;
8206             case '$':           /* (?$...) */
8207             case '@':           /* (?@...) */
8208                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8209                 break;
8210             case '#':           /* (?#...) */
8211                 while (*RExC_parse && *RExC_parse != ')')
8212                     RExC_parse++;
8213                 if (*RExC_parse != ')')
8214                     FAIL("Sequence (?#... not terminated");
8215                 nextchar(pRExC_state);
8216                 *flagp = TRYAGAIN;
8217                 return NULL;
8218             case '0' :           /* (?0) */
8219             case 'R' :           /* (?R) */
8220                 if (*RExC_parse != ')')
8221                     FAIL("Sequence (?R) not terminated");
8222                 ret = reg_node(pRExC_state, GOSTART);
8223                 *flagp |= POSTPONED;
8224                 nextchar(pRExC_state);
8225                 return ret;
8226                 /*notreached*/
8227             { /* named and numeric backreferences */
8228                 I32 num;
8229             case '&':            /* (?&NAME) */
8230                 parse_start = RExC_parse - 1;
8231               named_recursion:
8232                 {
8233                     SV *sv_dat = reg_scan_name(pRExC_state,
8234                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8235                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8236                 }
8237                 goto gen_recurse_regop;
8238                 /* NOT REACHED */
8239             case '+':
8240                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8241                     RExC_parse++;
8242                     vFAIL("Illegal pattern");
8243                 }
8244                 goto parse_recursion;
8245                 /* NOT REACHED*/
8246             case '-': /* (?-1) */
8247                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8248                     RExC_parse--; /* rewind to let it be handled later */
8249                     goto parse_flags;
8250                 }
8251                 /*FALLTHROUGH */
8252             case '1': case '2': case '3': case '4': /* (?1) */
8253             case '5': case '6': case '7': case '8': case '9':
8254                 RExC_parse--;
8255               parse_recursion:
8256                 num = atoi(RExC_parse);
8257                 parse_start = RExC_parse - 1; /* MJD */
8258                 if (*RExC_parse == '-')
8259                     RExC_parse++;
8260                 while (isDIGIT(*RExC_parse))
8261                         RExC_parse++;
8262                 if (*RExC_parse!=')')
8263                     vFAIL("Expecting close bracket");
8264
8265               gen_recurse_regop:
8266                 if ( paren == '-' ) {
8267                     /*
8268                     Diagram of capture buffer numbering.
8269                     Top line is the normal capture buffer numbers
8270                     Bottom line is the negative indexing as from
8271                     the X (the (?-2))
8272
8273                     +   1 2    3 4 5 X          6 7
8274                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8275                     -   5 4    3 2 1 X          x x
8276
8277                     */
8278                     num = RExC_npar + num;
8279                     if (num < 1)  {
8280                         RExC_parse++;
8281                         vFAIL("Reference to nonexistent group");
8282                     }
8283                 } else if ( paren == '+' ) {
8284                     num = RExC_npar + num - 1;
8285                 }
8286
8287                 ret = reganode(pRExC_state, GOSUB, num);
8288                 if (!SIZE_ONLY) {
8289                     if (num > (I32)RExC_rx->nparens) {
8290                         RExC_parse++;
8291                         vFAIL("Reference to nonexistent group");
8292                     }
8293                     ARG2L_SET( ret, RExC_recurse_count++);
8294                     RExC_emit++;
8295                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8296                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8297                 } else {
8298                     RExC_size++;
8299                 }
8300                 RExC_seen |= REG_SEEN_RECURSE;
8301                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8302                 Set_Node_Offset(ret, parse_start); /* MJD */
8303
8304                 *flagp |= POSTPONED;
8305                 nextchar(pRExC_state);
8306                 return ret;
8307             } /* named and numeric backreferences */
8308             /* NOT REACHED */
8309
8310             case '?':           /* (??...) */
8311                 is_logical = 1;
8312                 if (*RExC_parse != '{') {
8313                     RExC_parse++;
8314                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8315                     /*NOTREACHED*/
8316                 }
8317                 *flagp |= POSTPONED;
8318                 paren = *RExC_parse++;
8319                 /* FALL THROUGH */
8320             case '{':           /* (?{...}) */
8321             {
8322                 I32 count = 1;
8323                 U32 n = 0;
8324                 char c;
8325                 char *s = RExC_parse;
8326
8327                 RExC_seen_zerolen++;
8328                 RExC_seen |= REG_SEEN_EVAL;
8329
8330                 if (   pRExC_state->num_code_blocks
8331                     && pRExC_state->code_index < pRExC_state->num_code_blocks
8332                     && pRExC_state->code_blocks[pRExC_state->code_index].start
8333                         == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8334                             - RExC_start)
8335                 ) {
8336                     /* this is a pre-compiled literal (?{}) */
8337                     struct reg_code_block *cb =
8338                         &pRExC_state->code_blocks[pRExC_state->code_index];
8339                     RExC_parse = RExC_start + cb->end;
8340                     if (SIZE_ONLY)
8341                         RExC_seen_evals++;
8342                     else {
8343                         OP *o = cb->block;
8344                         if (cb->src_regex) {
8345                             n = add_data(pRExC_state, 2, "rl");
8346                             RExC_rxi->data->data[n] =
8347                                 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8348                         RExC_rxi->data->data[n+1] = (void*)o->op_next;
8349                         }
8350                         else {
8351                             n = add_data(pRExC_state, 1,
8352                                    (RExC_flags & PMf_HAS_CV) ? "L" : "l");
8353                             RExC_rxi->data->data[n] = (void*)o->op_next;
8354                         }
8355                     }
8356                     pRExC_state->code_index++;
8357                 }
8358                 else {
8359                     while (count && (c = *RExC_parse)) {
8360                         if (c == '\\') {
8361                             if (RExC_parse[1])
8362                                 RExC_parse++;
8363                         }
8364                         else if (c == '{')
8365                             count++;
8366                         else if (c == '}')
8367                             count--;
8368                         RExC_parse++;
8369                     }
8370                     if (*RExC_parse != ')') {
8371                         RExC_parse = s;
8372                         vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
8373                     }
8374                     if (!SIZE_ONLY) {
8375                         PAD *pad;
8376                         OP_4tree *sop, *rop;
8377                         SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
8378
8379                         ENTER;
8380                         Perl_save_re_context(aTHX);
8381                         rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
8382                         sop->op_private |= OPpREFCOUNTED;
8383                         /* re_dup will OpREFCNT_inc */
8384                         OpREFCNT_set(sop, 1);
8385                         LEAVE;
8386
8387                         n = add_data(pRExC_state, 3, "nop");
8388                         RExC_rxi->data->data[n] = (void*)rop;
8389                         RExC_rxi->data->data[n+1] = (void*)sop;
8390                         RExC_rxi->data->data[n+2] = (void*)pad;
8391                         SvREFCNT_dec(sv);
8392                     }
8393                     else {                                              /* First pass */
8394                         if (PL_reginterp_cnt < ++RExC_seen_evals
8395                             && IN_PERL_RUNTIME)
8396                             /* No compiled RE interpolated, has runtime
8397                                components ===> unsafe.  */
8398                             FAIL("Eval-group not allowed at runtime, use re 'eval'");
8399                         if (PL_tainting && PL_tainted)
8400                             FAIL("Eval-group in insecure regular expression");
8401     #if PERL_VERSION > 8
8402                         if (IN_PERL_COMPILETIME)
8403                             PL_cv_has_eval = 1;
8404     #endif
8405                     }
8406                 }
8407                 nextchar(pRExC_state);
8408
8409                 if (is_logical) {
8410                     ret = reg_node(pRExC_state, LOGICAL);
8411                     if (!SIZE_ONLY)
8412                         ret->flags = 2;
8413                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
8414                     /* deal with the length of this later - MJD */
8415                     return ret;
8416                 }
8417                 ret = reganode(pRExC_state, EVAL, n);
8418                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8419                 Set_Node_Offset(ret, parse_start);
8420                 return ret;
8421             }
8422             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8423             {
8424                 int is_define= 0;
8425                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8426                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8427                         || RExC_parse[1] == '<'
8428                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8429                         I32 flag;
8430
8431                         ret = reg_node(pRExC_state, LOGICAL);
8432                         if (!SIZE_ONLY)
8433                             ret->flags = 1;
8434                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8435                         goto insert_if;
8436                     }
8437                 }
8438                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8439                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8440                 {
8441                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8442                     char *name_start= RExC_parse++;
8443                     U32 num = 0;
8444                     SV *sv_dat=reg_scan_name(pRExC_state,
8445                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8446                     if (RExC_parse == name_start || *RExC_parse != ch)
8447                         vFAIL2("Sequence (?(%c... not terminated",
8448                             (ch == '>' ? '<' : ch));
8449                     RExC_parse++;
8450                     if (!SIZE_ONLY) {
8451                         num = add_data( pRExC_state, 1, "S" );
8452                         RExC_rxi->data->data[num]=(void*)sv_dat;
8453                         SvREFCNT_inc_simple_void(sv_dat);
8454                     }
8455                     ret = reganode(pRExC_state,NGROUPP,num);
8456                     goto insert_if_check_paren;
8457                 }
8458                 else if (RExC_parse[0] == 'D' &&
8459                          RExC_parse[1] == 'E' &&
8460                          RExC_parse[2] == 'F' &&
8461                          RExC_parse[3] == 'I' &&
8462                          RExC_parse[4] == 'N' &&
8463                          RExC_parse[5] == 'E')
8464                 {
8465                     ret = reganode(pRExC_state,DEFINEP,0);
8466                     RExC_parse +=6 ;
8467                     is_define = 1;
8468                     goto insert_if_check_paren;
8469                 }
8470                 else if (RExC_parse[0] == 'R') {
8471                     RExC_parse++;
8472                     parno = 0;
8473                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8474                         parno = atoi(RExC_parse++);
8475                         while (isDIGIT(*RExC_parse))
8476                             RExC_parse++;
8477                     } else if (RExC_parse[0] == '&') {
8478                         SV *sv_dat;
8479                         RExC_parse++;
8480                         sv_dat = reg_scan_name(pRExC_state,
8481                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8482                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8483                     }
8484                     ret = reganode(pRExC_state,INSUBP,parno);
8485                     goto insert_if_check_paren;
8486                 }
8487                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8488                     /* (?(1)...) */
8489                     char c;
8490                     parno = atoi(RExC_parse++);
8491
8492                     while (isDIGIT(*RExC_parse))
8493                         RExC_parse++;
8494                     ret = reganode(pRExC_state, GROUPP, parno);
8495
8496                  insert_if_check_paren:
8497                     if ((c = *nextchar(pRExC_state)) != ')')
8498                         vFAIL("Switch condition not recognized");
8499                   insert_if:
8500                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8501                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8502                     if (br == NULL)
8503                         br = reganode(pRExC_state, LONGJMP, 0);
8504                     else
8505                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8506                     c = *nextchar(pRExC_state);
8507                     if (flags&HASWIDTH)
8508                         *flagp |= HASWIDTH;
8509                     if (c == '|') {
8510                         if (is_define)
8511                             vFAIL("(?(DEFINE)....) does not allow branches");
8512                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8513                         regbranch(pRExC_state, &flags, 1,depth+1);
8514                         REGTAIL(pRExC_state, ret, lastbr);
8515                         if (flags&HASWIDTH)
8516                             *flagp |= HASWIDTH;
8517                         c = *nextchar(pRExC_state);
8518                     }
8519                     else
8520                         lastbr = NULL;
8521                     if (c != ')')
8522                         vFAIL("Switch (?(condition)... contains too many branches");
8523                     ender = reg_node(pRExC_state, TAIL);
8524                     REGTAIL(pRExC_state, br, ender);
8525                     if (lastbr) {
8526                         REGTAIL(pRExC_state, lastbr, ender);
8527                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8528                     }
8529                     else
8530                         REGTAIL(pRExC_state, ret, ender);
8531                     RExC_size++; /* XXX WHY do we need this?!!
8532                                     For large programs it seems to be required
8533                                     but I can't figure out why. -- dmq*/
8534                     return ret;
8535                 }
8536                 else {
8537                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8538                 }
8539             }
8540             case 0:
8541                 RExC_parse--; /* for vFAIL to print correctly */
8542                 vFAIL("Sequence (? incomplete");
8543                 break;
8544             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8545                                        that follow */
8546                 has_use_defaults = TRUE;
8547                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8548                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8549                                                 ? REGEX_UNICODE_CHARSET
8550                                                 : REGEX_DEPENDS_CHARSET);
8551                 goto parse_flags;
8552             default:
8553                 --RExC_parse;
8554                 parse_flags:      /* (?i) */
8555             {
8556                 U32 posflags = 0, negflags = 0;
8557                 U32 *flagsp = &posflags;
8558                 char has_charset_modifier = '\0';
8559                 regex_charset cs = get_regex_charset(RExC_flags);
8560                 if (cs == REGEX_DEPENDS_CHARSET
8561                     && (RExC_utf8 || RExC_uni_semantics))
8562                 {
8563                     cs = REGEX_UNICODE_CHARSET;
8564                 }
8565
8566                 while (*RExC_parse) {
8567                     /* && strchr("iogcmsx", *RExC_parse) */
8568                     /* (?g), (?gc) and (?o) are useless here
8569                        and must be globally applied -- japhy */
8570                     switch (*RExC_parse) {
8571                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8572                     case LOCALE_PAT_MOD:
8573                         if (has_charset_modifier) {
8574                             goto excess_modifier;
8575                         }
8576                         else if (flagsp == &negflags) {
8577                             goto neg_modifier;
8578                         }
8579                         cs = REGEX_LOCALE_CHARSET;
8580                         has_charset_modifier = LOCALE_PAT_MOD;
8581                         RExC_contains_locale = 1;
8582                         break;
8583                     case UNICODE_PAT_MOD:
8584                         if (has_charset_modifier) {
8585                             goto excess_modifier;
8586                         }
8587                         else if (flagsp == &negflags) {
8588                             goto neg_modifier;
8589                         }
8590                         cs = REGEX_UNICODE_CHARSET;
8591                         has_charset_modifier = UNICODE_PAT_MOD;
8592                         break;
8593                     case ASCII_RESTRICT_PAT_MOD:
8594                         if (flagsp == &negflags) {
8595                             goto neg_modifier;
8596                         }
8597                         if (has_charset_modifier) {
8598                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8599                                 goto excess_modifier;
8600                             }
8601                             /* Doubled modifier implies more restricted */
8602                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8603                         }
8604                         else {
8605                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8606                         }
8607                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8608                         break;
8609                     case DEPENDS_PAT_MOD:
8610                         if (has_use_defaults) {
8611                             goto fail_modifiers;
8612                         }
8613                         else if (flagsp == &negflags) {
8614                             goto neg_modifier;
8615                         }
8616                         else if (has_charset_modifier) {
8617                             goto excess_modifier;
8618                         }
8619
8620                         /* The dual charset means unicode semantics if the
8621                          * pattern (or target, not known until runtime) are
8622                          * utf8, or something in the pattern indicates unicode
8623                          * semantics */
8624                         cs = (RExC_utf8 || RExC_uni_semantics)
8625                              ? REGEX_UNICODE_CHARSET
8626                              : REGEX_DEPENDS_CHARSET;
8627                         has_charset_modifier = DEPENDS_PAT_MOD;
8628                         break;
8629                     excess_modifier:
8630                         RExC_parse++;
8631                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8632                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8633                         }
8634                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8635                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8636                         }
8637                         else {
8638                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8639                         }
8640                         /*NOTREACHED*/
8641                     neg_modifier:
8642                         RExC_parse++;
8643                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8644                         /*NOTREACHED*/
8645                     case ONCE_PAT_MOD: /* 'o' */
8646                     case GLOBAL_PAT_MOD: /* 'g' */
8647                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8648                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8649                             if (! (wastedflags & wflagbit) ) {
8650                                 wastedflags |= wflagbit;
8651                                 vWARN5(
8652                                     RExC_parse + 1,
8653                                     "Useless (%s%c) - %suse /%c modifier",
8654                                     flagsp == &negflags ? "?-" : "?",
8655                                     *RExC_parse,
8656                                     flagsp == &negflags ? "don't " : "",
8657                                     *RExC_parse
8658                                 );
8659                             }
8660                         }
8661                         break;
8662
8663                     case CONTINUE_PAT_MOD: /* 'c' */
8664                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8665                             if (! (wastedflags & WASTED_C) ) {
8666                                 wastedflags |= WASTED_GC;
8667                                 vWARN3(
8668                                     RExC_parse + 1,
8669                                     "Useless (%sc) - %suse /gc modifier",
8670                                     flagsp == &negflags ? "?-" : "?",
8671                                     flagsp == &negflags ? "don't " : ""
8672                                 );
8673                             }
8674                         }
8675                         break;
8676                     case KEEPCOPY_PAT_MOD: /* 'p' */
8677                         if (flagsp == &negflags) {
8678                             if (SIZE_ONLY)
8679                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8680                         } else {
8681                             *flagsp |= RXf_PMf_KEEPCOPY;
8682                         }
8683                         break;
8684                     case '-':
8685                         /* A flag is a default iff it is following a minus, so
8686                          * if there is a minus, it means will be trying to
8687                          * re-specify a default which is an error */
8688                         if (has_use_defaults || flagsp == &negflags) {
8689             fail_modifiers:
8690                             RExC_parse++;
8691                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8692                             /*NOTREACHED*/
8693                         }
8694                         flagsp = &negflags;
8695                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8696                         break;
8697                     case ':':
8698                         paren = ':';
8699                         /*FALLTHROUGH*/
8700                     case ')':
8701                         RExC_flags |= posflags;
8702                         RExC_flags &= ~negflags;
8703                         set_regex_charset(&RExC_flags, cs);
8704                         if (paren != ':') {
8705                             oregflags |= posflags;
8706                             oregflags &= ~negflags;
8707                             set_regex_charset(&oregflags, cs);
8708                         }
8709                         nextchar(pRExC_state);
8710                         if (paren != ':') {
8711                             *flagp = TRYAGAIN;
8712                             return NULL;
8713                         } else {
8714                             ret = NULL;
8715                             goto parse_rest;
8716                         }
8717                         /*NOTREACHED*/
8718                     default:
8719                         RExC_parse++;
8720                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8721                         /*NOTREACHED*/
8722                     }
8723                     ++RExC_parse;
8724                 }
8725             }} /* one for the default block, one for the switch */
8726         }
8727         else {                  /* (...) */
8728           capturing_parens:
8729             parno = RExC_npar;
8730             RExC_npar++;
8731
8732             ret = reganode(pRExC_state, OPEN, parno);
8733             if (!SIZE_ONLY ){
8734                 if (!RExC_nestroot)
8735                     RExC_nestroot = parno;
8736                 if (RExC_seen & REG_SEEN_RECURSE
8737                     && !RExC_open_parens[parno-1])
8738                 {
8739                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8740                         "Setting open paren #%"IVdf" to %d\n",
8741                         (IV)parno, REG_NODE_NUM(ret)));
8742                     RExC_open_parens[parno-1]= ret;
8743                 }
8744             }
8745             Set_Node_Length(ret, 1); /* MJD */
8746             Set_Node_Offset(ret, RExC_parse); /* MJD */
8747             is_open = 1;
8748         }
8749     }
8750     else                        /* ! paren */
8751         ret = NULL;
8752
8753    parse_rest:
8754     /* Pick up the branches, linking them together. */
8755     parse_start = RExC_parse;   /* MJD */
8756     br = regbranch(pRExC_state, &flags, 1,depth+1);
8757
8758     /*     branch_len = (paren != 0); */
8759
8760     if (br == NULL)
8761         return(NULL);
8762     if (*RExC_parse == '|') {
8763         if (!SIZE_ONLY && RExC_extralen) {
8764             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8765         }
8766         else {                  /* MJD */
8767             reginsert(pRExC_state, BRANCH, br, depth+1);
8768             Set_Node_Length(br, paren != 0);
8769             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8770         }
8771         have_branch = 1;
8772         if (SIZE_ONLY)
8773             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8774     }
8775     else if (paren == ':') {
8776         *flagp |= flags&SIMPLE;
8777     }
8778     if (is_open) {                              /* Starts with OPEN. */
8779         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8780     }
8781     else if (paren != '?')              /* Not Conditional */
8782         ret = br;
8783     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8784     lastbr = br;
8785     while (*RExC_parse == '|') {
8786         if (!SIZE_ONLY && RExC_extralen) {
8787             ender = reganode(pRExC_state, LONGJMP,0);
8788             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8789         }
8790         if (SIZE_ONLY)
8791             RExC_extralen += 2;         /* Account for LONGJMP. */
8792         nextchar(pRExC_state);
8793         if (freeze_paren) {
8794             if (RExC_npar > after_freeze)
8795                 after_freeze = RExC_npar;
8796             RExC_npar = freeze_paren;
8797         }
8798         br = regbranch(pRExC_state, &flags, 0, depth+1);
8799
8800         if (br == NULL)
8801             return(NULL);
8802         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8803         lastbr = br;
8804         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8805     }
8806
8807     if (have_branch || paren != ':') {
8808         /* Make a closing node, and hook it on the end. */
8809         switch (paren) {
8810         case ':':
8811             ender = reg_node(pRExC_state, TAIL);
8812             break;
8813         case 1:
8814             ender = reganode(pRExC_state, CLOSE, parno);
8815             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8816                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8817                         "Setting close paren #%"IVdf" to %d\n",
8818                         (IV)parno, REG_NODE_NUM(ender)));
8819                 RExC_close_parens[parno-1]= ender;
8820                 if (RExC_nestroot == parno)
8821                     RExC_nestroot = 0;
8822             }
8823             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8824             Set_Node_Length(ender,1); /* MJD */
8825             break;
8826         case '<':
8827         case ',':
8828         case '=':
8829         case '!':
8830             *flagp &= ~HASWIDTH;
8831             /* FALL THROUGH */
8832         case '>':
8833             ender = reg_node(pRExC_state, SUCCEED);
8834             break;
8835         case 0:
8836             ender = reg_node(pRExC_state, END);
8837             if (!SIZE_ONLY) {
8838                 assert(!RExC_opend); /* there can only be one! */
8839                 RExC_opend = ender;
8840             }
8841             break;
8842         }
8843         DEBUG_PARSE_r(if (!SIZE_ONLY) {
8844             SV * const mysv_val1=sv_newmortal();
8845             SV * const mysv_val2=sv_newmortal();
8846             DEBUG_PARSE_MSG("lsbr");
8847             regprop(RExC_rx, mysv_val1, lastbr);
8848             regprop(RExC_rx, mysv_val2, ender);
8849             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8850                           SvPV_nolen_const(mysv_val1),
8851                           (IV)REG_NODE_NUM(lastbr),
8852                           SvPV_nolen_const(mysv_val2),
8853                           (IV)REG_NODE_NUM(ender),
8854                           (IV)(ender - lastbr)
8855             );
8856         });
8857         REGTAIL(pRExC_state, lastbr, ender);
8858
8859         if (have_branch && !SIZE_ONLY) {
8860             char is_nothing= 1;
8861             if (depth==1)
8862                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8863
8864             /* Hook the tails of the branches to the closing node. */
8865             for (br = ret; br; br = regnext(br)) {
8866                 const U8 op = PL_regkind[OP(br)];
8867                 if (op == BRANCH) {
8868                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8869                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
8870                         is_nothing= 0;
8871                 }
8872                 else if (op == BRANCHJ) {
8873                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8874                     /* for now we always disable this optimisation * /
8875                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
8876                     */
8877                         is_nothing= 0;
8878                 }
8879             }
8880             if (is_nothing) {
8881                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
8882                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8883                     SV * const mysv_val1=sv_newmortal();
8884                     SV * const mysv_val2=sv_newmortal();
8885                     DEBUG_PARSE_MSG("NADA");
8886                     regprop(RExC_rx, mysv_val1, ret);
8887                     regprop(RExC_rx, mysv_val2, ender);
8888                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8889                                   SvPV_nolen_const(mysv_val1),
8890                                   (IV)REG_NODE_NUM(ret),
8891                                   SvPV_nolen_const(mysv_val2),
8892                                   (IV)REG_NODE_NUM(ender),
8893                                   (IV)(ender - ret)
8894                     );
8895                 });
8896                 OP(br)= NOTHING;
8897                 if (OP(ender) == TAIL) {
8898                     NEXT_OFF(br)= 0;
8899                     RExC_emit= br + 1;
8900                 } else {
8901                     regnode *opt;
8902                     for ( opt= br + 1; opt < ender ; opt++ )
8903                         OP(opt)= OPTIMIZED;
8904                     NEXT_OFF(br)= ender - br;
8905                 }
8906             }
8907         }
8908     }
8909
8910     {
8911         const char *p;
8912         static const char parens[] = "=!<,>";
8913
8914         if (paren && (p = strchr(parens, paren))) {
8915             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8916             int flag = (p - parens) > 1;
8917
8918             if (paren == '>')
8919                 node = SUSPEND, flag = 0;
8920             reginsert(pRExC_state, node,ret, depth+1);
8921             Set_Node_Cur_Length(ret);
8922             Set_Node_Offset(ret, parse_start + 1);
8923             ret->flags = flag;
8924             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8925         }
8926     }
8927
8928     /* Check for proper termination. */
8929     if (paren) {
8930         RExC_flags = oregflags;
8931         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8932             RExC_parse = oregcomp_parse;
8933             vFAIL("Unmatched (");
8934         }
8935     }
8936     else if (!paren && RExC_parse < RExC_end) {
8937         if (*RExC_parse == ')') {
8938             RExC_parse++;
8939             vFAIL("Unmatched )");
8940         }
8941         else
8942             FAIL("Junk on end of regexp");      /* "Can't happen". */
8943         /* NOTREACHED */
8944     }
8945
8946     if (RExC_in_lookbehind) {
8947         RExC_in_lookbehind--;
8948     }
8949     if (after_freeze > RExC_npar)
8950         RExC_npar = after_freeze;
8951     return(ret);
8952 }
8953
8954 /*
8955  - regbranch - one alternative of an | operator
8956  *
8957  * Implements the concatenation operator.
8958  */
8959 STATIC regnode *
8960 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8961 {
8962     dVAR;
8963     register regnode *ret;
8964     register regnode *chain = NULL;
8965     register regnode *latest;
8966     I32 flags = 0, c = 0;
8967     GET_RE_DEBUG_FLAGS_DECL;
8968
8969     PERL_ARGS_ASSERT_REGBRANCH;
8970
8971     DEBUG_PARSE("brnc");
8972
8973     if (first)
8974         ret = NULL;
8975     else {
8976         if (!SIZE_ONLY && RExC_extralen)
8977             ret = reganode(pRExC_state, BRANCHJ,0);
8978         else {
8979             ret = reg_node(pRExC_state, BRANCH);
8980             Set_Node_Length(ret, 1);
8981         }
8982     }
8983
8984     if (!first && SIZE_ONLY)
8985         RExC_extralen += 1;                     /* BRANCHJ */
8986
8987     *flagp = WORST;                     /* Tentatively. */
8988
8989     RExC_parse--;
8990     nextchar(pRExC_state);
8991     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8992         flags &= ~TRYAGAIN;
8993         latest = regpiece(pRExC_state, &flags,depth+1);
8994         if (latest == NULL) {
8995             if (flags & TRYAGAIN)
8996                 continue;
8997             return(NULL);
8998         }
8999         else if (ret == NULL)
9000             ret = latest;
9001         *flagp |= flags&(HASWIDTH|POSTPONED);
9002         if (chain == NULL)      /* First piece. */
9003             *flagp |= flags&SPSTART;
9004         else {
9005             RExC_naughty++;
9006             REGTAIL(pRExC_state, chain, latest);
9007         }
9008         chain = latest;
9009         c++;
9010     }
9011     if (chain == NULL) {        /* Loop ran zero times. */
9012         chain = reg_node(pRExC_state, NOTHING);
9013         if (ret == NULL)
9014             ret = chain;
9015     }
9016     if (c == 1) {
9017         *flagp |= flags&SIMPLE;
9018     }
9019
9020     return ret;
9021 }
9022
9023 /*
9024  - regpiece - something followed by possible [*+?]
9025  *
9026  * Note that the branching code sequences used for ? and the general cases
9027  * of * and + are somewhat optimized:  they use the same NOTHING node as
9028  * both the endmarker for their branch list and the body of the last branch.
9029  * It might seem that this node could be dispensed with entirely, but the
9030  * endmarker role is not redundant.
9031  */
9032 STATIC regnode *
9033 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9034 {
9035     dVAR;
9036     register regnode *ret;
9037     register char op;
9038     register char *next;
9039     I32 flags;
9040     const char * const origparse = RExC_parse;
9041     I32 min;
9042     I32 max = REG_INFTY;
9043 #ifdef RE_TRACK_PATTERN_OFFSETS
9044     char *parse_start;
9045 #endif
9046     const char *maxpos = NULL;
9047     GET_RE_DEBUG_FLAGS_DECL;
9048
9049     PERL_ARGS_ASSERT_REGPIECE;
9050
9051     DEBUG_PARSE("piec");
9052
9053     ret = regatom(pRExC_state, &flags,depth+1);
9054     if (ret == NULL) {
9055         if (flags & TRYAGAIN)
9056             *flagp |= TRYAGAIN;
9057         return(NULL);
9058     }
9059
9060     op = *RExC_parse;
9061
9062     if (op == '{' && regcurly(RExC_parse)) {
9063         maxpos = NULL;
9064 #ifdef RE_TRACK_PATTERN_OFFSETS
9065         parse_start = RExC_parse; /* MJD */
9066 #endif
9067         next = RExC_parse + 1;
9068         while (isDIGIT(*next) || *next == ',') {
9069             if (*next == ',') {
9070                 if (maxpos)
9071                     break;
9072                 else
9073                     maxpos = next;
9074             }
9075             next++;
9076         }
9077         if (*next == '}') {             /* got one */
9078             if (!maxpos)
9079                 maxpos = next;
9080             RExC_parse++;
9081             min = atoi(RExC_parse);
9082             if (*maxpos == ',')
9083                 maxpos++;
9084             else
9085                 maxpos = RExC_parse;
9086             max = atoi(maxpos);
9087             if (!max && *maxpos != '0')
9088                 max = REG_INFTY;                /* meaning "infinity" */
9089             else if (max >= REG_INFTY)
9090                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9091             RExC_parse = next;
9092             nextchar(pRExC_state);
9093
9094         do_curly:
9095             if ((flags&SIMPLE)) {
9096                 RExC_naughty += 2 + RExC_naughty / 2;
9097                 reginsert(pRExC_state, CURLY, ret, depth+1);
9098                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9099                 Set_Node_Cur_Length(ret);
9100             }
9101             else {
9102                 regnode * const w = reg_node(pRExC_state, WHILEM);
9103
9104                 w->flags = 0;
9105                 REGTAIL(pRExC_state, ret, w);
9106                 if (!SIZE_ONLY && RExC_extralen) {
9107                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9108                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9109                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9110                 }
9111                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9112                                 /* MJD hk */
9113                 Set_Node_Offset(ret, parse_start+1);
9114                 Set_Node_Length(ret,
9115                                 op == '{' ? (RExC_parse - parse_start) : 1);
9116
9117                 if (!SIZE_ONLY && RExC_extralen)
9118                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9119                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9120                 if (SIZE_ONLY)
9121                     RExC_whilem_seen++, RExC_extralen += 3;
9122                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9123             }
9124             ret->flags = 0;
9125
9126             if (min > 0)
9127                 *flagp = WORST;
9128             if (max > 0)
9129                 *flagp |= HASWIDTH;
9130             if (max < min)
9131                 vFAIL("Can't do {n,m} with n > m");
9132             if (!SIZE_ONLY) {
9133                 ARG1_SET(ret, (U16)min);
9134                 ARG2_SET(ret, (U16)max);
9135             }
9136
9137             goto nest_check;
9138         }
9139     }
9140
9141     if (!ISMULT1(op)) {
9142         *flagp = flags;
9143         return(ret);
9144     }
9145
9146 #if 0                           /* Now runtime fix should be reliable. */
9147
9148     /* if this is reinstated, don't forget to put this back into perldiag:
9149
9150             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9151
9152            (F) The part of the regexp subject to either the * or + quantifier
9153            could match an empty string. The {#} shows in the regular
9154            expression about where the problem was discovered.
9155
9156     */
9157
9158     if (!(flags&HASWIDTH) && op != '?')
9159       vFAIL("Regexp *+ operand could be empty");
9160 #endif
9161
9162 #ifdef RE_TRACK_PATTERN_OFFSETS
9163     parse_start = RExC_parse;
9164 #endif
9165     nextchar(pRExC_state);
9166
9167     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9168
9169     if (op == '*' && (flags&SIMPLE)) {
9170         reginsert(pRExC_state, STAR, ret, depth+1);
9171         ret->flags = 0;
9172         RExC_naughty += 4;
9173     }
9174     else if (op == '*') {
9175         min = 0;
9176         goto do_curly;
9177     }
9178     else if (op == '+' && (flags&SIMPLE)) {
9179         reginsert(pRExC_state, PLUS, ret, depth+1);
9180         ret->flags = 0;
9181         RExC_naughty += 3;
9182     }
9183     else if (op == '+') {
9184         min = 1;
9185         goto do_curly;
9186     }
9187     else if (op == '?') {
9188         min = 0; max = 1;
9189         goto do_curly;
9190     }
9191   nest_check:
9192     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9193         ckWARN3reg(RExC_parse,
9194                    "%.*s matches null string many times",
9195                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9196                    origparse);
9197     }
9198
9199     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9200         nextchar(pRExC_state);
9201         reginsert(pRExC_state, MINMOD, ret, depth+1);
9202         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9203     }
9204 #ifndef REG_ALLOW_MINMOD_SUSPEND
9205     else
9206 #endif
9207     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9208         regnode *ender;
9209         nextchar(pRExC_state);
9210         ender = reg_node(pRExC_state, SUCCEED);
9211         REGTAIL(pRExC_state, ret, ender);
9212         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9213         ret->flags = 0;
9214         ender = reg_node(pRExC_state, TAIL);
9215         REGTAIL(pRExC_state, ret, ender);
9216         /*ret= ender;*/
9217     }
9218
9219     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9220         RExC_parse++;
9221         vFAIL("Nested quantifiers");
9222     }
9223
9224     return(ret);
9225 }
9226
9227
9228 /* reg_namedseq(pRExC_state,UVp, UV depth)
9229
9230    This is expected to be called by a parser routine that has
9231    recognized '\N' and needs to handle the rest. RExC_parse is
9232    expected to point at the first char following the N at the time
9233    of the call.
9234
9235    The \N may be inside (indicated by valuep not being NULL) or outside a
9236    character class.
9237
9238    \N may begin either a named sequence, or if outside a character class, mean
9239    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9240    attempted to decide which, and in the case of a named sequence converted it
9241    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9242    where c1... are the characters in the sequence.  For single-quoted regexes,
9243    the tokenizer passes the \N sequence through unchanged; this code will not
9244    attempt to determine this nor expand those.  The net effect is that if the
9245    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9246    signals that this \N occurrence means to match a non-newline.
9247
9248    Only the \N{U+...} form should occur in a character class, for the same
9249    reason that '.' inside a character class means to just match a period: it
9250    just doesn't make sense.
9251
9252    If valuep is non-null then it is assumed that we are parsing inside
9253    of a charclass definition and the first codepoint in the resolved
9254    string is returned via *valuep and the routine will return NULL.
9255    In this mode if a multichar string is returned from the charnames
9256    handler, a warning will be issued, and only the first char in the
9257    sequence will be examined. If the string returned is zero length
9258    then the value of *valuep is undefined and NON-NULL will
9259    be returned to indicate failure. (This will NOT be a valid pointer
9260    to a regnode.)
9261
9262    If valuep is null then it is assumed that we are parsing normal text and a
9263    new EXACT node is inserted into the program containing the resolved string,
9264    and a pointer to the new node is returned.  But if the string is zero length
9265    a NOTHING node is emitted instead.
9266
9267    On success RExC_parse is set to the char following the endbrace.
9268    Parsing failures will generate a fatal error via vFAIL(...)
9269  */
9270 STATIC regnode *
9271 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9272 {
9273     char * endbrace;    /* '}' following the name */
9274     regnode *ret = NULL;
9275     char* p;
9276
9277     GET_RE_DEBUG_FLAGS_DECL;
9278
9279     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9280
9281     GET_RE_DEBUG_FLAGS;
9282
9283     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9284      * modifier.  The other meaning does not */
9285     p = (RExC_flags & RXf_PMf_EXTENDED)
9286         ? regwhite( pRExC_state, RExC_parse )
9287         : RExC_parse;
9288
9289     /* Disambiguate between \N meaning a named character versus \N meaning
9290      * [^\n].  The former is assumed when it can't be the latter. */
9291     if (*p != '{' || regcurly(p)) {
9292         RExC_parse = p;
9293         if (valuep) {
9294             /* no bare \N in a charclass */
9295             vFAIL("\\N in a character class must be a named character: \\N{...}");
9296         }
9297         nextchar(pRExC_state);
9298         ret = reg_node(pRExC_state, REG_ANY);
9299         *flagp |= HASWIDTH|SIMPLE;
9300         RExC_naughty++;
9301         RExC_parse--;
9302         Set_Node_Length(ret, 1); /* MJD */
9303         return ret;
9304     }
9305
9306     /* Here, we have decided it should be a named sequence */
9307
9308     /* The test above made sure that the next real character is a '{', but
9309      * under the /x modifier, it could be separated by space (or a comment and
9310      * \n) and this is not allowed (for consistency with \x{...} and the
9311      * tokenizer handling of \N{NAME}). */
9312     if (*RExC_parse != '{') {
9313         vFAIL("Missing braces on \\N{}");
9314     }
9315
9316     RExC_parse++;       /* Skip past the '{' */
9317
9318     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9319         || ! (endbrace == RExC_parse            /* nothing between the {} */
9320               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9321                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9322     {
9323         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9324         vFAIL("\\N{NAME} must be resolved by the lexer");
9325     }
9326
9327     if (endbrace == RExC_parse) {   /* empty: \N{} */
9328         if (! valuep) {
9329             RExC_parse = endbrace + 1;
9330             return reg_node(pRExC_state,NOTHING);
9331         }
9332
9333         if (SIZE_ONLY) {
9334             ckWARNreg(RExC_parse,
9335                     "Ignoring zero length \\N{} in character class"
9336             );
9337             RExC_parse = endbrace + 1;
9338         }
9339         *valuep = 0;
9340         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9341     }
9342
9343     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
9344     RExC_parse += 2;    /* Skip past the 'U+' */
9345
9346     if (valuep) {   /* In a bracketed char class */
9347         /* We only pay attention to the first char of
9348         multichar strings being returned. I kinda wonder
9349         if this makes sense as it does change the behaviour
9350         from earlier versions, OTOH that behaviour was broken
9351         as well. XXX Solution is to recharacterize as
9352         [rest-of-class]|multi1|multi2... */
9353
9354         STRLEN length_of_hex;
9355         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9356             | PERL_SCAN_DISALLOW_PREFIX
9357             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9358
9359         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9360         if (endchar < endbrace) {
9361             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9362         }
9363
9364         length_of_hex = (STRLEN)(endchar - RExC_parse);
9365         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9366
9367         /* The tokenizer should have guaranteed validity, but it's possible to
9368          * bypass it by using single quoting, so check */
9369         if (length_of_hex == 0
9370             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9371         {
9372             RExC_parse += length_of_hex;        /* Includes all the valid */
9373             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9374                             ? UTF8SKIP(RExC_parse)
9375                             : 1;
9376             /* Guard against malformed utf8 */
9377             if (RExC_parse >= endchar) RExC_parse = endchar;
9378             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9379         }
9380
9381         RExC_parse = endbrace + 1;
9382         if (endchar == endbrace) return NULL;
9383
9384         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
9385     }
9386     else {      /* Not a char class */
9387
9388         /* What is done here is to convert this to a sub-pattern of the form
9389          * (?:\x{char1}\x{char2}...)
9390          * and then call reg recursively.  That way, it retains its atomicness,
9391          * while not having to worry about special handling that some code
9392          * points may have.  toke.c has converted the original Unicode values
9393          * to native, so that we can just pass on the hex values unchanged.  We
9394          * do have to set a flag to keep recoding from happening in the
9395          * recursion */
9396
9397         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9398         STRLEN len;
9399         char *endchar;      /* Points to '.' or '}' ending cur char in the input
9400                                stream */
9401         char *orig_end = RExC_end;
9402
9403         while (RExC_parse < endbrace) {
9404
9405             /* Code points are separated by dots.  If none, there is only one
9406              * code point, and is terminated by the brace */
9407             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9408
9409             /* Convert to notation the rest of the code understands */
9410             sv_catpv(substitute_parse, "\\x{");
9411             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9412             sv_catpv(substitute_parse, "}");
9413
9414             /* Point to the beginning of the next character in the sequence. */
9415             RExC_parse = endchar + 1;
9416         }
9417         sv_catpv(substitute_parse, ")");
9418
9419         RExC_parse = SvPV(substitute_parse, len);
9420
9421         /* Don't allow empty number */
9422         if (len < 8) {
9423             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9424         }
9425         RExC_end = RExC_parse + len;
9426
9427         /* The values are Unicode, and therefore not subject to recoding */
9428         RExC_override_recoding = 1;
9429
9430         ret = reg(pRExC_state, 1, flagp, depth+1);
9431
9432         RExC_parse = endbrace;
9433         RExC_end = orig_end;
9434         RExC_override_recoding = 0;
9435
9436         nextchar(pRExC_state);
9437     }
9438
9439     return ret;
9440 }
9441
9442
9443 /*
9444  * reg_recode
9445  *
9446  * It returns the code point in utf8 for the value in *encp.
9447  *    value: a code value in the source encoding
9448  *    encp:  a pointer to an Encode object
9449  *
9450  * If the result from Encode is not a single character,
9451  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9452  */
9453 STATIC UV
9454 S_reg_recode(pTHX_ const char value, SV **encp)
9455 {
9456     STRLEN numlen = 1;
9457     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9458     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9459     const STRLEN newlen = SvCUR(sv);
9460     UV uv = UNICODE_REPLACEMENT;
9461
9462     PERL_ARGS_ASSERT_REG_RECODE;
9463
9464     if (newlen)
9465         uv = SvUTF8(sv)
9466              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9467              : *(U8*)s;
9468
9469     if (!newlen || numlen != newlen) {
9470         uv = UNICODE_REPLACEMENT;
9471         *encp = NULL;
9472     }
9473     return uv;
9474 }
9475
9476
9477 /*
9478  - regatom - the lowest level
9479
9480    Try to identify anything special at the start of the pattern. If there
9481    is, then handle it as required. This may involve generating a single regop,
9482    such as for an assertion; or it may involve recursing, such as to
9483    handle a () structure.
9484
9485    If the string doesn't start with something special then we gobble up
9486    as much literal text as we can.
9487
9488    Once we have been able to handle whatever type of thing started the
9489    sequence, we return.
9490
9491    Note: we have to be careful with escapes, as they can be both literal
9492    and special, and in the case of \10 and friends can either, depending
9493    on context. Specifically there are two separate switches for handling
9494    escape sequences, with the one for handling literal escapes requiring
9495    a dummy entry for all of the special escapes that are actually handled
9496    by the other.
9497 */
9498
9499 STATIC regnode *
9500 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9501 {
9502     dVAR;
9503     register regnode *ret = NULL;
9504     I32 flags;
9505     char *parse_start = RExC_parse;
9506     U8 op;
9507     GET_RE_DEBUG_FLAGS_DECL;
9508     DEBUG_PARSE("atom");
9509     *flagp = WORST;             /* Tentatively. */
9510
9511     PERL_ARGS_ASSERT_REGATOM;
9512
9513 tryagain:
9514     switch ((U8)*RExC_parse) {
9515     case '^':
9516         RExC_seen_zerolen++;
9517         nextchar(pRExC_state);
9518         if (RExC_flags & RXf_PMf_MULTILINE)
9519             ret = reg_node(pRExC_state, MBOL);
9520         else if (RExC_flags & RXf_PMf_SINGLELINE)
9521             ret = reg_node(pRExC_state, SBOL);
9522         else
9523             ret = reg_node(pRExC_state, BOL);
9524         Set_Node_Length(ret, 1); /* MJD */
9525         break;
9526     case '$':
9527         nextchar(pRExC_state);
9528         if (*RExC_parse)
9529             RExC_seen_zerolen++;
9530         if (RExC_flags & RXf_PMf_MULTILINE)
9531             ret = reg_node(pRExC_state, MEOL);
9532         else if (RExC_flags & RXf_PMf_SINGLELINE)
9533             ret = reg_node(pRExC_state, SEOL);
9534         else
9535             ret = reg_node(pRExC_state, EOL);
9536         Set_Node_Length(ret, 1); /* MJD */
9537         break;
9538     case '.':
9539         nextchar(pRExC_state);
9540         if (RExC_flags & RXf_PMf_SINGLELINE)
9541             ret = reg_node(pRExC_state, SANY);
9542         else
9543             ret = reg_node(pRExC_state, REG_ANY);
9544         *flagp |= HASWIDTH|SIMPLE;
9545         RExC_naughty++;
9546         Set_Node_Length(ret, 1); /* MJD */
9547         break;
9548     case '[':
9549     {
9550         char * const oregcomp_parse = ++RExC_parse;
9551         ret = regclass(pRExC_state,depth+1);
9552         if (*RExC_parse != ']') {
9553             RExC_parse = oregcomp_parse;
9554             vFAIL("Unmatched [");
9555         }
9556         nextchar(pRExC_state);
9557         *flagp |= HASWIDTH|SIMPLE;
9558         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9559         break;
9560     }
9561     case '(':
9562         nextchar(pRExC_state);
9563         ret = reg(pRExC_state, 1, &flags,depth+1);
9564         if (ret == NULL) {
9565                 if (flags & TRYAGAIN) {
9566                     if (RExC_parse == RExC_end) {
9567                          /* Make parent create an empty node if needed. */
9568                         *flagp |= TRYAGAIN;
9569                         return(NULL);
9570                     }
9571                     goto tryagain;
9572                 }
9573                 return(NULL);
9574         }
9575         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9576         break;
9577     case '|':
9578     case ')':
9579         if (flags & TRYAGAIN) {
9580             *flagp |= TRYAGAIN;
9581             return NULL;
9582         }
9583         vFAIL("Internal urp");
9584                                 /* Supposed to be caught earlier. */
9585         break;
9586     case '?':
9587     case '+':
9588     case '*':
9589         RExC_parse++;
9590         vFAIL("Quantifier follows nothing");
9591         break;
9592     case '\\':
9593         /* Special Escapes
9594
9595            This switch handles escape sequences that resolve to some kind
9596            of special regop and not to literal text. Escape sequnces that
9597            resolve to literal text are handled below in the switch marked
9598            "Literal Escapes".
9599
9600            Every entry in this switch *must* have a corresponding entry
9601            in the literal escape switch. However, the opposite is not
9602            required, as the default for this switch is to jump to the
9603            literal text handling code.
9604         */
9605         switch ((U8)*++RExC_parse) {
9606         /* Special Escapes */
9607         case 'A':
9608             RExC_seen_zerolen++;
9609             ret = reg_node(pRExC_state, SBOL);
9610             *flagp |= SIMPLE;
9611             goto finish_meta_pat;
9612         case 'G':
9613             ret = reg_node(pRExC_state, GPOS);
9614             RExC_seen |= REG_SEEN_GPOS;
9615             *flagp |= SIMPLE;
9616             goto finish_meta_pat;
9617         case 'K':
9618             RExC_seen_zerolen++;
9619             ret = reg_node(pRExC_state, KEEPS);
9620             *flagp |= SIMPLE;
9621             /* XXX:dmq : disabling in-place substitution seems to
9622              * be necessary here to avoid cases of memory corruption, as
9623              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9624              */
9625             RExC_seen |= REG_SEEN_LOOKBEHIND;
9626             goto finish_meta_pat;
9627         case 'Z':
9628             ret = reg_node(pRExC_state, SEOL);
9629             *flagp |= SIMPLE;
9630             RExC_seen_zerolen++;                /* Do not optimize RE away */
9631             goto finish_meta_pat;
9632         case 'z':
9633             ret = reg_node(pRExC_state, EOS);
9634             *flagp |= SIMPLE;
9635             RExC_seen_zerolen++;                /* Do not optimize RE away */
9636             goto finish_meta_pat;
9637         case 'C':
9638             ret = reg_node(pRExC_state, CANY);
9639             RExC_seen |= REG_SEEN_CANY;
9640             *flagp |= HASWIDTH|SIMPLE;
9641             goto finish_meta_pat;
9642         case 'X':
9643             ret = reg_node(pRExC_state, CLUMP);
9644             *flagp |= HASWIDTH;
9645             goto finish_meta_pat;
9646         case 'w':
9647             switch (get_regex_charset(RExC_flags)) {
9648                 case REGEX_LOCALE_CHARSET:
9649                     op = ALNUML;
9650                     break;
9651                 case REGEX_UNICODE_CHARSET:
9652                     op = ALNUMU;
9653                     break;
9654                 case REGEX_ASCII_RESTRICTED_CHARSET:
9655                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9656                     op = ALNUMA;
9657                     break;
9658                 case REGEX_DEPENDS_CHARSET:
9659                     op = ALNUM;
9660                     break;
9661                 default:
9662                     goto bad_charset;
9663             }
9664             ret = reg_node(pRExC_state, op);
9665             *flagp |= HASWIDTH|SIMPLE;
9666             goto finish_meta_pat;
9667         case 'W':
9668             switch (get_regex_charset(RExC_flags)) {
9669                 case REGEX_LOCALE_CHARSET:
9670                     op = NALNUML;
9671                     break;
9672                 case REGEX_UNICODE_CHARSET:
9673                     op = NALNUMU;
9674                     break;
9675                 case REGEX_ASCII_RESTRICTED_CHARSET:
9676                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9677                     op = NALNUMA;
9678                     break;
9679                 case REGEX_DEPENDS_CHARSET:
9680                     op = NALNUM;
9681                     break;
9682                 default:
9683                     goto bad_charset;
9684             }
9685             ret = reg_node(pRExC_state, op);
9686             *flagp |= HASWIDTH|SIMPLE;
9687             goto finish_meta_pat;
9688         case 'b':
9689             RExC_seen_zerolen++;
9690             RExC_seen |= REG_SEEN_LOOKBEHIND;
9691             switch (get_regex_charset(RExC_flags)) {
9692                 case REGEX_LOCALE_CHARSET:
9693                     op = BOUNDL;
9694                     break;
9695                 case REGEX_UNICODE_CHARSET:
9696                     op = BOUNDU;
9697                     break;
9698                 case REGEX_ASCII_RESTRICTED_CHARSET:
9699                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9700                     op = BOUNDA;
9701                     break;
9702                 case REGEX_DEPENDS_CHARSET:
9703                     op = BOUND;
9704                     break;
9705                 default:
9706                     goto bad_charset;
9707             }
9708             ret = reg_node(pRExC_state, op);
9709             FLAGS(ret) = get_regex_charset(RExC_flags);
9710             *flagp |= SIMPLE;
9711             goto finish_meta_pat;
9712         case 'B':
9713             RExC_seen_zerolen++;
9714             RExC_seen |= REG_SEEN_LOOKBEHIND;
9715             switch (get_regex_charset(RExC_flags)) {
9716                 case REGEX_LOCALE_CHARSET:
9717                     op = NBOUNDL;
9718                     break;
9719                 case REGEX_UNICODE_CHARSET:
9720                     op = NBOUNDU;
9721                     break;
9722                 case REGEX_ASCII_RESTRICTED_CHARSET:
9723                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9724                     op = NBOUNDA;
9725                     break;
9726                 case REGEX_DEPENDS_CHARSET:
9727                     op = NBOUND;
9728                     break;
9729                 default:
9730                     goto bad_charset;
9731             }
9732             ret = reg_node(pRExC_state, op);
9733             FLAGS(ret) = get_regex_charset(RExC_flags);
9734             *flagp |= SIMPLE;
9735             goto finish_meta_pat;
9736         case 's':
9737             switch (get_regex_charset(RExC_flags)) {
9738                 case REGEX_LOCALE_CHARSET:
9739                     op = SPACEL;
9740                     break;
9741                 case REGEX_UNICODE_CHARSET:
9742                     op = SPACEU;
9743                     break;
9744                 case REGEX_ASCII_RESTRICTED_CHARSET:
9745                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9746                     op = SPACEA;
9747                     break;
9748                 case REGEX_DEPENDS_CHARSET:
9749                     op = SPACE;
9750                     break;
9751                 default:
9752                     goto bad_charset;
9753             }
9754             ret = reg_node(pRExC_state, op);
9755             *flagp |= HASWIDTH|SIMPLE;
9756             goto finish_meta_pat;
9757         case 'S':
9758             switch (get_regex_charset(RExC_flags)) {
9759                 case REGEX_LOCALE_CHARSET:
9760                     op = NSPACEL;
9761                     break;
9762                 case REGEX_UNICODE_CHARSET:
9763                     op = NSPACEU;
9764                     break;
9765                 case REGEX_ASCII_RESTRICTED_CHARSET:
9766                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9767                     op = NSPACEA;
9768                     break;
9769                 case REGEX_DEPENDS_CHARSET:
9770                     op = NSPACE;
9771                     break;
9772                 default:
9773                     goto bad_charset;
9774             }
9775             ret = reg_node(pRExC_state, op);
9776             *flagp |= HASWIDTH|SIMPLE;
9777             goto finish_meta_pat;
9778         case 'd':
9779             switch (get_regex_charset(RExC_flags)) {
9780                 case REGEX_LOCALE_CHARSET:
9781                     op = DIGITL;
9782                     break;
9783                 case REGEX_ASCII_RESTRICTED_CHARSET:
9784                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9785                     op = DIGITA;
9786                     break;
9787                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9788                 case REGEX_UNICODE_CHARSET:
9789                     op = DIGIT;
9790                     break;
9791                 default:
9792                     goto bad_charset;
9793             }
9794             ret = reg_node(pRExC_state, op);
9795             *flagp |= HASWIDTH|SIMPLE;
9796             goto finish_meta_pat;
9797         case 'D':
9798             switch (get_regex_charset(RExC_flags)) {
9799                 case REGEX_LOCALE_CHARSET:
9800                     op = NDIGITL;
9801                     break;
9802                 case REGEX_ASCII_RESTRICTED_CHARSET:
9803                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9804                     op = NDIGITA;
9805                     break;
9806                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9807                 case REGEX_UNICODE_CHARSET:
9808                     op = NDIGIT;
9809                     break;
9810                 default:
9811                     goto bad_charset;
9812             }
9813             ret = reg_node(pRExC_state, op);
9814             *flagp |= HASWIDTH|SIMPLE;
9815             goto finish_meta_pat;
9816         case 'R':
9817             ret = reg_node(pRExC_state, LNBREAK);
9818             *flagp |= HASWIDTH|SIMPLE;
9819             goto finish_meta_pat;
9820         case 'h':
9821             ret = reg_node(pRExC_state, HORIZWS);
9822             *flagp |= HASWIDTH|SIMPLE;
9823             goto finish_meta_pat;
9824         case 'H':
9825             ret = reg_node(pRExC_state, NHORIZWS);
9826             *flagp |= HASWIDTH|SIMPLE;
9827             goto finish_meta_pat;
9828         case 'v':
9829             ret = reg_node(pRExC_state, VERTWS);
9830             *flagp |= HASWIDTH|SIMPLE;
9831             goto finish_meta_pat;
9832         case 'V':
9833             ret = reg_node(pRExC_state, NVERTWS);
9834             *flagp |= HASWIDTH|SIMPLE;
9835          finish_meta_pat:
9836             nextchar(pRExC_state);
9837             Set_Node_Length(ret, 2); /* MJD */
9838             break;
9839         case 'p':
9840         case 'P':
9841             {
9842                 char* const oldregxend = RExC_end;
9843 #ifdef DEBUGGING
9844                 char* parse_start = RExC_parse - 2;
9845 #endif
9846
9847                 if (RExC_parse[1] == '{') {
9848                   /* a lovely hack--pretend we saw [\pX] instead */
9849                     RExC_end = strchr(RExC_parse, '}');
9850                     if (!RExC_end) {
9851                         const U8 c = (U8)*RExC_parse;
9852                         RExC_parse += 2;
9853                         RExC_end = oldregxend;
9854                         vFAIL2("Missing right brace on \\%c{}", c);
9855                     }
9856                     RExC_end++;
9857                 }
9858                 else {
9859                     RExC_end = RExC_parse + 2;
9860                     if (RExC_end > oldregxend)
9861                         RExC_end = oldregxend;
9862                 }
9863                 RExC_parse--;
9864
9865                 ret = regclass(pRExC_state,depth+1);
9866
9867                 RExC_end = oldregxend;
9868                 RExC_parse--;
9869
9870                 Set_Node_Offset(ret, parse_start + 2);
9871                 Set_Node_Cur_Length(ret);
9872                 nextchar(pRExC_state);
9873                 *flagp |= HASWIDTH|SIMPLE;
9874             }
9875             break;
9876         case 'N':
9877             /* Handle \N and \N{NAME} here and not below because it can be
9878             multicharacter. join_exact() will join them up later on.
9879             Also this makes sure that things like /\N{BLAH}+/ and
9880             \N{BLAH} being multi char Just Happen. dmq*/
9881             ++RExC_parse;
9882             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9883             break;
9884         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9885         parse_named_seq:
9886         {
9887             char ch= RExC_parse[1];
9888             if (ch != '<' && ch != '\'' && ch != '{') {
9889                 RExC_parse++;
9890                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9891             } else {
9892                 /* this pretty much dupes the code for (?P=...) in reg(), if
9893                    you change this make sure you change that */
9894                 char* name_start = (RExC_parse += 2);
9895                 U32 num = 0;
9896                 SV *sv_dat = reg_scan_name(pRExC_state,
9897                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9898                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9899                 if (RExC_parse == name_start || *RExC_parse != ch)
9900                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9901
9902                 if (!SIZE_ONLY) {
9903                     num = add_data( pRExC_state, 1, "S" );
9904                     RExC_rxi->data->data[num]=(void*)sv_dat;
9905                     SvREFCNT_inc_simple_void(sv_dat);
9906                 }
9907
9908                 RExC_sawback = 1;
9909                 ret = reganode(pRExC_state,
9910                                ((! FOLD)
9911                                  ? NREF
9912                                  : (MORE_ASCII_RESTRICTED)
9913                                    ? NREFFA
9914                                    : (AT_LEAST_UNI_SEMANTICS)
9915                                      ? NREFFU
9916                                      : (LOC)
9917                                        ? NREFFL
9918                                        : NREFF),
9919                                 num);
9920                 *flagp |= HASWIDTH;
9921
9922                 /* override incorrect value set in reganode MJD */
9923                 Set_Node_Offset(ret, parse_start+1);
9924                 Set_Node_Cur_Length(ret); /* MJD */
9925                 nextchar(pRExC_state);
9926
9927             }
9928             break;
9929         }
9930         case 'g':
9931         case '1': case '2': case '3': case '4':
9932         case '5': case '6': case '7': case '8': case '9':
9933             {
9934                 I32 num;
9935                 bool isg = *RExC_parse == 'g';
9936                 bool isrel = 0;
9937                 bool hasbrace = 0;
9938                 if (isg) {
9939                     RExC_parse++;
9940                     if (*RExC_parse == '{') {
9941                         RExC_parse++;
9942                         hasbrace = 1;
9943                     }
9944                     if (*RExC_parse == '-') {
9945                         RExC_parse++;
9946                         isrel = 1;
9947                     }
9948                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9949                         if (isrel) RExC_parse--;
9950                         RExC_parse -= 2;
9951                         goto parse_named_seq;
9952                 }   }
9953                 num = atoi(RExC_parse);
9954                 if (isg && num == 0)
9955                     vFAIL("Reference to invalid group 0");
9956                 if (isrel) {
9957                     num = RExC_npar - num;
9958                     if (num < 1)
9959                         vFAIL("Reference to nonexistent or unclosed group");
9960                 }
9961                 if (!isg && num > 9 && num >= RExC_npar)
9962                     goto defchar;
9963                 else {
9964                     char * const parse_start = RExC_parse - 1; /* MJD */
9965                     while (isDIGIT(*RExC_parse))
9966                         RExC_parse++;
9967                     if (parse_start == RExC_parse - 1)
9968                         vFAIL("Unterminated \\g... pattern");
9969                     if (hasbrace) {
9970                         if (*RExC_parse != '}')
9971                             vFAIL("Unterminated \\g{...} pattern");
9972                         RExC_parse++;
9973                     }
9974                     if (!SIZE_ONLY) {
9975                         if (num > (I32)RExC_rx->nparens)
9976                             vFAIL("Reference to nonexistent group");
9977                     }
9978                     RExC_sawback = 1;
9979                     ret = reganode(pRExC_state,
9980                                    ((! FOLD)
9981                                      ? REF
9982                                      : (MORE_ASCII_RESTRICTED)
9983                                        ? REFFA
9984                                        : (AT_LEAST_UNI_SEMANTICS)
9985                                          ? REFFU
9986                                          : (LOC)
9987                                            ? REFFL
9988                                            : REFF),
9989                                     num);
9990                     *flagp |= HASWIDTH;
9991
9992                     /* override incorrect value set in reganode MJD */
9993                     Set_Node_Offset(ret, parse_start+1);
9994                     Set_Node_Cur_Length(ret); /* MJD */
9995                     RExC_parse--;
9996                     nextchar(pRExC_state);
9997                 }
9998             }
9999             break;
10000         case '\0':
10001             if (RExC_parse >= RExC_end)
10002                 FAIL("Trailing \\");
10003             /* FALL THROUGH */
10004         default:
10005             /* Do not generate "unrecognized" warnings here, we fall
10006                back into the quick-grab loop below */
10007             parse_start--;
10008             goto defchar;
10009         }
10010         break;
10011
10012     case '#':
10013         if (RExC_flags & RXf_PMf_EXTENDED) {
10014             if ( reg_skipcomment( pRExC_state ) )
10015                 goto tryagain;
10016         }
10017         /* FALL THROUGH */
10018
10019     default:
10020
10021             parse_start = RExC_parse - 1;
10022
10023             RExC_parse++;
10024
10025         defchar: {
10026             register STRLEN len;
10027             register UV ender;
10028             register char *p;
10029             char *s;
10030             STRLEN foldlen;
10031             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10032             U8 node_type;
10033
10034             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
10035              * it is folded to 'ss' even if not utf8 */
10036             bool is_exactfu_sharp_s;
10037
10038             ender = 0;
10039             node_type = ((! FOLD) ? EXACT
10040                         : (LOC)
10041                           ? EXACTFL
10042                           : (MORE_ASCII_RESTRICTED)
10043                             ? EXACTFA
10044                             : (AT_LEAST_UNI_SEMANTICS)
10045                               ? EXACTFU
10046                               : EXACTF);
10047             ret = reg_node(pRExC_state, node_type);
10048             s = STRING(ret);
10049
10050             /* XXX The node can hold up to 255 bytes, yet this only goes to
10051              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10052              * 255 allows us to not have to worry about overflow due to
10053              * converting to utf8 and fold expansion, but that value is
10054              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10055              * split up by this limit into a single one using the real max of
10056              * 255.  Even at 127, this breaks under rare circumstances.  If
10057              * folding, we do not want to split a node at a character that is a
10058              * non-final in a multi-char fold, as an input string could just
10059              * happen to want to match across the node boundary.  The join
10060              * would solve that problem if the join actually happens.  But a
10061              * series of more than two nodes in a row each of 127 would cause
10062              * the first join to succeed to get to 254, but then there wouldn't
10063              * be room for the next one, which could at be one of those split
10064              * multi-char folds.  I don't know of any fool-proof solution.  One
10065              * could back off to end with only a code point that isn't such a
10066              * non-final, but it is possible for there not to be any in the
10067              * entire node. */
10068             for (len = 0, p = RExC_parse - 1;
10069                  len < 127 && p < RExC_end;
10070                  len++)
10071             {
10072                 char * const oldp = p;
10073
10074                 if (RExC_flags & RXf_PMf_EXTENDED)
10075                     p = regwhite( pRExC_state, p );
10076                 switch ((U8)*p) {
10077                 case '^':
10078                 case '$':
10079                 case '.':
10080                 case '[':
10081                 case '(':
10082                 case ')':
10083                 case '|':
10084                     goto loopdone;
10085                 case '\\':
10086                     /* Literal Escapes Switch
10087
10088                        This switch is meant to handle escape sequences that
10089                        resolve to a literal character.
10090
10091                        Every escape sequence that represents something
10092                        else, like an assertion or a char class, is handled
10093                        in the switch marked 'Special Escapes' above in this
10094                        routine, but also has an entry here as anything that
10095                        isn't explicitly mentioned here will be treated as
10096                        an unescaped equivalent literal.
10097                     */
10098
10099                     switch ((U8)*++p) {
10100                     /* These are all the special escapes. */
10101                     case 'A':             /* Start assertion */
10102                     case 'b': case 'B':   /* Word-boundary assertion*/
10103                     case 'C':             /* Single char !DANGEROUS! */
10104                     case 'd': case 'D':   /* digit class */
10105                     case 'g': case 'G':   /* generic-backref, pos assertion */
10106                     case 'h': case 'H':   /* HORIZWS */
10107                     case 'k': case 'K':   /* named backref, keep marker */
10108                     case 'N':             /* named char sequence */
10109                     case 'p': case 'P':   /* Unicode property */
10110                               case 'R':   /* LNBREAK */
10111                     case 's': case 'S':   /* space class */
10112                     case 'v': case 'V':   /* VERTWS */
10113                     case 'w': case 'W':   /* word class */
10114                     case 'X':             /* eXtended Unicode "combining character sequence" */
10115                     case 'z': case 'Z':   /* End of line/string assertion */
10116                         --p;
10117                         goto loopdone;
10118
10119                     /* Anything after here is an escape that resolves to a
10120                        literal. (Except digits, which may or may not)
10121                      */
10122                     case 'n':
10123                         ender = '\n';
10124                         p++;
10125                         break;
10126                     case 'r':
10127                         ender = '\r';
10128                         p++;
10129                         break;
10130                     case 't':
10131                         ender = '\t';
10132                         p++;
10133                         break;
10134                     case 'f':
10135                         ender = '\f';
10136                         p++;
10137                         break;
10138                     case 'e':
10139                           ender = ASCII_TO_NATIVE('\033');
10140                         p++;
10141                         break;
10142                     case 'a':
10143                           ender = ASCII_TO_NATIVE('\007');
10144                         p++;
10145                         break;
10146                     case 'o':
10147                         {
10148                             STRLEN brace_len = len;
10149                             UV result;
10150                             const char* error_msg;
10151
10152                             bool valid = grok_bslash_o(p,
10153                                                        &result,
10154                                                        &brace_len,
10155                                                        &error_msg,
10156                                                        1);
10157                             p += brace_len;
10158                             if (! valid) {
10159                                 RExC_parse = p; /* going to die anyway; point
10160                                                    to exact spot of failure */
10161                                 vFAIL(error_msg);
10162                             }
10163                             else
10164                             {
10165                                 ender = result;
10166                             }
10167                             if (PL_encoding && ender < 0x100) {
10168                                 goto recode_encoding;
10169                             }
10170                             if (ender > 0xff) {
10171                                 REQUIRE_UTF8;
10172                             }
10173                             break;
10174                         }
10175                     case 'x':
10176                         if (*++p == '{') {
10177                             char* const e = strchr(p, '}');
10178
10179                             if (!e) {
10180                                 RExC_parse = p + 1;
10181                                 vFAIL("Missing right brace on \\x{}");
10182                             }
10183                             else {
10184                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10185                                     | PERL_SCAN_DISALLOW_PREFIX;
10186                                 STRLEN numlen = e - p - 1;
10187                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
10188                                 if (ender > 0xff)
10189                                     REQUIRE_UTF8;
10190                                 p = e + 1;
10191                             }
10192                         }
10193                         else {
10194                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10195                             STRLEN numlen = 2;
10196                             ender = grok_hex(p, &numlen, &flags, NULL);
10197                             p += numlen;
10198                         }
10199                         if (PL_encoding && ender < 0x100)
10200                             goto recode_encoding;
10201                         break;
10202                     case 'c':
10203                         p++;
10204                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10205                         break;
10206                     case '0': case '1': case '2': case '3':case '4':
10207                     case '5': case '6': case '7': case '8':case '9':
10208                         if (*p == '0' ||
10209                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10210                         {
10211                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10212                             STRLEN numlen = 3;
10213                             ender = grok_oct(p, &numlen, &flags, NULL);
10214                             if (ender > 0xff) {
10215                                 REQUIRE_UTF8;
10216                             }
10217                             p += numlen;
10218                         }
10219                         else {
10220                             --p;
10221                             goto loopdone;
10222                         }
10223                         if (PL_encoding && ender < 0x100)
10224                             goto recode_encoding;
10225                         break;
10226                     recode_encoding:
10227                         if (! RExC_override_recoding) {
10228                             SV* enc = PL_encoding;
10229                             ender = reg_recode((const char)(U8)ender, &enc);
10230                             if (!enc && SIZE_ONLY)
10231                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10232                             REQUIRE_UTF8;
10233                         }
10234                         break;
10235                     case '\0':
10236                         if (p >= RExC_end)
10237                             FAIL("Trailing \\");
10238                         /* FALL THROUGH */
10239                     default:
10240                         if (!SIZE_ONLY&& isALPHA(*p)) {
10241                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10242                         }
10243                         goto normal_default;
10244                     }
10245                     break;
10246                 case '{':
10247                     /* Currently we don't warn when the lbrace is at the start
10248                      * of a construct.  This catches it in the middle of a
10249                      * literal string, or when its the first thing after
10250                      * something like "\b" */
10251                     if (! SIZE_ONLY
10252                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10253                     {
10254                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10255                     }
10256                     /*FALLTHROUGH*/
10257                 default:
10258                   normal_default:
10259                     if (UTF8_IS_START(*p) && UTF) {
10260                         STRLEN numlen;
10261                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10262                                                &numlen, UTF8_ALLOW_DEFAULT);
10263                         p += numlen;
10264                     }
10265                     else
10266                         ender = (U8) *p++;
10267                     break;
10268                 } /* End of switch on the literal */
10269
10270                 is_exactfu_sharp_s = (node_type == EXACTFU
10271                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
10272                 if ( RExC_flags & RXf_PMf_EXTENDED)
10273                     p = regwhite( pRExC_state, p );
10274                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10275                     /* Prime the casefolded buffer.  Locale rules, which apply
10276                      * only to code points < 256, aren't known until execution,
10277                      * so for them, just output the original character using
10278                      * utf8.  If we start to fold non-UTF patterns, be sure to
10279                      * update join_exact() */
10280                     if (LOC && ender < 256) {
10281                         if (UNI_IS_INVARIANT(ender)) {
10282                             *tmpbuf = (U8) ender;
10283                             foldlen = 1;
10284                         } else {
10285                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10286                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10287                             foldlen = 2;
10288                         }
10289                     }
10290                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
10291                                                  */
10292                         ender = toLOWER(ender);
10293                         *tmpbuf = (U8) ender;
10294                         foldlen = 1;
10295                     }
10296                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10297
10298                         /* Locale and /aa require more selectivity about the
10299                          * fold, so are handled below.  Otherwise, here, just
10300                          * use the fold */
10301                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10302                     }
10303                     else {
10304                         /* Under locale rules or /aa we are not to mix,
10305                          * respectively, ords < 256 or ASCII with non-.  So
10306                          * reject folds that mix them, using only the
10307                          * non-folded code point.  So do the fold to a
10308                          * temporary, and inspect each character in it. */
10309                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10310                         U8* s = trialbuf;
10311                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10312                         U8* e = s + foldlen;
10313                         bool fold_ok = TRUE;
10314
10315                         while (s < e) {
10316                             if (isASCII(*s)
10317                                 || (LOC && (UTF8_IS_INVARIANT(*s)
10318                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
10319                             {
10320                                 fold_ok = FALSE;
10321                                 break;
10322                             }
10323                             s += UTF8SKIP(s);
10324                         }
10325                         if (fold_ok) {
10326                             Copy(trialbuf, tmpbuf, foldlen, U8);
10327                             ender = tmpender;
10328                         }
10329                         else {
10330                             uvuni_to_utf8(tmpbuf, ender);
10331                             foldlen = UNISKIP(ender);
10332                         }
10333                     }
10334                 }
10335                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10336                     if (len)
10337                         p = oldp;
10338                     else if (UTF || is_exactfu_sharp_s) {
10339                          if (FOLD) {
10340                               /* Emit all the Unicode characters. */
10341                               STRLEN numlen;
10342                               for (foldbuf = tmpbuf;
10343                                    foldlen;
10344                                    foldlen -= numlen) {
10345
10346                                    /* tmpbuf has been constructed by us, so we
10347                                     * know it is valid utf8 */
10348                                    ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10349                                    if (numlen > 0) {
10350                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
10351                                         s       += unilen;
10352                                         len     += unilen;
10353                                         /* In EBCDIC the numlen
10354                                          * and unilen can differ. */
10355                                         foldbuf += numlen;
10356                                         if (numlen >= foldlen)
10357                                              break;
10358                                    }
10359                                    else
10360                                         break; /* "Can't happen." */
10361                               }
10362                          }
10363                          else {
10364                               const STRLEN unilen = reguni(pRExC_state, ender, s);
10365                               if (unilen > 0) {
10366                                    s   += unilen;
10367                                    len += unilen;
10368                               }
10369                          }
10370                     }
10371                     else {
10372                         len++;
10373                         REGC((char)ender, s++);
10374                     }
10375                     break;
10376                 }
10377                 if (UTF || is_exactfu_sharp_s) {
10378                      if (FOLD) {
10379                           /* Emit all the Unicode characters. */
10380                           STRLEN numlen;
10381                           for (foldbuf = tmpbuf;
10382                                foldlen;
10383                                foldlen -= numlen) {
10384                                ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10385                                if (numlen > 0) {
10386                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10387                                     len     += unilen;
10388                                     s       += unilen;
10389                                     /* In EBCDIC the numlen
10390                                      * and unilen can differ. */
10391                                     foldbuf += numlen;
10392                                     if (numlen >= foldlen)
10393                                          break;
10394                                }
10395                                else
10396                                     break;
10397                           }
10398                      }
10399                      else {
10400                           const STRLEN unilen = reguni(pRExC_state, ender, s);
10401                           if (unilen > 0) {
10402                                s   += unilen;
10403                                len += unilen;
10404                           }
10405                      }
10406                      len--;
10407                 }
10408                 else {
10409                     REGC((char)ender, s++);
10410                 }
10411             }
10412         loopdone:   /* Jumped to when encounters something that shouldn't be in
10413                        the node */
10414             RExC_parse = p - 1;
10415             Set_Node_Cur_Length(ret); /* MJD */
10416             nextchar(pRExC_state);
10417             {
10418                 /* len is STRLEN which is unsigned, need to copy to signed */
10419                 IV iv = len;
10420                 if (iv < 0)
10421                     vFAIL("Internal disaster");
10422             }
10423             if (len > 0)
10424                 *flagp |= HASWIDTH;
10425             if (len == 1 && UNI_IS_INVARIANT(ender))
10426                 *flagp |= SIMPLE;
10427
10428             if (SIZE_ONLY)
10429                 RExC_size += STR_SZ(len);
10430             else {
10431                 STR_LEN(ret) = len;
10432                 RExC_emit += STR_SZ(len);
10433             }
10434         }
10435         break;
10436     }
10437
10438     return(ret);
10439
10440 /* Jumped to when an unrecognized character set is encountered */
10441 bad_charset:
10442     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10443     return(NULL);
10444 }
10445
10446 STATIC char *
10447 S_regwhite( RExC_state_t *pRExC_state, char *p )
10448 {
10449     const char *e = RExC_end;
10450
10451     PERL_ARGS_ASSERT_REGWHITE;
10452
10453     while (p < e) {
10454         if (isSPACE(*p))
10455             ++p;
10456         else if (*p == '#') {
10457             bool ended = 0;
10458             do {
10459                 if (*p++ == '\n') {
10460                     ended = 1;
10461                     break;
10462                 }
10463             } while (p < e);
10464             if (!ended)
10465                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10466         }
10467         else
10468             break;
10469     }
10470     return p;
10471 }
10472
10473 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10474    Character classes ([:foo:]) can also be negated ([:^foo:]).
10475    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10476    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10477    but trigger failures because they are currently unimplemented. */
10478
10479 #define POSIXCC_DONE(c)   ((c) == ':')
10480 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10481 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10482
10483 STATIC I32
10484 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10485 {
10486     dVAR;
10487     I32 namedclass = OOB_NAMEDCLASS;
10488
10489     PERL_ARGS_ASSERT_REGPPOSIXCC;
10490
10491     if (value == '[' && RExC_parse + 1 < RExC_end &&
10492         /* I smell either [: or [= or [. -- POSIX has been here, right? */
10493         POSIXCC(UCHARAT(RExC_parse))) {
10494         const char c = UCHARAT(RExC_parse);
10495         char* const s = RExC_parse++;
10496
10497         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10498             RExC_parse++;
10499         if (RExC_parse == RExC_end)
10500             /* Grandfather lone [:, [=, [. */
10501             RExC_parse = s;
10502         else {
10503             const char* const t = RExC_parse++; /* skip over the c */
10504             assert(*t == c);
10505
10506             if (UCHARAT(RExC_parse) == ']') {
10507                 const char *posixcc = s + 1;
10508                 RExC_parse++; /* skip over the ending ] */
10509
10510                 if (*s == ':') {
10511                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10512                     const I32 skip = t - posixcc;
10513
10514                     /* Initially switch on the length of the name.  */
10515                     switch (skip) {
10516                     case 4:
10517                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10518                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10519                         break;
10520                     case 5:
10521                         /* Names all of length 5.  */
10522                         /* alnum alpha ascii blank cntrl digit graph lower
10523                            print punct space upper  */
10524                         /* Offset 4 gives the best switch position.  */
10525                         switch (posixcc[4]) {
10526                         case 'a':
10527                             if (memEQ(posixcc, "alph", 4)) /* alpha */
10528                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10529                             break;
10530                         case 'e':
10531                             if (memEQ(posixcc, "spac", 4)) /* space */
10532                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10533                             break;
10534                         case 'h':
10535                             if (memEQ(posixcc, "grap", 4)) /* graph */
10536                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10537                             break;
10538                         case 'i':
10539                             if (memEQ(posixcc, "asci", 4)) /* ascii */
10540                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10541                             break;
10542                         case 'k':
10543                             if (memEQ(posixcc, "blan", 4)) /* blank */
10544                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10545                             break;
10546                         case 'l':
10547                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10548                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10549                             break;
10550                         case 'm':
10551                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
10552                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10553                             break;
10554                         case 'r':
10555                             if (memEQ(posixcc, "lowe", 4)) /* lower */
10556                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10557                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
10558                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10559                             break;
10560                         case 't':
10561                             if (memEQ(posixcc, "digi", 4)) /* digit */
10562                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10563                             else if (memEQ(posixcc, "prin", 4)) /* print */
10564                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10565                             else if (memEQ(posixcc, "punc", 4)) /* punct */
10566                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10567                             break;
10568                         }
10569                         break;
10570                     case 6:
10571                         if (memEQ(posixcc, "xdigit", 6))
10572                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10573                         break;
10574                     }
10575
10576                     if (namedclass == OOB_NAMEDCLASS)
10577                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10578                                       t - s - 1, s + 1);
10579                     assert (posixcc[skip] == ':');
10580                     assert (posixcc[skip+1] == ']');
10581                 } else if (!SIZE_ONLY) {
10582                     /* [[=foo=]] and [[.foo.]] are still future. */
10583
10584                     /* adjust RExC_parse so the warning shows after
10585                        the class closes */
10586                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10587                         RExC_parse++;
10588                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10589                 }
10590             } else {
10591                 /* Maternal grandfather:
10592                  * "[:" ending in ":" but not in ":]" */
10593                 RExC_parse = s;
10594             }
10595         }
10596     }
10597
10598     return namedclass;
10599 }
10600
10601 STATIC void
10602 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10603 {
10604     dVAR;
10605
10606     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10607
10608     if (POSIXCC(UCHARAT(RExC_parse))) {
10609         const char *s = RExC_parse;
10610         const char  c = *s++;
10611
10612         while (isALNUM(*s))
10613             s++;
10614         if (*s && c == *s && s[1] == ']') {
10615             ckWARN3reg(s+2,
10616                        "POSIX syntax [%c %c] belongs inside character classes",
10617                        c, c);
10618
10619             /* [[=foo=]] and [[.foo.]] are still future. */
10620             if (POSIXCC_NOTYET(c)) {
10621                 /* adjust RExC_parse so the error shows after
10622                    the class closes */
10623                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10624                     NOOP;
10625                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10626             }
10627         }
10628     }
10629 }
10630
10631 /* Generate the code to add a full posix character <class> to the bracketed
10632  * character class given by <node>.  (<node> is needed only under locale rules)
10633  * destlist     is the inversion list for non-locale rules that this class is
10634  *              to be added to
10635  * sourcelist   is the ASCII-range inversion list to add under /a rules
10636  * Xsourcelist  is the full Unicode range list to use otherwise. */
10637 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10638     if (LOC) {                                                             \
10639         SV* scratch_list = NULL;                                           \
10640                                                                            \
10641         /* Set this class in the node for runtime matching */              \
10642         ANYOF_CLASS_SET(node, class);                                      \
10643                                                                            \
10644         /* For above Latin1 code points, we use the full Unicode range */  \
10645         _invlist_intersection(PL_AboveLatin1,                              \
10646                               Xsourcelist,                                 \
10647                               &scratch_list);                              \
10648         /* And set the output to it, adding instead if there already is an \
10649          * output.  Checking if <destlist> is NULL first saves an extra    \
10650          * clone.  Its reference count will be decremented at the next     \
10651          * union, etc, or if this is the only instance, at the end of the  \
10652          * routine */                                                      \
10653         if (! destlist) {                                                  \
10654             destlist = scratch_list;                                       \
10655         }                                                                  \
10656         else {                                                             \
10657             _invlist_union(destlist, scratch_list, &destlist);             \
10658             SvREFCNT_dec(scratch_list);                                    \
10659         }                                                                  \
10660     }                                                                      \
10661     else {                                                                 \
10662         /* For non-locale, just add it to any existing list */             \
10663         _invlist_union(destlist,                                           \
10664                        (AT_LEAST_ASCII_RESTRICTED)                         \
10665                            ? sourcelist                                    \
10666                            : Xsourcelist,                                  \
10667                        &destlist);                                         \
10668     }
10669
10670 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10671  */
10672 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10673     if (LOC) {                                                             \
10674         SV* scratch_list = NULL;                                           \
10675         ANYOF_CLASS_SET(node, class);                                      \
10676         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10677         if (! destlist) {                                                  \
10678             destlist = scratch_list;                                       \
10679         }                                                                  \
10680         else {                                                             \
10681             _invlist_union(destlist, scratch_list, &destlist);             \
10682             SvREFCNT_dec(scratch_list);                                    \
10683         }                                                                  \
10684     }                                                                      \
10685     else {                                                                 \
10686         _invlist_union_complement_2nd(destlist,                            \
10687                                     (AT_LEAST_ASCII_RESTRICTED)            \
10688                                         ? sourcelist                       \
10689                                         : Xsourcelist,                     \
10690                                     &destlist);                            \
10691         /* Under /d, everything in the upper half of the Latin1 range      \
10692          * matches this complement */                                      \
10693         if (DEPENDS_SEMANTICS) {                                           \
10694             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10695         }                                                                  \
10696     }
10697
10698 /* Generate the code to add a posix character <class> to the bracketed
10699  * character class given by <node>.  (<node> is needed only under locale rules)
10700  * destlist       is the inversion list for non-locale rules that this class is
10701  *                to be added to
10702  * sourcelist     is the ASCII-range inversion list to add under /a rules
10703  * l1_sourcelist  is the Latin1 range list to use otherwise.
10704  * Xpropertyname  is the name to add to <run_time_list> of the property to
10705  *                specify the code points above Latin1 that will have to be
10706  *                determined at run-time
10707  * run_time_list  is a SV* that contains text names of properties that are to
10708  *                be computed at run time.  This concatenates <Xpropertyname>
10709  *                to it, apppropriately
10710  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10711  * time */
10712 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10713                               l1_sourcelist, Xpropertyname, run_time_list) \
10714         /* First, resolve whether to use the ASCII-only list or the L1     \
10715          * list */                                                         \
10716         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
10717                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10718                 Xpropertyname, run_time_list)
10719
10720 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10721                 Xpropertyname, run_time_list)                              \
10722     /* If not /a matching, there are going to be code points we will have  \
10723      * to defer to runtime to look-up */                                   \
10724     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10725         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10726     }                                                                      \
10727     if (LOC) {                                                             \
10728         ANYOF_CLASS_SET(node, class);                                      \
10729     }                                                                      \
10730     else {                                                                 \
10731         _invlist_union(destlist, sourcelist, &destlist);                   \
10732     }
10733
10734 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10735  * this and DO_N_POSIX */
10736 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
10737                               l1_sourcelist, Xpropertyname, run_time_list) \
10738     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
10739         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
10740     }                                                                      \
10741     else {                                                                 \
10742         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10743         if (LOC) {                                                         \
10744             ANYOF_CLASS_SET(node, namedclass);                             \
10745         }                                                                  \
10746         else {                                                             \
10747             SV* scratch_list = NULL;                                       \
10748             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
10749             if (! destlist) {                                              \
10750                 destlist = scratch_list;                                   \
10751             }                                                              \
10752             else {                                                         \
10753                 _invlist_union(destlist, scratch_list, &destlist);         \
10754                 SvREFCNT_dec(scratch_list);                                \
10755             }                                                              \
10756             if (DEPENDS_SEMANTICS) {                                       \
10757                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
10758             }                                                              \
10759         }                                                                  \
10760     }
10761
10762 STATIC U8
10763 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10764 {
10765
10766     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10767      * Locale folding is done at run-time, so this function should not be
10768      * called for nodes that are for locales.
10769      *
10770      * This function sets the bit corresponding to the fold of the input
10771      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10772      * 'F' is 'f'.
10773      *
10774      * It also knows about the characters that are in the bitmap that have
10775      * folds that are matchable only outside it, and sets the appropriate lists
10776      * and flags.
10777      *
10778      * It returns the number of bits that actually changed from 0 to 1 */
10779
10780     U8 stored = 0;
10781     U8 fold;
10782
10783     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10784
10785     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10786                                     : PL_fold[value];
10787
10788     /* It assumes the bit for 'value' has already been set */
10789     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10790         ANYOF_BITMAP_SET(node, fold);
10791         stored++;
10792     }
10793     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10794         /* Certain Latin1 characters have matches outside the bitmap.  To get
10795          * here, 'value' is one of those characters.   None of these matches is
10796          * valid for ASCII characters under /aa, which have been excluded by
10797          * the 'if' above.  The matches fall into three categories:
10798          * 1) They are singly folded-to or -from an above 255 character, as
10799          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10800          *    WITH DIAERESIS;
10801          * 2) They are part of a multi-char fold with another character in the
10802          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10803          * 3) They are part of a multi-char fold with a character not in the
10804          *    bitmap, such as various ligatures.
10805          * We aren't dealing fully with multi-char folds, except we do deal
10806          * with the pattern containing a character that has a multi-char fold
10807          * (not so much the inverse).
10808          * For types 1) and 3), the matches only happen when the target string
10809          * is utf8; that's not true for 2), and we set a flag for it.
10810          *
10811          * The code below adds to the passed in inversion list the single fold
10812          * closures for 'value'.  The values are hard-coded here so that an
10813          * innocent-looking character class, like /[ks]/i won't have to go out
10814          * to disk to find the possible matches.  XXX It would be better to
10815          * generate these via regen, in case a new version of the Unicode
10816          * standard adds new mappings, though that is not really likely. */
10817         switch (value) {
10818             case 'k':
10819             case 'K':
10820                 /* KELVIN SIGN */
10821                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10822                 break;
10823             case 's':
10824             case 'S':
10825                 /* LATIN SMALL LETTER LONG S */
10826                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10827                 break;
10828             case MICRO_SIGN:
10829                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10830                                                  GREEK_SMALL_LETTER_MU);
10831                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10832                                                  GREEK_CAPITAL_LETTER_MU);
10833                 break;
10834             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10835             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10836                 /* ANGSTROM SIGN */
10837                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10838                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10839                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10840                                                      PL_fold_latin1[value]);
10841                 }
10842                 break;
10843             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10844                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10845                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10846                 break;
10847             case LATIN_SMALL_LETTER_SHARP_S:
10848                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10849                                         LATIN_CAPITAL_LETTER_SHARP_S);
10850
10851                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10852                 if (! MORE_ASCII_RESTRICTED) {
10853                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10854
10855                     /* And under /u or /a, it can match even if the target is
10856                      * not utf8 */
10857                     if (AT_LEAST_UNI_SEMANTICS) {
10858                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10859                     }
10860                 }
10861                 break;
10862             case 'F': case 'f':
10863             case 'I': case 'i':
10864             case 'L': case 'l':
10865             case 'T': case 't':
10866             case 'A': case 'a':
10867             case 'H': case 'h':
10868             case 'J': case 'j':
10869             case 'N': case 'n':
10870             case 'W': case 'w':
10871             case 'Y': case 'y':
10872                 /* These all are targets of multi-character folds from code
10873                  * points that require UTF8 to express, so they can't match
10874                  * unless the target string is in UTF-8, so no action here is
10875                  * necessary, as regexec.c properly handles the general case
10876                  * for UTF-8 matching */
10877                 break;
10878             default:
10879                 /* Use deprecated warning to increase the chances of this
10880                  * being output */
10881                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10882                 break;
10883         }
10884     }
10885     else if (DEPENDS_SEMANTICS
10886             && ! isASCII(value)
10887             && PL_fold_latin1[value] != value)
10888     {
10889            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10890             * folds only when the target string is in UTF-8.  We add the fold
10891             * here to the list of things to match outside the bitmap, which
10892             * won't be looked at unless it is UTF8 (or else if something else
10893             * says to look even if not utf8, but those things better not happen
10894             * under DEPENDS semantics. */
10895         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10896     }
10897
10898     return stored;
10899 }
10900
10901
10902 PERL_STATIC_INLINE U8
10903 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10904 {
10905     /* This inline function sets a bit in the bitmap if not already set, and if
10906      * appropriate, its fold, returning the number of bits that actually
10907      * changed from 0 to 1 */
10908
10909     U8 stored;
10910
10911     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10912
10913     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10914         return 0;
10915     }
10916
10917     ANYOF_BITMAP_SET(node, value);
10918     stored = 1;
10919
10920     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10921         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10922     }
10923
10924     return stored;
10925 }
10926
10927 STATIC void
10928 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10929 {
10930     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10931      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10932      * the multi-character folds of characters in the node */
10933     SV *sv;
10934
10935     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10936
10937     if (! *alternate_ptr) {
10938         *alternate_ptr = newAV();
10939     }
10940     sv = newSVpvn_utf8((char*)string, len, TRUE);
10941     av_push(*alternate_ptr, sv);
10942     return;
10943 }
10944
10945 /*
10946    parse a class specification and produce either an ANYOF node that
10947    matches the pattern or perhaps will be optimized into an EXACTish node
10948    instead. The node contains a bit map for the first 256 characters, with the
10949    corresponding bit set if that character is in the list.  For characters
10950    above 255, a range list is used */
10951
10952 STATIC regnode *
10953 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10954 {
10955     dVAR;
10956     register UV nextvalue;
10957     register IV prevvalue = OOB_UNICODE;
10958     register IV range = 0;
10959     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10960     register regnode *ret;
10961     STRLEN numlen;
10962     IV namedclass;
10963     char *rangebegin = NULL;
10964     bool need_class = 0;
10965     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10966     SV *listsv = NULL;
10967     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10968                                       than just initialized.  */
10969     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10970     UV element_count = 0;   /* Number of distinct elements in the class.
10971                                Optimizations may be possible if this is tiny */
10972     UV n;
10973
10974     /* Unicode properties are stored in a swash; this holds the current one
10975      * being parsed.  If this swash is the only above-latin1 component of the
10976      * character class, an optimization is to pass it directly on to the
10977      * execution engine.  Otherwise, it is set to NULL to indicate that there
10978      * are other things in the class that have to be dealt with at execution
10979      * time */
10980     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10981
10982     /* Set if a component of this character class is user-defined; just passed
10983      * on to the engine */
10984     UV has_user_defined_property = 0;
10985
10986     /* code points this node matches that can't be stored in the bitmap */
10987     SV* nonbitmap = NULL;
10988
10989     /* The items that are to match that aren't stored in the bitmap, but are a
10990      * result of things that are stored there.  This is the fold closure of
10991      * such a character, either because it has DEPENDS semantics and shouldn't
10992      * be matched unless the target string is utf8, or is a code point that is
10993      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10994      * above 255.  This all is solely for performance reasons.  By having this
10995      * code know the outside-the-bitmap folds that the bitmapped characters are
10996      * involved with, we don't have to go out to disk to find the list of
10997      * matches, unless the character class includes code points that aren't
10998      * storable in the bit map.  That means that a character class with an 's'
10999      * in it, for example, doesn't need to go out to disk to find everything
11000      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
11001      * empty unless there is something whose fold we don't know about, and will
11002      * have to go out to the disk to find. */
11003     SV* l1_fold_invlist = NULL;
11004
11005     /* List of multi-character folds that are matched by this node */
11006     AV* unicode_alternate  = NULL;
11007 #ifdef EBCDIC
11008     UV literal_endpoint = 0;
11009 #endif
11010     UV stored = 0;  /* how many chars stored in the bitmap */
11011
11012     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11013         case we need to change the emitted regop to an EXACT. */
11014     const char * orig_parse = RExC_parse;
11015     GET_RE_DEBUG_FLAGS_DECL;
11016
11017     PERL_ARGS_ASSERT_REGCLASS;
11018 #ifndef DEBUGGING
11019     PERL_UNUSED_ARG(depth);
11020 #endif
11021
11022     DEBUG_PARSE("clas");
11023
11024     /* Assume we are going to generate an ANYOF node. */
11025     ret = reganode(pRExC_state, ANYOF, 0);
11026
11027
11028     if (!SIZE_ONLY) {
11029         ANYOF_FLAGS(ret) = 0;
11030     }
11031
11032     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11033         RExC_naughty++;
11034         RExC_parse++;
11035         if (!SIZE_ONLY)
11036             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11037
11038         /* We have decided to not allow multi-char folds in inverted character
11039          * classes, due to the confusion that can happen, especially with
11040          * classes that are designed for a non-Unicode world:  You have the
11041          * peculiar case that:
11042             "s s" =~ /^[^\xDF]+$/i => Y
11043             "ss"  =~ /^[^\xDF]+$/i => N
11044          *
11045          * See [perl #89750] */
11046         allow_full_fold = FALSE;
11047     }
11048
11049     if (SIZE_ONLY) {
11050         RExC_size += ANYOF_SKIP;
11051         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11052     }
11053     else {
11054         RExC_emit += ANYOF_SKIP;
11055         if (LOC) {
11056             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11057         }
11058         ANYOF_BITMAP_ZERO(ret);
11059         listsv = newSVpvs("# comment\n");
11060         initial_listsv_len = SvCUR(listsv);
11061     }
11062
11063     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11064
11065     if (!SIZE_ONLY && POSIXCC(nextvalue))
11066         checkposixcc(pRExC_state);
11067
11068     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11069     if (UCHARAT(RExC_parse) == ']')
11070         goto charclassloop;
11071
11072 parseit:
11073     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11074
11075     charclassloop:
11076
11077         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11078
11079         if (!range) {
11080             rangebegin = RExC_parse;
11081             element_count++;
11082         }
11083         if (UTF) {
11084             value = utf8n_to_uvchr((U8*)RExC_parse,
11085                                    RExC_end - RExC_parse,
11086                                    &numlen, UTF8_ALLOW_DEFAULT);
11087             RExC_parse += numlen;
11088         }
11089         else
11090             value = UCHARAT(RExC_parse++);
11091
11092         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11093         if (value == '[' && POSIXCC(nextvalue))
11094             namedclass = regpposixcc(pRExC_state, value);
11095         else if (value == '\\') {
11096             if (UTF) {
11097                 value = utf8n_to_uvchr((U8*)RExC_parse,
11098                                    RExC_end - RExC_parse,
11099                                    &numlen, UTF8_ALLOW_DEFAULT);
11100                 RExC_parse += numlen;
11101             }
11102             else
11103                 value = UCHARAT(RExC_parse++);
11104             /* Some compilers cannot handle switching on 64-bit integer
11105              * values, therefore value cannot be an UV.  Yes, this will
11106              * be a problem later if we want switch on Unicode.
11107              * A similar issue a little bit later when switching on
11108              * namedclass. --jhi */
11109             switch ((I32)value) {
11110             case 'w':   namedclass = ANYOF_ALNUM;       break;
11111             case 'W':   namedclass = ANYOF_NALNUM;      break;
11112             case 's':   namedclass = ANYOF_SPACE;       break;
11113             case 'S':   namedclass = ANYOF_NSPACE;      break;
11114             case 'd':   namedclass = ANYOF_DIGIT;       break;
11115             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11116             case 'v':   namedclass = ANYOF_VERTWS;      break;
11117             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11118             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11119             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11120             case 'N':  /* Handle \N{NAME} in class */
11121                 {
11122                     /* We only pay attention to the first char of
11123                     multichar strings being returned. I kinda wonder
11124                     if this makes sense as it does change the behaviour
11125                     from earlier versions, OTOH that behaviour was broken
11126                     as well. */
11127                     UV v; /* value is register so we cant & it /grrr */
11128                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11129                         goto parseit;
11130                     }
11131                     value= v;
11132                 }
11133                 break;
11134             case 'p':
11135             case 'P':
11136                 {
11137                 char *e;
11138                 if (RExC_parse >= RExC_end)
11139                     vFAIL2("Empty \\%c{}", (U8)value);
11140                 if (*RExC_parse == '{') {
11141                     const U8 c = (U8)value;
11142                     e = strchr(RExC_parse++, '}');
11143                     if (!e)
11144                         vFAIL2("Missing right brace on \\%c{}", c);
11145                     while (isSPACE(UCHARAT(RExC_parse)))
11146                         RExC_parse++;
11147                     if (e == RExC_parse)
11148                         vFAIL2("Empty \\%c{}", c);
11149                     n = e - RExC_parse;
11150                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11151                         n--;
11152                 }
11153                 else {
11154                     e = RExC_parse;
11155                     n = 1;
11156                 }
11157                 if (!SIZE_ONLY) {
11158                     SV** invlistsvp;
11159                     SV* invlist;
11160                     char* name;
11161                     if (UCHARAT(RExC_parse) == '^') {
11162                          RExC_parse++;
11163                          n--;
11164                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11165                          while (isSPACE(UCHARAT(RExC_parse))) {
11166                               RExC_parse++;
11167                               n--;
11168                          }
11169                     }
11170                     /* Try to get the definition of the property into
11171                      * <invlist>.  If /i is in effect, the effective property
11172                      * will have its name be <__NAME_i>.  The design is
11173                      * discussed in commit
11174                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11175                     Newx(name, n + sizeof("_i__\n"), char);
11176
11177                     sprintf(name, "%s%.*s%s\n",
11178                                     (FOLD) ? "__" : "",
11179                                     (int)n,
11180                                     RExC_parse,
11181                                     (FOLD) ? "_i" : ""
11182                     );
11183
11184                     /* Look up the property name, and get its swash and
11185                      * inversion list, if the property is found  */
11186                     if (swash) {
11187                         SvREFCNT_dec(swash);
11188                     }
11189                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11190                                              1, /* binary */
11191                                              0, /* not tr/// */
11192                                              TRUE, /* this routine will handle
11193                                                       undefined properties */
11194                                              NULL, FALSE /* No inversion list */
11195                                             );
11196                     if (   ! swash
11197                         || ! SvROK(swash)
11198                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11199                         || ! (invlistsvp =
11200                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11201                                 "INVLIST", FALSE))
11202                         || ! (invlist = *invlistsvp))
11203                     {
11204                         if (swash) {
11205                             SvREFCNT_dec(swash);
11206                             swash = NULL;
11207                         }
11208
11209                         /* Here didn't find it.  It could be a user-defined
11210                          * property that will be available at run-time.  Add it
11211                          * to the list to look up then */
11212                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11213                                         (value == 'p' ? '+' : '!'),
11214                                         name);
11215                         has_user_defined_property = 1;
11216
11217                         /* We don't know yet, so have to assume that the
11218                          * property could match something in the Latin1 range,
11219                          * hence something that isn't utf8 */
11220                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11221                     }
11222                     else {
11223
11224                         /* Here, did get the swash and its inversion list.  If
11225                          * the swash is from a user-defined property, then this
11226                          * whole character class should be regarded as such */
11227                         SV** user_defined_svp =
11228                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
11229                                                         "USER_DEFINED", FALSE);
11230                         if (user_defined_svp) {
11231                             has_user_defined_property
11232                                                     |= SvUV(*user_defined_svp);
11233                         }
11234
11235                         /* Invert if asking for the complement */
11236                         if (value == 'P') {
11237                             _invlist_union_complement_2nd(properties, invlist, &properties);
11238
11239                             /* The swash can't be used as-is, because we've
11240                              * inverted things; delay removing it to here after
11241                              * have copied its invlist above */
11242                             SvREFCNT_dec(swash);
11243                             swash = NULL;
11244                         }
11245                         else {
11246                             _invlist_union(properties, invlist, &properties);
11247                         }
11248                     }
11249                     Safefree(name);
11250                 }
11251                 RExC_parse = e + 1;
11252                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11253
11254                 /* \p means they want Unicode semantics */
11255                 RExC_uni_semantics = 1;
11256                 }
11257                 break;
11258             case 'n':   value = '\n';                   break;
11259             case 'r':   value = '\r';                   break;
11260             case 't':   value = '\t';                   break;
11261             case 'f':   value = '\f';                   break;
11262             case 'b':   value = '\b';                   break;
11263             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11264             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11265             case 'o':
11266                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11267                 {
11268                     const char* error_msg;
11269                     bool valid = grok_bslash_o(RExC_parse,
11270                                                &value,
11271                                                &numlen,
11272                                                &error_msg,
11273                                                SIZE_ONLY);
11274                     RExC_parse += numlen;
11275                     if (! valid) {
11276                         vFAIL(error_msg);
11277                     }
11278                 }
11279                 if (PL_encoding && value < 0x100) {
11280                     goto recode_encoding;
11281                 }
11282                 break;
11283             case 'x':
11284                 if (*RExC_parse == '{') {
11285                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
11286                         | PERL_SCAN_DISALLOW_PREFIX;
11287                     char * const e = strchr(RExC_parse++, '}');
11288                     if (!e)
11289                         vFAIL("Missing right brace on \\x{}");
11290
11291                     numlen = e - RExC_parse;
11292                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11293                     RExC_parse = e + 1;
11294                 }
11295                 else {
11296                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
11297                     numlen = 2;
11298                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11299                     RExC_parse += numlen;
11300                 }
11301                 if (PL_encoding && value < 0x100)
11302                     goto recode_encoding;
11303                 break;
11304             case 'c':
11305                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11306                 break;
11307             case '0': case '1': case '2': case '3': case '4':
11308             case '5': case '6': case '7':
11309                 {
11310                     /* Take 1-3 octal digits */
11311                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11312                     numlen = 3;
11313                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11314                     RExC_parse += numlen;
11315                     if (PL_encoding && value < 0x100)
11316                         goto recode_encoding;
11317                     break;
11318                 }
11319             recode_encoding:
11320                 if (! RExC_override_recoding) {
11321                     SV* enc = PL_encoding;
11322                     value = reg_recode((const char)(U8)value, &enc);
11323                     if (!enc && SIZE_ONLY)
11324                         ckWARNreg(RExC_parse,
11325                                   "Invalid escape in the specified encoding");
11326                     break;
11327                 }
11328             default:
11329                 /* Allow \_ to not give an error */
11330                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11331                     ckWARN2reg(RExC_parse,
11332                                "Unrecognized escape \\%c in character class passed through",
11333                                (int)value);
11334                 }
11335                 break;
11336             }
11337         } /* end of \blah */
11338 #ifdef EBCDIC
11339         else
11340             literal_endpoint++;
11341 #endif
11342
11343         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11344
11345             /* What matches in a locale is not known until runtime, so need to
11346              * (one time per class) allocate extra space to pass to regexec.
11347              * The space will contain a bit for each named class that is to be
11348              * matched against.  This isn't needed for \p{} and pseudo-classes,
11349              * as they are not affected by locale, and hence are dealt with
11350              * separately */
11351             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11352                 need_class = 1;
11353                 if (SIZE_ONLY) {
11354                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11355                 }
11356                 else {
11357                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11358                     ANYOF_CLASS_ZERO(ret);
11359                 }
11360                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11361             }
11362
11363             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11364              * literal, as is the character that began the false range, i.e.
11365              * the 'a' in the examples */
11366             if (range) {
11367                 if (!SIZE_ONLY) {
11368                     const int w =
11369                         RExC_parse >= rangebegin ?
11370                         RExC_parse - rangebegin : 0;
11371                     ckWARN4reg(RExC_parse,
11372                                "False [] range \"%*.*s\"",
11373                                w, w, rangebegin);
11374
11375                     stored +=
11376                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11377                     if (prevvalue < 256) {
11378                         stored +=
11379                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11380                     }
11381                     else {
11382                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11383                     }
11384                 }
11385
11386                 range = 0; /* this was not a true range */
11387             }
11388
11389             if (!SIZE_ONLY) {
11390
11391                 /* Possible truncation here but in some 64-bit environments
11392                  * the compiler gets heartburn about switch on 64-bit values.
11393                  * A similar issue a little earlier when switching on value.
11394                  * --jhi */
11395                 switch ((I32)namedclass) {
11396
11397                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11398                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11399                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11400                     break;
11401                 case ANYOF_NALNUMC:
11402                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11403                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11404                     break;
11405                 case ANYOF_ALPHA:
11406                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11407                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11408                     break;
11409                 case ANYOF_NALPHA:
11410                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11411                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11412                     break;
11413                 case ANYOF_ASCII:
11414                     if (LOC) {
11415                         ANYOF_CLASS_SET(ret, namedclass);
11416                     }
11417                     else {
11418                         _invlist_union(properties, PL_ASCII, &properties);
11419                     }
11420                     break;
11421                 case ANYOF_NASCII:
11422                     if (LOC) {
11423                         ANYOF_CLASS_SET(ret, namedclass);
11424                     }
11425                     else {
11426                         _invlist_union_complement_2nd(properties,
11427                                                     PL_ASCII, &properties);
11428                         if (DEPENDS_SEMANTICS) {
11429                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11430                         }
11431                     }
11432                     break;
11433                 case ANYOF_BLANK:
11434                     DO_POSIX(ret, namedclass, properties,
11435                                             PL_PosixBlank, PL_XPosixBlank);
11436                     break;
11437                 case ANYOF_NBLANK:
11438                     DO_N_POSIX(ret, namedclass, properties,
11439                                             PL_PosixBlank, PL_XPosixBlank);
11440                     break;
11441                 case ANYOF_CNTRL:
11442                     DO_POSIX(ret, namedclass, properties,
11443                                             PL_PosixCntrl, PL_XPosixCntrl);
11444                     break;
11445                 case ANYOF_NCNTRL:
11446                     DO_N_POSIX(ret, namedclass, properties,
11447                                             PL_PosixCntrl, PL_XPosixCntrl);
11448                     break;
11449                 case ANYOF_DIGIT:
11450                     /* There are no digits in the Latin1 range outside of
11451                      * ASCII, so call the macro that doesn't have to resolve
11452                      * them */
11453                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11454                         PL_PosixDigit, "XPosixDigit", listsv);
11455                     break;
11456                 case ANYOF_NDIGIT:
11457                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11458                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11459                     break;
11460                 case ANYOF_GRAPH:
11461                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11462                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11463                     break;
11464                 case ANYOF_NGRAPH:
11465                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11466                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11467                     break;
11468                 case ANYOF_HORIZWS:
11469                     /* For these, we use the nonbitmap, as /d doesn't make a
11470                      * difference in what these match.  There would be problems
11471                      * if these characters had folds other than themselves, as
11472                      * nonbitmap is subject to folding.  It turns out that \h
11473                      * is just a synonym for XPosixBlank */
11474                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11475                     break;
11476                 case ANYOF_NHORIZWS:
11477                     _invlist_union_complement_2nd(nonbitmap,
11478                                                  PL_XPosixBlank, &nonbitmap);
11479                     break;
11480                 case ANYOF_LOWER:
11481                 case ANYOF_NLOWER:
11482                 {   /* These require special handling, as they differ under
11483                        folding, matching Cased there (which in the ASCII range
11484                        is the same as Alpha */
11485
11486                     SV* ascii_source;
11487                     SV* l1_source;
11488                     const char *Xname;
11489
11490                     if (FOLD && ! LOC) {
11491                         ascii_source = PL_PosixAlpha;
11492                         l1_source = PL_L1Cased;
11493                         Xname = "Cased";
11494                     }
11495                     else {
11496                         ascii_source = PL_PosixLower;
11497                         l1_source = PL_L1PosixLower;
11498                         Xname = "XPosixLower";
11499                     }
11500                     if (namedclass == ANYOF_LOWER) {
11501                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11502                                     ascii_source, l1_source, Xname, listsv);
11503                     }
11504                     else {
11505                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11506                             properties, ascii_source, l1_source, Xname, listsv);
11507                     }
11508                     break;
11509                 }
11510                 case ANYOF_PRINT:
11511                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11512                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11513                     break;
11514                 case ANYOF_NPRINT:
11515                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11516                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11517                     break;
11518                 case ANYOF_PUNCT:
11519                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11520                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11521                     break;
11522                 case ANYOF_NPUNCT:
11523                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11524                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11525                     break;
11526                 case ANYOF_PSXSPC:
11527                     DO_POSIX(ret, namedclass, properties,
11528                                             PL_PosixSpace, PL_XPosixSpace);
11529                     break;
11530                 case ANYOF_NPSXSPC:
11531                     DO_N_POSIX(ret, namedclass, properties,
11532                                             PL_PosixSpace, PL_XPosixSpace);
11533                     break;
11534                 case ANYOF_SPACE:
11535                     DO_POSIX(ret, namedclass, properties,
11536                                             PL_PerlSpace, PL_XPerlSpace);
11537                     break;
11538                 case ANYOF_NSPACE:
11539                     DO_N_POSIX(ret, namedclass, properties,
11540                                             PL_PerlSpace, PL_XPerlSpace);
11541                     break;
11542                 case ANYOF_UPPER:   /* Same as LOWER, above */
11543                 case ANYOF_NUPPER:
11544                 {
11545                     SV* ascii_source;
11546                     SV* l1_source;
11547                     const char *Xname;
11548
11549                     if (FOLD && ! LOC) {
11550                         ascii_source = PL_PosixAlpha;
11551                         l1_source = PL_L1Cased;
11552                         Xname = "Cased";
11553                     }
11554                     else {
11555                         ascii_source = PL_PosixUpper;
11556                         l1_source = PL_L1PosixUpper;
11557                         Xname = "XPosixUpper";
11558                     }
11559                     if (namedclass == ANYOF_UPPER) {
11560                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11561                                     ascii_source, l1_source, Xname, listsv);
11562                     }
11563                     else {
11564                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11565                         properties, ascii_source, l1_source, Xname, listsv);
11566                     }
11567                     break;
11568                 }
11569                 case ANYOF_ALNUM:   /* Really is 'Word' */
11570                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11571                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11572                     break;
11573                 case ANYOF_NALNUM:
11574                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11575                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11576                     break;
11577                 case ANYOF_VERTWS:
11578                     /* For these, we use the nonbitmap, as /d doesn't make a
11579                      * difference in what these match.  There would be problems
11580                      * if these characters had folds other than themselves, as
11581                      * nonbitmap is subject to folding */
11582                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11583                     break;
11584                 case ANYOF_NVERTWS:
11585                     _invlist_union_complement_2nd(nonbitmap,
11586                                                     PL_VertSpace, &nonbitmap);
11587                     break;
11588                 case ANYOF_XDIGIT:
11589                     DO_POSIX(ret, namedclass, properties,
11590                                             PL_PosixXDigit, PL_XPosixXDigit);
11591                     break;
11592                 case ANYOF_NXDIGIT:
11593                     DO_N_POSIX(ret, namedclass, properties,
11594                                             PL_PosixXDigit, PL_XPosixXDigit);
11595                     break;
11596                 case ANYOF_MAX:
11597                     /* this is to handle \p and \P */
11598                     break;
11599                 default:
11600                     vFAIL("Invalid [::] class");
11601                     break;
11602                 }
11603
11604                 continue;
11605             }
11606         } /* end of namedclass \blah */
11607
11608         if (range) {
11609             if (prevvalue > (IV)value) /* b-a */ {
11610                 const int w = RExC_parse - rangebegin;
11611                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11612                 range = 0; /* not a valid range */
11613             }
11614         }
11615         else {
11616             prevvalue = value; /* save the beginning of the range */
11617             if (RExC_parse+1 < RExC_end
11618                 && *RExC_parse == '-'
11619                 && RExC_parse[1] != ']')
11620             {
11621                 RExC_parse++;
11622
11623                 /* a bad range like \w-, [:word:]- ? */
11624                 if (namedclass > OOB_NAMEDCLASS) {
11625                     if (ckWARN(WARN_REGEXP)) {
11626                         const int w =
11627                             RExC_parse >= rangebegin ?
11628                             RExC_parse - rangebegin : 0;
11629                         vWARN4(RExC_parse,
11630                                "False [] range \"%*.*s\"",
11631                                w, w, rangebegin);
11632                     }
11633                     if (!SIZE_ONLY)
11634                         stored +=
11635                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11636                 } else
11637                     range = 1;  /* yeah, it's a range! */
11638                 continue;       /* but do it the next time */
11639             }
11640         }
11641
11642         /* non-Latin1 code point implies unicode semantics.  Must be set in
11643          * pass1 so is there for the whole of pass 2 */
11644         if (value > 255) {
11645             RExC_uni_semantics = 1;
11646         }
11647
11648         /* now is the next time */
11649         if (!SIZE_ONLY) {
11650             if (prevvalue < 256) {
11651                 const IV ceilvalue = value < 256 ? value : 255;
11652                 IV i;
11653 #ifdef EBCDIC
11654                 /* In EBCDIC [\x89-\x91] should include
11655                  * the \x8e but [i-j] should not. */
11656                 if (literal_endpoint == 2 &&
11657                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11658                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11659                 {
11660                     if (isLOWER(prevvalue)) {
11661                         for (i = prevvalue; i <= ceilvalue; i++)
11662                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11663                                 stored +=
11664                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11665                             }
11666                     } else {
11667                         for (i = prevvalue; i <= ceilvalue; i++)
11668                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11669                                 stored +=
11670                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11671                             }
11672                     }
11673                 }
11674                 else
11675 #endif
11676                       for (i = prevvalue; i <= ceilvalue; i++) {
11677                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11678                       }
11679           }
11680           if (value > 255) {
11681             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11682             const UV natvalue      = NATIVE_TO_UNI(value);
11683             nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11684         }
11685 #ifdef EBCDIC
11686             literal_endpoint = 0;
11687 #endif
11688         }
11689
11690         range = 0; /* this range (if it was one) is done now */
11691     }
11692
11693
11694
11695     if (SIZE_ONLY)
11696         return ret;
11697     /****** !SIZE_ONLY AFTER HERE *********/
11698
11699     /* If folding and there are code points above 255, we calculate all
11700      * characters that could fold to or from the ones already on the list */
11701     if (FOLD && nonbitmap) {
11702         UV start, end;  /* End points of code point ranges */
11703
11704         SV* fold_intersection = NULL;
11705
11706         /* This is a list of all the characters that participate in folds
11707             * (except marks, etc in multi-char folds */
11708         if (! PL_utf8_foldable) {
11709             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11710             PL_utf8_foldable = _swash_to_invlist(swash);
11711             SvREFCNT_dec(swash);
11712         }
11713
11714         /* This is a hash that for a particular fold gives all characters
11715             * that are involved in it */
11716         if (! PL_utf8_foldclosures) {
11717
11718             /* If we were unable to find any folds, then we likely won't be
11719              * able to find the closures.  So just create an empty list.
11720              * Folding will effectively be restricted to the non-Unicode rules
11721              * hard-coded into Perl.  (This case happens legitimately during
11722              * compilation of Perl itself before the Unicode tables are
11723              * generated) */
11724             if (invlist_len(PL_utf8_foldable) == 0) {
11725                 PL_utf8_foldclosures = newHV();
11726             } else {
11727                 /* If the folds haven't been read in, call a fold function
11728                     * to force that */
11729                 if (! PL_utf8_tofold) {
11730                     U8 dummy[UTF8_MAXBYTES+1];
11731                     STRLEN dummy_len;
11732
11733                     /* This particular string is above \xff in both UTF-8 and
11734                      * UTFEBCDIC */
11735                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11736                     assert(PL_utf8_tofold); /* Verify that worked */
11737                 }
11738                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11739             }
11740         }
11741
11742         /* Only the characters in this class that participate in folds need be
11743          * checked.  Get the intersection of this class and all the possible
11744          * characters that are foldable.  This can quickly narrow down a large
11745          * class */
11746         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11747
11748         /* Now look at the foldable characters in this class individually */
11749         invlist_iterinit(fold_intersection);
11750         while (invlist_iternext(fold_intersection, &start, &end)) {
11751             UV j;
11752
11753             /* Look at every character in the range */
11754             for (j = start; j <= end; j++) {
11755
11756                 /* Get its fold */
11757                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11758                 STRLEN foldlen;
11759                 const UV f =
11760                     _to_uni_fold_flags(j, foldbuf, &foldlen,
11761                                        (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11762
11763                 if (foldlen > (STRLEN)UNISKIP(f)) {
11764
11765                     /* Any multicharacter foldings (disallowed in lookbehind
11766                      * patterns) require the following transform: [ABCDEF] ->
11767                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11768                      * folds into "rst", all other characters fold to single
11769                      * characters.  We save away these multicharacter foldings,
11770                      * to be later saved as part of the additional "s" data. */
11771                     if (! RExC_in_lookbehind) {
11772                         U8* loc = foldbuf;
11773                         U8* e = foldbuf + foldlen;
11774
11775                         /* If any of the folded characters of this are in the
11776                          * Latin1 range, tell the regex engine that this can
11777                          * match a non-utf8 target string.  The only multi-byte
11778                          * fold whose source is in the Latin1 range (U+00DF)
11779                          * applies only when the target string is utf8, or
11780                          * under unicode rules */
11781                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11782                             while (loc < e) {
11783
11784                                 /* Can't mix ascii with non- under /aa */
11785                                 if (MORE_ASCII_RESTRICTED
11786                                     && (isASCII(*loc) != isASCII(j)))
11787                                 {
11788                                     goto end_multi_fold;
11789                                 }
11790                                 if (UTF8_IS_INVARIANT(*loc)
11791                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11792                                 {
11793                                     /* Can't mix above and below 256 under LOC
11794                                      */
11795                                     if (LOC) {
11796                                         goto end_multi_fold;
11797                                     }
11798                                     ANYOF_FLAGS(ret)
11799                                             |= ANYOF_NONBITMAP_NON_UTF8;
11800                                     break;
11801                                 }
11802                                 loc += UTF8SKIP(loc);
11803                             }
11804                         }
11805
11806                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11807                     end_multi_fold: ;
11808                     }
11809
11810                     /* This is special-cased, as it is the only letter which
11811                      * has both a multi-fold and single-fold in Latin1.  All
11812                      * the other chars that have single and multi-folds are
11813                      * always in utf8, and the utf8 folding algorithm catches
11814                      * them */
11815                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11816                         stored += set_regclass_bit(pRExC_state,
11817                                         ret,
11818                                         LATIN_SMALL_LETTER_SHARP_S,
11819                                         &l1_fold_invlist, &unicode_alternate);
11820                     }
11821                 }
11822                 else {
11823                     /* Single character fold.  Add everything in its fold
11824                      * closure to the list that this node should match */
11825                     SV** listp;
11826
11827                     /* The fold closures data structure is a hash with the keys
11828                      * being every character that is folded to, like 'k', and
11829                      * the values each an array of everything that folds to its
11830                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11831                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11832                                     (char *) foldbuf, foldlen, FALSE)))
11833                     {
11834                         AV* list = (AV*) *listp;
11835                         IV k;
11836                         for (k = 0; k <= av_len(list); k++) {
11837                             SV** c_p = av_fetch(list, k, FALSE);
11838                             UV c;
11839                             if (c_p == NULL) {
11840                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11841                             }
11842                             c = SvUV(*c_p);
11843
11844                             /* /aa doesn't allow folds between ASCII and non-;
11845                              * /l doesn't allow them between above and below
11846                              * 256 */
11847                             if ((MORE_ASCII_RESTRICTED
11848                                  && (isASCII(c) != isASCII(j)))
11849                                     || (LOC && ((c < 256) != (j < 256))))
11850                             {
11851                                 continue;
11852                             }
11853
11854                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11855                                 stored += set_regclass_bit(pRExC_state,
11856                                         ret,
11857                                         (U8) c,
11858                                         &l1_fold_invlist, &unicode_alternate);
11859                             }
11860                                 /* It may be that the code point is already in
11861                                  * this range or already in the bitmap, in
11862                                  * which case we need do nothing */
11863                             else if ((c < start || c > end)
11864                                         && (c > 255
11865                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11866                             {
11867                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11868                             }
11869                         }
11870                     }
11871                 }
11872             }
11873         }
11874         SvREFCNT_dec(fold_intersection);
11875     }
11876
11877     /* Combine the two lists into one. */
11878     if (l1_fold_invlist) {
11879         if (nonbitmap) {
11880             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11881             SvREFCNT_dec(l1_fold_invlist);
11882         }
11883         else {
11884             nonbitmap = l1_fold_invlist;
11885         }
11886     }
11887
11888     /* And combine the result (if any) with any inversion list from properties.
11889      * The lists are kept separate up to now because we don't want to fold the
11890      * properties */
11891     if (properties) {
11892         if (nonbitmap) {
11893             _invlist_union(nonbitmap, properties, &nonbitmap);
11894             SvREFCNT_dec(properties);
11895         }
11896         else {
11897             nonbitmap = properties;
11898         }
11899     }
11900
11901     /* Here, <nonbitmap> contains all the code points we can determine at
11902      * compile time that we haven't put into the bitmap.  Go through it, and
11903      * for things that belong in the bitmap, put them there, and delete from
11904      * <nonbitmap> */
11905     if (nonbitmap) {
11906
11907         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11908          * possibly only should match when the target string is UTF-8 */
11909         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11910
11911         /* This gets set if we actually need to modify things */
11912         bool change_invlist = FALSE;
11913
11914         UV start, end;
11915
11916         /* Start looking through <nonbitmap> */
11917         invlist_iterinit(nonbitmap);
11918         while (invlist_iternext(nonbitmap, &start, &end)) {
11919             UV high;
11920             int i;
11921
11922             /* Quit if are above what we should change */
11923             if (start > max_cp_to_set) {
11924                 break;
11925             }
11926
11927             change_invlist = TRUE;
11928
11929             /* Set all the bits in the range, up to the max that we are doing */
11930             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11931             for (i = start; i <= (int) high; i++) {
11932                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11933                     ANYOF_BITMAP_SET(ret, i);
11934                     stored++;
11935                     prevvalue = value;
11936                     value = i;
11937                 }
11938             }
11939         }
11940
11941         /* Done with loop; remove any code points that are in the bitmap from
11942          * <nonbitmap> */
11943         if (change_invlist) {
11944             _invlist_subtract(nonbitmap,
11945                               (DEPENDS_SEMANTICS)
11946                                 ? PL_ASCII
11947                                 : PL_Latin1,
11948                               &nonbitmap);
11949         }
11950
11951         /* If have completely emptied it, remove it completely */
11952         if (invlist_len(nonbitmap) == 0) {
11953             SvREFCNT_dec(nonbitmap);
11954             nonbitmap = NULL;
11955         }
11956     }
11957
11958     /* Here, we have calculated what code points should be in the character
11959      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11960      * case of DEPENDS rules.
11961      *
11962      * Now we can see about various optimizations.  Fold calculation (which we
11963      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11964      * would invert to include K, which under /i would match k, which it
11965      * shouldn't. */
11966
11967     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11968      * set the FOLD flag yet, so this does optimize those.  It doesn't
11969      * optimize locale.  Doing so perhaps could be done as long as there is
11970      * nothing like \w in it; some thought also would have to be given to the
11971      * interaction with above 0x100 chars */
11972     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11973         && ! LOC
11974         && ! unicode_alternate
11975         /* In case of /d, there are some things that should match only when in
11976          * not in the bitmap, i.e., they require UTF8 to match.  These are
11977          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11978          * case, they don't require UTF8, so can invert here */
11979         && (! nonbitmap
11980             || ! DEPENDS_SEMANTICS
11981             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11982         && SvCUR(listsv) == initial_listsv_len)
11983     {
11984         int i;
11985         if (! nonbitmap) {
11986             for (i = 0; i < 256; ++i) {
11987                 if (ANYOF_BITMAP_TEST(ret, i)) {
11988                     ANYOF_BITMAP_CLEAR(ret, i);
11989                 }
11990                 else {
11991                     ANYOF_BITMAP_SET(ret, i);
11992                     prevvalue = value;
11993                     value = i;
11994                 }
11995             }
11996             /* The inversion means that everything above 255 is matched */
11997             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11998         }
11999         else {
12000             /* Here, also has things outside the bitmap that may overlap with
12001              * the bitmap.  We have to sync them up, so that they get inverted
12002              * in both places.  Earlier, we removed all overlaps except in the
12003              * case of /d rules, so no syncing is needed except for this case
12004              */
12005             SV *remove_list = NULL;
12006
12007             if (DEPENDS_SEMANTICS) {
12008                 UV start, end;
12009
12010                 /* Set the bits that correspond to the ones that aren't in the
12011                  * bitmap.  Otherwise, when we invert, we'll miss these.
12012                  * Earlier, we removed from the nonbitmap all code points
12013                  * < 128, so there is no extra work here */
12014                 invlist_iterinit(nonbitmap);
12015                 while (invlist_iternext(nonbitmap, &start, &end)) {
12016                     if (start > 255) {  /* The bit map goes to 255 */
12017                         break;
12018                     }
12019                     if (end > 255) {
12020                         end = 255;
12021                     }
12022                     for (i = start; i <= (int) end; ++i) {
12023                         ANYOF_BITMAP_SET(ret, i);
12024                         prevvalue = value;
12025                         value = i;
12026                     }
12027                 }
12028             }
12029
12030             /* Now invert both the bitmap and the nonbitmap.  Anything in the
12031              * bitmap has to also be removed from the non-bitmap, but again,
12032              * there should not be overlap unless is /d rules. */
12033             _invlist_invert(nonbitmap);
12034
12035             /* Any swash can't be used as-is, because we've inverted things */
12036             if (swash) {
12037                 SvREFCNT_dec(swash);
12038                 swash = NULL;
12039             }
12040
12041             for (i = 0; i < 256; ++i) {
12042                 if (ANYOF_BITMAP_TEST(ret, i)) {
12043                     ANYOF_BITMAP_CLEAR(ret, i);
12044                     if (DEPENDS_SEMANTICS) {
12045                         if (! remove_list) {
12046                             remove_list = _new_invlist(2);
12047                         }
12048                         remove_list = add_cp_to_invlist(remove_list, i);
12049                     }
12050                 }
12051                 else {
12052                     ANYOF_BITMAP_SET(ret, i);
12053                     prevvalue = value;
12054                     value = i;
12055                 }
12056             }
12057
12058             /* And do the removal */
12059             if (DEPENDS_SEMANTICS) {
12060                 if (remove_list) {
12061                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12062                     SvREFCNT_dec(remove_list);
12063                 }
12064             }
12065             else {
12066                 /* There is no overlap for non-/d, so just delete anything
12067                  * below 256 */
12068                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12069             }
12070         }
12071
12072         stored = 256 - stored;
12073
12074         /* Clear the invert flag since have just done it here */
12075         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12076     }
12077
12078     /* Folding in the bitmap is taken care of above, but not for locale (for
12079      * which we have to wait to see what folding is in effect at runtime), and
12080      * for some things not in the bitmap (only the upper latin folds in this
12081      * case, as all other single-char folding has been set above).  Set
12082      * run-time fold flag for these */
12083     if (FOLD && (LOC
12084                 || (DEPENDS_SEMANTICS
12085                     && nonbitmap
12086                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12087                 || unicode_alternate))
12088     {
12089         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12090     }
12091
12092     /* A single character class can be "optimized" into an EXACTish node.
12093      * Note that since we don't currently count how many characters there are
12094      * outside the bitmap, we are XXX missing optimization possibilities for
12095      * them.  This optimization can't happen unless this is a truly single
12096      * character class, which means that it can't be an inversion into a
12097      * many-character class, and there must be no possibility of there being
12098      * things outside the bitmap.  'stored' (only) for locales doesn't include
12099      * \w, etc, so have to make a special test that they aren't present
12100      *
12101      * Similarly A 2-character class of the very special form like [bB] can be
12102      * optimized into an EXACTFish node, but only for non-locales, and for
12103      * characters which only have the two folds; so things like 'fF' and 'Ii'
12104      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12105      * FI'. */
12106     if (! nonbitmap
12107         && ! unicode_alternate
12108         && SvCUR(listsv) == initial_listsv_len
12109         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12110         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12111                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12112             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12113                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12114                                  /* If the latest code point has a fold whose
12115                                   * bit is set, it must be the only other one */
12116                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12117                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12118     {
12119         /* Note that the information needed to decide to do this optimization
12120          * is not currently available until the 2nd pass, and that the actually
12121          * used EXACTish node takes less space than the calculated ANYOF node,
12122          * and hence the amount of space calculated in the first pass is larger
12123          * than actually used, so this optimization doesn't gain us any space.
12124          * But an EXACT node is faster than an ANYOF node, and can be combined
12125          * with any adjacent EXACT nodes later by the optimizer for further
12126          * gains.  The speed of executing an EXACTF is similar to an ANYOF
12127          * node, so the optimization advantage comes from the ability to join
12128          * it to adjacent EXACT nodes */
12129
12130         const char * cur_parse= RExC_parse;
12131         U8 op;
12132         RExC_emit = (regnode *)orig_emit;
12133         RExC_parse = (char *)orig_parse;
12134
12135         if (stored == 1) {
12136
12137             /* A locale node with one point can be folded; all the other cases
12138              * with folding will have two points, since we calculate them above
12139              */
12140             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12141                  op = EXACTFL;
12142             }
12143             else {
12144                 op = EXACT;
12145             }
12146         }
12147         else {   /* else 2 chars in the bit map: the folds of each other */
12148
12149             /* Use the folded value, which for the cases where we get here,
12150              * is just the lower case of the current one (which may resolve to
12151              * itself, or to the other one */
12152             value = toLOWER_LATIN1(value);
12153
12154             /* To join adjacent nodes, they must be the exact EXACTish type.
12155              * Try to use the most likely type, by using EXACTFA if possible,
12156              * then EXACTFU if the regex calls for it, or is required because
12157              * the character is non-ASCII.  (If <value> is ASCII, its fold is
12158              * also ASCII for the cases where we get here.) */
12159             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12160                 op = EXACTFA;
12161             }
12162             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12163                 op = EXACTFU;
12164             }
12165             else {    /* Otherwise, more likely to be EXACTF type */
12166                 op = EXACTF;
12167             }
12168         }
12169
12170         ret = reg_node(pRExC_state, op);
12171         RExC_parse = (char *)cur_parse;
12172         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12173             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12174             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12175             STR_LEN(ret)= 2;
12176             RExC_emit += STR_SZ(2);
12177         }
12178         else {
12179             *STRING(ret)= (char)value;
12180             STR_LEN(ret)= 1;
12181             RExC_emit += STR_SZ(1);
12182         }
12183         SvREFCNT_dec(listsv);
12184         return ret;
12185     }
12186
12187     /* If there is a swash and more than one element, we can't use the swash in
12188      * the optimization below. */
12189     if (swash && element_count > 1) {
12190         SvREFCNT_dec(swash);
12191         swash = NULL;
12192     }
12193     if (! nonbitmap
12194         && SvCUR(listsv) == initial_listsv_len
12195         && ! unicode_alternate)
12196     {
12197         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12198         SvREFCNT_dec(listsv);
12199         SvREFCNT_dec(unicode_alternate);
12200     }
12201     else {
12202         /* av[0] stores the character class description in its textual form:
12203          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12204          *       appropriate swash, and is also useful for dumping the regnode.
12205          * av[1] if NULL, is a placeholder to later contain the swash computed
12206          *       from av[0].  But if no further computation need be done, the
12207          *       swash is stored there now.
12208          * av[2] stores the multicharacter foldings, used later in
12209          *       regexec.c:S_reginclass().
12210          * av[3] stores the nonbitmap inversion list for use in addition or
12211          *       instead of av[0]; not used if av[1] isn't NULL
12212          * av[4] is set if any component of the class is from a user-defined
12213          *       property; not used if av[1] isn't NULL */
12214         AV * const av = newAV();
12215         SV *rv;
12216
12217         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12218                         ? &PL_sv_undef
12219                         : listsv);
12220         if (swash) {
12221             av_store(av, 1, swash);
12222             SvREFCNT_dec(nonbitmap);
12223         }
12224         else {
12225             av_store(av, 1, NULL);
12226             if (nonbitmap) {
12227                 av_store(av, 3, nonbitmap);
12228                 av_store(av, 4, newSVuv(has_user_defined_property));
12229             }
12230         }
12231
12232         /* Store any computed multi-char folds only if we are allowing
12233          * them */
12234         if (allow_full_fold) {
12235             av_store(av, 2, MUTABLE_SV(unicode_alternate));
12236             if (unicode_alternate) { /* This node is variable length */
12237                 OP(ret) = ANYOFV;
12238             }
12239         }
12240         else {
12241             av_store(av, 2, NULL);
12242         }
12243         rv = newRV_noinc(MUTABLE_SV(av));
12244         n = add_data(pRExC_state, 1, "s");
12245         RExC_rxi->data->data[n] = (void*)rv;
12246         ARG_SET(ret, n);
12247     }
12248     return ret;
12249 }
12250
12251
12252 /* reg_skipcomment()
12253
12254    Absorbs an /x style # comments from the input stream.
12255    Returns true if there is more text remaining in the stream.
12256    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12257    terminates the pattern without including a newline.
12258
12259    Note its the callers responsibility to ensure that we are
12260    actually in /x mode
12261
12262 */
12263
12264 STATIC bool
12265 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12266 {
12267     bool ended = 0;
12268
12269     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12270
12271     while (RExC_parse < RExC_end)
12272         if (*RExC_parse++ == '\n') {
12273             ended = 1;
12274             break;
12275         }
12276     if (!ended) {
12277         /* we ran off the end of the pattern without ending
12278            the comment, so we have to add an \n when wrapping */
12279         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12280         return 0;
12281     } else
12282         return 1;
12283 }
12284
12285 /* nextchar()
12286
12287    Advances the parse position, and optionally absorbs
12288    "whitespace" from the inputstream.
12289
12290    Without /x "whitespace" means (?#...) style comments only,
12291    with /x this means (?#...) and # comments and whitespace proper.
12292
12293    Returns the RExC_parse point from BEFORE the scan occurs.
12294
12295    This is the /x friendly way of saying RExC_parse++.
12296 */
12297
12298 STATIC char*
12299 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12300 {
12301     char* const retval = RExC_parse++;
12302
12303     PERL_ARGS_ASSERT_NEXTCHAR;
12304
12305     for (;;) {
12306         if (RExC_end - RExC_parse >= 3
12307             && *RExC_parse == '('
12308             && RExC_parse[1] == '?'
12309             && RExC_parse[2] == '#')
12310         {
12311             while (*RExC_parse != ')') {
12312                 if (RExC_parse == RExC_end)
12313                     FAIL("Sequence (?#... not terminated");
12314                 RExC_parse++;
12315             }
12316             RExC_parse++;
12317             continue;
12318         }
12319         if (RExC_flags & RXf_PMf_EXTENDED) {
12320             if (isSPACE(*RExC_parse)) {
12321                 RExC_parse++;
12322                 continue;
12323             }
12324             else if (*RExC_parse == '#') {
12325                 if ( reg_skipcomment( pRExC_state ) )
12326                     continue;
12327             }
12328         }
12329         return retval;
12330     }
12331 }
12332
12333 /*
12334 - reg_node - emit a node
12335 */
12336 STATIC regnode *                        /* Location. */
12337 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12338 {
12339     dVAR;
12340     register regnode *ptr;
12341     regnode * const ret = RExC_emit;
12342     GET_RE_DEBUG_FLAGS_DECL;
12343
12344     PERL_ARGS_ASSERT_REG_NODE;
12345
12346     if (SIZE_ONLY) {
12347         SIZE_ALIGN(RExC_size);
12348         RExC_size += 1;
12349         return(ret);
12350     }
12351     if (RExC_emit >= RExC_emit_bound)
12352         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12353                    op, RExC_emit, RExC_emit_bound);
12354
12355     NODE_ALIGN_FILL(ret);
12356     ptr = ret;
12357     FILL_ADVANCE_NODE(ptr, op);
12358 #ifdef RE_TRACK_PATTERN_OFFSETS
12359     if (RExC_offsets) {         /* MJD */
12360         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12361               "reg_node", __LINE__,
12362               PL_reg_name[op],
12363               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12364                 ? "Overwriting end of array!\n" : "OK",
12365               (UV)(RExC_emit - RExC_emit_start),
12366               (UV)(RExC_parse - RExC_start),
12367               (UV)RExC_offsets[0]));
12368         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12369     }
12370 #endif
12371     RExC_emit = ptr;
12372     return(ret);
12373 }
12374
12375 /*
12376 - reganode - emit a node with an argument
12377 */
12378 STATIC regnode *                        /* Location. */
12379 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12380 {
12381     dVAR;
12382     register regnode *ptr;
12383     regnode * const ret = RExC_emit;
12384     GET_RE_DEBUG_FLAGS_DECL;
12385
12386     PERL_ARGS_ASSERT_REGANODE;
12387
12388     if (SIZE_ONLY) {
12389         SIZE_ALIGN(RExC_size);
12390         RExC_size += 2;
12391         /*
12392            We can't do this:
12393
12394            assert(2==regarglen[op]+1);
12395
12396            Anything larger than this has to allocate the extra amount.
12397            If we changed this to be:
12398
12399            RExC_size += (1 + regarglen[op]);
12400
12401            then it wouldn't matter. Its not clear what side effect
12402            might come from that so its not done so far.
12403            -- dmq
12404         */
12405         return(ret);
12406     }
12407     if (RExC_emit >= RExC_emit_bound)
12408         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12409                    op, RExC_emit, RExC_emit_bound);
12410
12411     NODE_ALIGN_FILL(ret);
12412     ptr = ret;
12413     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12414 #ifdef RE_TRACK_PATTERN_OFFSETS
12415     if (RExC_offsets) {         /* MJD */
12416         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12417               "reganode",
12418               __LINE__,
12419               PL_reg_name[op],
12420               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12421               "Overwriting end of array!\n" : "OK",
12422               (UV)(RExC_emit - RExC_emit_start),
12423               (UV)(RExC_parse - RExC_start),
12424               (UV)RExC_offsets[0]));
12425         Set_Cur_Node_Offset;
12426     }
12427 #endif
12428     RExC_emit = ptr;
12429     return(ret);
12430 }
12431
12432 /*
12433 - reguni - emit (if appropriate) a Unicode character
12434 */
12435 STATIC STRLEN
12436 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12437 {
12438     dVAR;
12439
12440     PERL_ARGS_ASSERT_REGUNI;
12441
12442     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12443 }
12444
12445 /*
12446 - reginsert - insert an operator in front of already-emitted operand
12447 *
12448 * Means relocating the operand.
12449 */
12450 STATIC void
12451 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12452 {
12453     dVAR;
12454     register regnode *src;
12455     register regnode *dst;
12456     register regnode *place;
12457     const int offset = regarglen[(U8)op];
12458     const int size = NODE_STEP_REGNODE + offset;
12459     GET_RE_DEBUG_FLAGS_DECL;
12460
12461     PERL_ARGS_ASSERT_REGINSERT;
12462     PERL_UNUSED_ARG(depth);
12463 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12464     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12465     if (SIZE_ONLY) {
12466         RExC_size += size;
12467         return;
12468     }
12469
12470     src = RExC_emit;
12471     RExC_emit += size;
12472     dst = RExC_emit;
12473     if (RExC_open_parens) {
12474         int paren;
12475         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12476         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12477             if ( RExC_open_parens[paren] >= opnd ) {
12478                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12479                 RExC_open_parens[paren] += size;
12480             } else {
12481                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12482             }
12483             if ( RExC_close_parens[paren] >= opnd ) {
12484                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12485                 RExC_close_parens[paren] += size;
12486             } else {
12487                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12488             }
12489         }
12490     }
12491
12492     while (src > opnd) {
12493         StructCopy(--src, --dst, regnode);
12494 #ifdef RE_TRACK_PATTERN_OFFSETS
12495         if (RExC_offsets) {     /* MJD 20010112 */
12496             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12497                   "reg_insert",
12498                   __LINE__,
12499                   PL_reg_name[op],
12500                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12501                     ? "Overwriting end of array!\n" : "OK",
12502                   (UV)(src - RExC_emit_start),
12503                   (UV)(dst - RExC_emit_start),
12504                   (UV)RExC_offsets[0]));
12505             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12506             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12507         }
12508 #endif
12509     }
12510
12511
12512     place = opnd;               /* Op node, where operand used to be. */
12513 #ifdef RE_TRACK_PATTERN_OFFSETS
12514     if (RExC_offsets) {         /* MJD */
12515         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12516               "reginsert",
12517               __LINE__,
12518               PL_reg_name[op],
12519               (UV)(place - RExC_emit_start) > RExC_offsets[0]
12520               ? "Overwriting end of array!\n" : "OK",
12521               (UV)(place - RExC_emit_start),
12522               (UV)(RExC_parse - RExC_start),
12523               (UV)RExC_offsets[0]));
12524         Set_Node_Offset(place, RExC_parse);
12525         Set_Node_Length(place, 1);
12526     }
12527 #endif
12528     src = NEXTOPER(place);
12529     FILL_ADVANCE_NODE(place, op);
12530     Zero(src, offset, regnode);
12531 }
12532
12533 /*
12534 - regtail - set the next-pointer at the end of a node chain of p to val.
12535 - SEE ALSO: regtail_study
12536 */
12537 /* TODO: All three parms should be const */
12538 STATIC void
12539 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12540 {
12541     dVAR;
12542     register regnode *scan;
12543     GET_RE_DEBUG_FLAGS_DECL;
12544
12545     PERL_ARGS_ASSERT_REGTAIL;
12546 #ifndef DEBUGGING
12547     PERL_UNUSED_ARG(depth);
12548 #endif
12549
12550     if (SIZE_ONLY)
12551         return;
12552
12553     /* Find last node. */
12554     scan = p;
12555     for (;;) {
12556         regnode * const temp = regnext(scan);
12557         DEBUG_PARSE_r({
12558             SV * const mysv=sv_newmortal();
12559             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12560             regprop(RExC_rx, mysv, scan);
12561             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12562                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12563                     (temp == NULL ? "->" : ""),
12564                     (temp == NULL ? PL_reg_name[OP(val)] : "")
12565             );
12566         });
12567         if (temp == NULL)
12568             break;
12569         scan = temp;
12570     }
12571
12572     if (reg_off_by_arg[OP(scan)]) {
12573         ARG_SET(scan, val - scan);
12574     }
12575     else {
12576         NEXT_OFF(scan) = val - scan;
12577     }
12578 }
12579
12580 #ifdef DEBUGGING
12581 /*
12582 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12583 - Look for optimizable sequences at the same time.
12584 - currently only looks for EXACT chains.
12585
12586 This is experimental code. The idea is to use this routine to perform
12587 in place optimizations on branches and groups as they are constructed,
12588 with the long term intention of removing optimization from study_chunk so
12589 that it is purely analytical.
12590
12591 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12592 to control which is which.
12593
12594 */
12595 /* TODO: All four parms should be const */
12596
12597 STATIC U8
12598 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12599 {
12600     dVAR;
12601     register regnode *scan;
12602     U8 exact = PSEUDO;
12603 #ifdef EXPERIMENTAL_INPLACESCAN
12604     I32 min = 0;
12605 #endif
12606     GET_RE_DEBUG_FLAGS_DECL;
12607
12608     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12609
12610
12611     if (SIZE_ONLY)
12612         return exact;
12613
12614     /* Find last node. */
12615
12616     scan = p;
12617     for (;;) {
12618         regnode * const temp = regnext(scan);
12619 #ifdef EXPERIMENTAL_INPLACESCAN
12620         if (PL_regkind[OP(scan)] == EXACT) {
12621             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12622             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12623                 return EXACT;
12624         }
12625 #endif
12626         if ( exact ) {
12627             switch (OP(scan)) {
12628                 case EXACT:
12629                 case EXACTF:
12630                 case EXACTFA:
12631                 case EXACTFU:
12632                 case EXACTFU_SS:
12633                 case EXACTFU_TRICKYFOLD:
12634                 case EXACTFL:
12635                         if( exact == PSEUDO )
12636                             exact= OP(scan);
12637                         else if ( exact != OP(scan) )
12638                             exact= 0;
12639                 case NOTHING:
12640                     break;
12641                 default:
12642                     exact= 0;
12643             }
12644         }
12645         DEBUG_PARSE_r({
12646             SV * const mysv=sv_newmortal();
12647             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12648             regprop(RExC_rx, mysv, scan);
12649             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12650                 SvPV_nolen_const(mysv),
12651                 REG_NODE_NUM(scan),
12652                 PL_reg_name[exact]);
12653         });
12654         if (temp == NULL)
12655             break;
12656         scan = temp;
12657     }
12658     DEBUG_PARSE_r({
12659         SV * const mysv_val=sv_newmortal();
12660         DEBUG_PARSE_MSG("");
12661         regprop(RExC_rx, mysv_val, val);
12662         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12663                       SvPV_nolen_const(mysv_val),
12664                       (IV)REG_NODE_NUM(val),
12665                       (IV)(val - scan)
12666         );
12667     });
12668     if (reg_off_by_arg[OP(scan)]) {
12669         ARG_SET(scan, val - scan);
12670     }
12671     else {
12672         NEXT_OFF(scan) = val - scan;
12673     }
12674
12675     return exact;
12676 }
12677 #endif
12678
12679 /*
12680  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12681  */
12682 #ifdef DEBUGGING
12683 static void
12684 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12685 {
12686     int bit;
12687     int set=0;
12688     regex_charset cs;
12689
12690     for (bit=0; bit<32; bit++) {
12691         if (flags & (1<<bit)) {
12692             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12693                 continue;
12694             }
12695             if (!set++ && lead)
12696                 PerlIO_printf(Perl_debug_log, "%s",lead);
12697             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12698         }
12699     }
12700     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12701             if (!set++ && lead) {
12702                 PerlIO_printf(Perl_debug_log, "%s",lead);
12703             }
12704             switch (cs) {
12705                 case REGEX_UNICODE_CHARSET:
12706                     PerlIO_printf(Perl_debug_log, "UNICODE");
12707                     break;
12708                 case REGEX_LOCALE_CHARSET:
12709                     PerlIO_printf(Perl_debug_log, "LOCALE");
12710                     break;
12711                 case REGEX_ASCII_RESTRICTED_CHARSET:
12712                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12713                     break;
12714                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12715                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12716                     break;
12717                 default:
12718                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12719                     break;
12720             }
12721     }
12722     if (lead)  {
12723         if (set)
12724             PerlIO_printf(Perl_debug_log, "\n");
12725         else
12726             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12727     }
12728 }
12729 #endif
12730
12731 void
12732 Perl_regdump(pTHX_ const regexp *r)
12733 {
12734 #ifdef DEBUGGING
12735     dVAR;
12736     SV * const sv = sv_newmortal();
12737     SV *dsv= sv_newmortal();
12738     RXi_GET_DECL(r,ri);
12739     GET_RE_DEBUG_FLAGS_DECL;
12740
12741     PERL_ARGS_ASSERT_REGDUMP;
12742
12743     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12744
12745     /* Header fields of interest. */
12746     if (r->anchored_substr) {
12747         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12748             RE_SV_DUMPLEN(r->anchored_substr), 30);
12749         PerlIO_printf(Perl_debug_log,
12750                       "anchored %s%s at %"IVdf" ",
12751                       s, RE_SV_TAIL(r->anchored_substr),
12752                       (IV)r->anchored_offset);
12753     } else if (r->anchored_utf8) {
12754         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12755             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12756         PerlIO_printf(Perl_debug_log,
12757                       "anchored utf8 %s%s at %"IVdf" ",
12758                       s, RE_SV_TAIL(r->anchored_utf8),
12759                       (IV)r->anchored_offset);
12760     }
12761     if (r->float_substr) {
12762         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12763             RE_SV_DUMPLEN(r->float_substr), 30);
12764         PerlIO_printf(Perl_debug_log,
12765                       "floating %s%s at %"IVdf"..%"UVuf" ",
12766                       s, RE_SV_TAIL(r->float_substr),
12767                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12768     } else if (r->float_utf8) {
12769         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12770             RE_SV_DUMPLEN(r->float_utf8), 30);
12771         PerlIO_printf(Perl_debug_log,
12772                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12773                       s, RE_SV_TAIL(r->float_utf8),
12774                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12775     }
12776     if (r->check_substr || r->check_utf8)
12777         PerlIO_printf(Perl_debug_log,
12778                       (const char *)
12779                       (r->check_substr == r->float_substr
12780                        && r->check_utf8 == r->float_utf8
12781                        ? "(checking floating" : "(checking anchored"));
12782     if (r->extflags & RXf_NOSCAN)
12783         PerlIO_printf(Perl_debug_log, " noscan");
12784     if (r->extflags & RXf_CHECK_ALL)
12785         PerlIO_printf(Perl_debug_log, " isall");
12786     if (r->check_substr || r->check_utf8)
12787         PerlIO_printf(Perl_debug_log, ") ");
12788
12789     if (ri->regstclass) {
12790         regprop(r, sv, ri->regstclass);
12791         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12792     }
12793     if (r->extflags & RXf_ANCH) {
12794         PerlIO_printf(Perl_debug_log, "anchored");
12795         if (r->extflags & RXf_ANCH_BOL)
12796             PerlIO_printf(Perl_debug_log, "(BOL)");
12797         if (r->extflags & RXf_ANCH_MBOL)
12798             PerlIO_printf(Perl_debug_log, "(MBOL)");
12799         if (r->extflags & RXf_ANCH_SBOL)
12800             PerlIO_printf(Perl_debug_log, "(SBOL)");
12801         if (r->extflags & RXf_ANCH_GPOS)
12802             PerlIO_printf(Perl_debug_log, "(GPOS)");
12803         PerlIO_putc(Perl_debug_log, ' ');
12804     }
12805     if (r->extflags & RXf_GPOS_SEEN)
12806         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12807     if (r->intflags & PREGf_SKIP)
12808         PerlIO_printf(Perl_debug_log, "plus ");
12809     if (r->intflags & PREGf_IMPLICIT)
12810         PerlIO_printf(Perl_debug_log, "implicit ");
12811     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12812     if (r->extflags & RXf_EVAL_SEEN)
12813         PerlIO_printf(Perl_debug_log, "with eval ");
12814     PerlIO_printf(Perl_debug_log, "\n");
12815     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12816 #else
12817     PERL_ARGS_ASSERT_REGDUMP;
12818     PERL_UNUSED_CONTEXT;
12819     PERL_UNUSED_ARG(r);
12820 #endif  /* DEBUGGING */
12821 }
12822
12823 /*
12824 - regprop - printable representation of opcode
12825 */
12826 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12827 STMT_START { \
12828         if (do_sep) {                           \
12829             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12830             if (flags & ANYOF_INVERT)           \
12831                 /*make sure the invert info is in each */ \
12832                 sv_catpvs(sv, "^");             \
12833             do_sep = 0;                         \
12834         }                                       \
12835 } STMT_END
12836
12837 void
12838 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12839 {
12840 #ifdef DEBUGGING
12841     dVAR;
12842     register int k;
12843     RXi_GET_DECL(prog,progi);
12844     GET_RE_DEBUG_FLAGS_DECL;
12845
12846     PERL_ARGS_ASSERT_REGPROP;
12847
12848     sv_setpvs(sv, "");
12849
12850     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12851         /* It would be nice to FAIL() here, but this may be called from
12852            regexec.c, and it would be hard to supply pRExC_state. */
12853         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12854     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12855
12856     k = PL_regkind[OP(o)];
12857
12858     if (k == EXACT) {
12859         sv_catpvs(sv, " ");
12860         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12861          * is a crude hack but it may be the best for now since
12862          * we have no flag "this EXACTish node was UTF-8"
12863          * --jhi */
12864         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12865                   PERL_PV_ESCAPE_UNI_DETECT |
12866                   PERL_PV_ESCAPE_NONASCII   |
12867                   PERL_PV_PRETTY_ELLIPSES   |
12868                   PERL_PV_PRETTY_LTGT       |
12869                   PERL_PV_PRETTY_NOCLEAR
12870                   );
12871     } else if (k == TRIE) {
12872         /* print the details of the trie in dumpuntil instead, as
12873          * progi->data isn't available here */
12874         const char op = OP(o);
12875         const U32 n = ARG(o);
12876         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12877                (reg_ac_data *)progi->data->data[n] :
12878                NULL;
12879         const reg_trie_data * const trie
12880             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12881
12882         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12883         DEBUG_TRIE_COMPILE_r(
12884             Perl_sv_catpvf(aTHX_ sv,
12885                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12886                 (UV)trie->startstate,
12887                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12888                 (UV)trie->wordcount,
12889                 (UV)trie->minlen,
12890                 (UV)trie->maxlen,
12891                 (UV)TRIE_CHARCOUNT(trie),
12892                 (UV)trie->uniquecharcount
12893             )
12894         );
12895         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12896             int i;
12897             int rangestart = -1;
12898             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12899             sv_catpvs(sv, "[");
12900             for (i = 0; i <= 256; i++) {
12901                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12902                     if (rangestart == -1)
12903                         rangestart = i;
12904                 } else if (rangestart != -1) {
12905                     if (i <= rangestart + 3)
12906                         for (; rangestart < i; rangestart++)
12907                             put_byte(sv, rangestart);
12908                     else {
12909                         put_byte(sv, rangestart);
12910                         sv_catpvs(sv, "-");
12911                         put_byte(sv, i - 1);
12912                     }
12913                     rangestart = -1;
12914                 }
12915             }
12916             sv_catpvs(sv, "]");
12917         }
12918
12919     } else if (k == CURLY) {
12920         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12921             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12922         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12923     }
12924     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12925         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12926     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12927         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12928         if ( RXp_PAREN_NAMES(prog) ) {
12929             if ( k != REF || (OP(o) < NREF)) {
12930                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12931                 SV **name= av_fetch(list, ARG(o), 0 );
12932                 if (name)
12933                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12934             }
12935             else {
12936                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12937                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12938                 I32 *nums=(I32*)SvPVX(sv_dat);
12939                 SV **name= av_fetch(list, nums[0], 0 );
12940                 I32 n;
12941                 if (name) {
12942                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12943                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12944                                     (n ? "," : ""), (IV)nums[n]);
12945                     }
12946                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12947                 }
12948             }
12949         }
12950     } else if (k == GOSUB)
12951         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12952     else if (k == VERB) {
12953         if (!o->flags)
12954             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12955                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12956     } else if (k == LOGICAL)
12957         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12958     else if (k == ANYOF) {
12959         int i, rangestart = -1;
12960         const U8 flags = ANYOF_FLAGS(o);
12961         int do_sep = 0;
12962
12963         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12964         static const char * const anyofs[] = {
12965             "\\w",
12966             "\\W",
12967             "\\s",
12968             "\\S",
12969             "\\d",
12970             "\\D",
12971             "[:alnum:]",
12972             "[:^alnum:]",
12973             "[:alpha:]",
12974             "[:^alpha:]",
12975             "[:ascii:]",
12976             "[:^ascii:]",
12977             "[:cntrl:]",
12978             "[:^cntrl:]",
12979             "[:graph:]",
12980             "[:^graph:]",
12981             "[:lower:]",
12982             "[:^lower:]",
12983             "[:print:]",
12984             "[:^print:]",
12985             "[:punct:]",
12986             "[:^punct:]",
12987             "[:upper:]",
12988             "[:^upper:]",
12989             "[:xdigit:]",
12990             "[:^xdigit:]",
12991             "[:space:]",
12992             "[:^space:]",
12993             "[:blank:]",
12994             "[:^blank:]"
12995         };
12996
12997         if (flags & ANYOF_LOCALE)
12998             sv_catpvs(sv, "{loc}");
12999         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13000             sv_catpvs(sv, "{i}");
13001         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13002         if (flags & ANYOF_INVERT)
13003             sv_catpvs(sv, "^");
13004
13005         /* output what the standard cp 0-255 bitmap matches */
13006         for (i = 0; i <= 256; i++) {
13007             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13008                 if (rangestart == -1)
13009                     rangestart = i;
13010             } else if (rangestart != -1) {
13011                 if (i <= rangestart + 3)
13012                     for (; rangestart < i; rangestart++)
13013                         put_byte(sv, rangestart);
13014                 else {
13015                     put_byte(sv, rangestart);
13016                     sv_catpvs(sv, "-");
13017                     put_byte(sv, i - 1);
13018                 }
13019                 do_sep = 1;
13020                 rangestart = -1;
13021             }
13022         }
13023
13024         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13025         /* output any special charclass tests (used entirely under use locale) */
13026         if (ANYOF_CLASS_TEST_ANY_SET(o))
13027             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13028                 if (ANYOF_CLASS_TEST(o,i)) {
13029                     sv_catpv(sv, anyofs[i]);
13030                     do_sep = 1;
13031                 }
13032
13033         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13034
13035         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13036             sv_catpvs(sv, "{non-utf8-latin1-all}");
13037         }
13038
13039         /* output information about the unicode matching */
13040         if (flags & ANYOF_UNICODE_ALL)
13041             sv_catpvs(sv, "{unicode_all}");
13042         else if (ANYOF_NONBITMAP(o))
13043             sv_catpvs(sv, "{unicode}");
13044         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13045             sv_catpvs(sv, "{outside bitmap}");
13046
13047         if (ANYOF_NONBITMAP(o)) {
13048             SV *lv; /* Set if there is something outside the bit map */
13049             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13050             bool byte_output = FALSE;   /* If something in the bitmap has been
13051                                            output */
13052
13053             if (lv && lv != &PL_sv_undef) {
13054                 if (sw) {
13055                     U8 s[UTF8_MAXBYTES_CASE+1];
13056
13057                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13058                         uvchr_to_utf8(s, i);
13059
13060                         if (i < 256
13061                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13062                                                                things already
13063                                                                output as part
13064                                                                of the bitmap */
13065                             && swash_fetch(sw, s, TRUE))
13066                         {
13067                             if (rangestart == -1)
13068                                 rangestart = i;
13069                         } else if (rangestart != -1) {
13070                             byte_output = TRUE;
13071                             if (i <= rangestart + 3)
13072                                 for (; rangestart < i; rangestart++) {
13073                                     put_byte(sv, rangestart);
13074                                 }
13075                             else {
13076                                 put_byte(sv, rangestart);
13077                                 sv_catpvs(sv, "-");
13078                                 put_byte(sv, i-1);
13079                             }
13080                             rangestart = -1;
13081                         }
13082                     }
13083                 }
13084
13085                 {
13086                     char *s = savesvpv(lv);
13087                     char * const origs = s;
13088
13089                     while (*s && *s != '\n')
13090                         s++;
13091
13092                     if (*s == '\n') {
13093                         const char * const t = ++s;
13094
13095                         if (byte_output) {
13096                             sv_catpvs(sv, " ");
13097                         }
13098
13099                         while (*s) {
13100                             if (*s == '\n') {
13101
13102                                 /* Truncate very long output */
13103                                 if (s - origs > 256) {
13104                                     Perl_sv_catpvf(aTHX_ sv,
13105                                                    "%.*s...",
13106                                                    (int) (s - origs - 1),
13107                                                    t);
13108                                     goto out_dump;
13109                                 }
13110                                 *s = ' ';
13111                             }
13112                             else if (*s == '\t') {
13113                                 *s = '-';
13114                             }
13115                             s++;
13116                         }
13117                         if (s[-1] == ' ')
13118                             s[-1] = 0;
13119
13120                         sv_catpv(sv, t);
13121                     }
13122
13123                 out_dump:
13124
13125                     Safefree(origs);
13126                 }
13127                 SvREFCNT_dec(lv);
13128             }
13129         }
13130
13131         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13132     }
13133     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13134         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13135 #else
13136     PERL_UNUSED_CONTEXT;
13137     PERL_UNUSED_ARG(sv);
13138     PERL_UNUSED_ARG(o);
13139     PERL_UNUSED_ARG(prog);
13140 #endif  /* DEBUGGING */
13141 }
13142
13143 SV *
13144 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13145 {                               /* Assume that RE_INTUIT is set */
13146     dVAR;
13147     struct regexp *const prog = (struct regexp *)SvANY(r);
13148     GET_RE_DEBUG_FLAGS_DECL;
13149
13150     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13151     PERL_UNUSED_CONTEXT;
13152
13153     DEBUG_COMPILE_r(
13154         {
13155             const char * const s = SvPV_nolen_const(prog->check_substr
13156                       ? prog->check_substr : prog->check_utf8);
13157
13158             if (!PL_colorset) reginitcolors();
13159             PerlIO_printf(Perl_debug_log,
13160                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13161                       PL_colors[4],
13162                       prog->check_substr ? "" : "utf8 ",
13163                       PL_colors[5],PL_colors[0],
13164                       s,
13165                       PL_colors[1],
13166                       (strlen(s) > 60 ? "..." : ""));
13167         } );
13168
13169     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13170 }
13171
13172 /*
13173    pregfree()
13174
13175    handles refcounting and freeing the perl core regexp structure. When
13176    it is necessary to actually free the structure the first thing it
13177    does is call the 'free' method of the regexp_engine associated to
13178    the regexp, allowing the handling of the void *pprivate; member
13179    first. (This routine is not overridable by extensions, which is why
13180    the extensions free is called first.)
13181
13182    See regdupe and regdupe_internal if you change anything here.
13183 */
13184 #ifndef PERL_IN_XSUB_RE
13185 void
13186 Perl_pregfree(pTHX_ REGEXP *r)
13187 {
13188     SvREFCNT_dec(r);
13189 }
13190
13191 void
13192 Perl_pregfree2(pTHX_ REGEXP *rx)
13193 {
13194     dVAR;
13195     struct regexp *const r = (struct regexp *)SvANY(rx);
13196     GET_RE_DEBUG_FLAGS_DECL;
13197
13198     PERL_ARGS_ASSERT_PREGFREE2;
13199
13200     if (r->mother_re) {
13201         ReREFCNT_dec(r->mother_re);
13202     } else {
13203         CALLREGFREE_PVT(rx); /* free the private data */
13204         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13205     }
13206     if (r->substrs) {
13207         SvREFCNT_dec(r->anchored_substr);
13208         SvREFCNT_dec(r->anchored_utf8);
13209         SvREFCNT_dec(r->float_substr);
13210         SvREFCNT_dec(r->float_utf8);
13211         Safefree(r->substrs);
13212     }
13213     RX_MATCH_COPY_FREE(rx);
13214 #ifdef PERL_OLD_COPY_ON_WRITE
13215     SvREFCNT_dec(r->saved_copy);
13216 #endif
13217     Safefree(r->offs);
13218     SvREFCNT_dec(r->qr_anoncv);
13219 }
13220
13221 /*  reg_temp_copy()
13222
13223     This is a hacky workaround to the structural issue of match results
13224     being stored in the regexp structure which is in turn stored in
13225     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13226     could be PL_curpm in multiple contexts, and could require multiple
13227     result sets being associated with the pattern simultaneously, such
13228     as when doing a recursive match with (??{$qr})
13229
13230     The solution is to make a lightweight copy of the regexp structure
13231     when a qr// is returned from the code executed by (??{$qr}) this
13232     lightweight copy doesn't actually own any of its data except for
13233     the starp/end and the actual regexp structure itself.
13234
13235 */
13236
13237
13238 REGEXP *
13239 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13240 {
13241     struct regexp *ret;
13242     struct regexp *const r = (struct regexp *)SvANY(rx);
13243     register const I32 npar = r->nparens+1;
13244
13245     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13246
13247     if (!ret_x)
13248         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13249     ret = (struct regexp *)SvANY(ret_x);
13250
13251     (void)ReREFCNT_inc(rx);
13252     /* We can take advantage of the existing "copied buffer" mechanism in SVs
13253        by pointing directly at the buffer, but flagging that the allocated
13254        space in the copy is zero. As we've just done a struct copy, it's now
13255        a case of zero-ing that, rather than copying the current length.  */
13256     SvPV_set(ret_x, RX_WRAPPED(rx));
13257     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13258     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13259            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13260     SvLEN_set(ret_x, 0);
13261     SvSTASH_set(ret_x, NULL);
13262     SvMAGIC_set(ret_x, NULL);
13263     Newx(ret->offs, npar, regexp_paren_pair);
13264     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13265     if (r->substrs) {
13266         Newx(ret->substrs, 1, struct reg_substr_data);
13267         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13268
13269         SvREFCNT_inc_void(ret->anchored_substr);
13270         SvREFCNT_inc_void(ret->anchored_utf8);
13271         SvREFCNT_inc_void(ret->float_substr);
13272         SvREFCNT_inc_void(ret->float_utf8);
13273
13274         /* check_substr and check_utf8, if non-NULL, point to either their
13275            anchored or float namesakes, and don't hold a second reference.  */
13276     }
13277     RX_MATCH_COPIED_off(ret_x);
13278 #ifdef PERL_OLD_COPY_ON_WRITE
13279     ret->saved_copy = NULL;
13280 #endif
13281     ret->mother_re = rx;
13282     SvREFCNT_inc_void(ret->qr_anoncv);
13283
13284     return ret_x;
13285 }
13286 #endif
13287
13288 /* regfree_internal()
13289
13290    Free the private data in a regexp. This is overloadable by
13291    extensions. Perl takes care of the regexp structure in pregfree(),
13292    this covers the *pprivate pointer which technically perl doesn't
13293    know about, however of course we have to handle the
13294    regexp_internal structure when no extension is in use.
13295
13296    Note this is called before freeing anything in the regexp
13297    structure.
13298  */
13299
13300 void
13301 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13302 {
13303     dVAR;
13304     struct regexp *const r = (struct regexp *)SvANY(rx);
13305     RXi_GET_DECL(r,ri);
13306     GET_RE_DEBUG_FLAGS_DECL;
13307
13308     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13309
13310     DEBUG_COMPILE_r({
13311         if (!PL_colorset)
13312             reginitcolors();
13313         {
13314             SV *dsv= sv_newmortal();
13315             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13316                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13317             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13318                 PL_colors[4],PL_colors[5],s);
13319         }
13320     });
13321 #ifdef RE_TRACK_PATTERN_OFFSETS
13322     if (ri->u.offsets)
13323         Safefree(ri->u.offsets);             /* 20010421 MJD */
13324 #endif
13325     if (ri->code_blocks) {
13326         int n;
13327         for (n = 0; n < ri->num_code_blocks; n++)
13328             SvREFCNT_dec(ri->code_blocks[n].src_regex);
13329         Safefree(ri->code_blocks);
13330     }
13331
13332     if (ri->data) {
13333         int n = ri->data->count;
13334         PAD* new_comppad = NULL;
13335         PAD* old_comppad;
13336         PADOFFSET refcnt;
13337
13338         while (--n >= 0) {
13339           /* If you add a ->what type here, update the comment in regcomp.h */
13340             switch (ri->data->what[n]) {
13341             case 'a':
13342             case 'r':
13343             case 's':
13344             case 'S':
13345             case 'u':
13346                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13347                 break;
13348             case 'f':
13349                 Safefree(ri->data->data[n]);
13350                 break;
13351             case 'p':
13352                 new_comppad = MUTABLE_AV(ri->data->data[n]);
13353                 break;
13354             case 'o':
13355                 if (new_comppad == NULL)
13356                     Perl_croak(aTHX_ "panic: pregfree comppad");
13357                 PAD_SAVE_LOCAL(old_comppad,
13358                     /* Watch out for global destruction's random ordering. */
13359                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
13360                 );
13361                 OP_REFCNT_LOCK;
13362                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
13363                 OP_REFCNT_UNLOCK;
13364                 if (!refcnt)
13365                     op_free((OP_4tree*)ri->data->data[n]);
13366
13367                 PAD_RESTORE_LOCAL(old_comppad);
13368                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
13369                 new_comppad = NULL;
13370                 break;
13371             case 'l':
13372             case 'L':
13373             case 'n':
13374                 break;
13375             case 'T':
13376                 { /* Aho Corasick add-on structure for a trie node.
13377                      Used in stclass optimization only */
13378                     U32 refcount;
13379                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13380                     OP_REFCNT_LOCK;
13381                     refcount = --aho->refcount;
13382                     OP_REFCNT_UNLOCK;
13383                     if ( !refcount ) {
13384                         PerlMemShared_free(aho->states);
13385                         PerlMemShared_free(aho->fail);
13386                          /* do this last!!!! */
13387                         PerlMemShared_free(ri->data->data[n]);
13388                         PerlMemShared_free(ri->regstclass);
13389                     }
13390                 }
13391                 break;
13392             case 't':
13393                 {
13394                     /* trie structure. */
13395                     U32 refcount;
13396                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13397                     OP_REFCNT_LOCK;
13398                     refcount = --trie->refcount;
13399                     OP_REFCNT_UNLOCK;
13400                     if ( !refcount ) {
13401                         PerlMemShared_free(trie->charmap);
13402                         PerlMemShared_free(trie->states);
13403                         PerlMemShared_free(trie->trans);
13404                         if (trie->bitmap)
13405                             PerlMemShared_free(trie->bitmap);
13406                         if (trie->jump)
13407                             PerlMemShared_free(trie->jump);
13408                         PerlMemShared_free(trie->wordinfo);
13409                         /* do this last!!!! */
13410                         PerlMemShared_free(ri->data->data[n]);
13411                     }
13412                 }
13413                 break;
13414             default:
13415                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13416             }
13417         }
13418         Safefree(ri->data->what);
13419         Safefree(ri->data);
13420     }
13421
13422     Safefree(ri);
13423 }
13424
13425 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13426 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13427 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
13428
13429 /*
13430    re_dup - duplicate a regexp.
13431
13432    This routine is expected to clone a given regexp structure. It is only
13433    compiled under USE_ITHREADS.
13434
13435    After all of the core data stored in struct regexp is duplicated
13436    the regexp_engine.dupe method is used to copy any private data
13437    stored in the *pprivate pointer. This allows extensions to handle
13438    any duplication it needs to do.
13439
13440    See pregfree() and regfree_internal() if you change anything here.
13441 */
13442 #if defined(USE_ITHREADS)
13443 #ifndef PERL_IN_XSUB_RE
13444 void
13445 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13446 {
13447     dVAR;
13448     I32 npar;
13449     const struct regexp *r = (const struct regexp *)SvANY(sstr);
13450     struct regexp *ret = (struct regexp *)SvANY(dstr);
13451
13452     PERL_ARGS_ASSERT_RE_DUP_GUTS;
13453
13454     npar = r->nparens+1;
13455     Newx(ret->offs, npar, regexp_paren_pair);
13456     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13457     if(ret->swap) {
13458         /* no need to copy these */
13459         Newx(ret->swap, npar, regexp_paren_pair);
13460     }
13461
13462     if (ret->substrs) {
13463         /* Do it this way to avoid reading from *r after the StructCopy().
13464            That way, if any of the sv_dup_inc()s dislodge *r from the L1
13465            cache, it doesn't matter.  */
13466         const bool anchored = r->check_substr
13467             ? r->check_substr == r->anchored_substr
13468             : r->check_utf8 == r->anchored_utf8;
13469         Newx(ret->substrs, 1, struct reg_substr_data);
13470         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13471
13472         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13473         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13474         ret->float_substr = sv_dup_inc(ret->float_substr, param);
13475         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13476
13477         /* check_substr and check_utf8, if non-NULL, point to either their
13478            anchored or float namesakes, and don't hold a second reference.  */
13479
13480         if (ret->check_substr) {
13481             if (anchored) {
13482                 assert(r->check_utf8 == r->anchored_utf8);
13483                 ret->check_substr = ret->anchored_substr;
13484                 ret->check_utf8 = ret->anchored_utf8;
13485             } else {
13486                 assert(r->check_substr == r->float_substr);
13487                 assert(r->check_utf8 == r->float_utf8);
13488                 ret->check_substr = ret->float_substr;
13489                 ret->check_utf8 = ret->float_utf8;
13490             }
13491         } else if (ret->check_utf8) {
13492             if (anchored) {
13493                 ret->check_utf8 = ret->anchored_utf8;
13494             } else {
13495                 ret->check_utf8 = ret->float_utf8;
13496             }
13497         }
13498     }
13499
13500     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13501     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13502
13503     if (ret->pprivate)
13504         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13505
13506     if (RX_MATCH_COPIED(dstr))
13507         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
13508     else
13509         ret->subbeg = NULL;
13510 #ifdef PERL_OLD_COPY_ON_WRITE
13511     ret->saved_copy = NULL;
13512 #endif
13513
13514     if (ret->mother_re) {
13515         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13516             /* Our storage points directly to our mother regexp, but that's
13517                1: a buffer in a different thread
13518                2: something we no longer hold a reference on
13519                so we need to copy it locally.  */
13520             /* Note we need to use SvCUR(), rather than
13521                SvLEN(), on our mother_re, because it, in
13522                turn, may well be pointing to its own mother_re.  */
13523             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13524                                    SvCUR(ret->mother_re)+1));
13525             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13526         }
13527         ret->mother_re      = NULL;
13528     }
13529     ret->gofs = 0;
13530 }
13531 #endif /* PERL_IN_XSUB_RE */
13532
13533 /*
13534    regdupe_internal()
13535
13536    This is the internal complement to regdupe() which is used to copy
13537    the structure pointed to by the *pprivate pointer in the regexp.
13538    This is the core version of the extension overridable cloning hook.
13539    The regexp structure being duplicated will be copied by perl prior
13540    to this and will be provided as the regexp *r argument, however
13541    with the /old/ structures pprivate pointer value. Thus this routine
13542    may override any copying normally done by perl.
13543
13544    It returns a pointer to the new regexp_internal structure.
13545 */
13546
13547 void *
13548 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13549 {
13550     dVAR;
13551     struct regexp *const r = (struct regexp *)SvANY(rx);
13552     regexp_internal *reti;
13553     int len;
13554     RXi_GET_DECL(r,ri);
13555
13556     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13557
13558     len = ProgLen(ri);
13559
13560     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13561     Copy(ri->program, reti->program, len+1, regnode);
13562
13563     reti->num_code_blocks = ri->num_code_blocks;
13564     if (ri->code_blocks) {
13565         int n;
13566         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13567                 struct reg_code_block);
13568         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13569                 struct reg_code_block);
13570         for (n = 0; n < ri->num_code_blocks; n++)
13571              reti->code_blocks[n].src_regex = (REGEXP*)
13572                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13573     }
13574     else
13575         reti->code_blocks = NULL;
13576
13577     reti->regstclass = NULL;
13578
13579     if (ri->data) {
13580         struct reg_data *d;
13581         const int count = ri->data->count;
13582         int i;
13583
13584         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13585                 char, struct reg_data);
13586         Newx(d->what, count, U8);
13587
13588         d->count = count;
13589         for (i = 0; i < count; i++) {
13590             d->what[i] = ri->data->what[i];
13591             switch (d->what[i]) {
13592                 /* legal options are one of: sSfpontTua
13593                    see also regcomp.h and pregfree() */
13594             case 'a': /* actually an AV, but the dup function is identical.  */
13595             case 'r':
13596             case 's':
13597             case 'S':
13598             case 'p': /* actually an AV, but the dup function is identical.  */
13599             case 'u': /* actually an HV, but the dup function is identical.  */
13600                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13601                 break;
13602             case 'f':
13603                 /* This is cheating. */
13604                 Newx(d->data[i], 1, struct regnode_charclass_class);
13605                 StructCopy(ri->data->data[i], d->data[i],
13606                             struct regnode_charclass_class);
13607                 reti->regstclass = (regnode*)d->data[i];
13608                 break;
13609             case 'o':
13610                 /* Compiled op trees are readonly and in shared memory,
13611                    and can thus be shared without duplication. */
13612                 OP_REFCNT_LOCK;
13613                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13614                 OP_REFCNT_UNLOCK;
13615                 break;
13616             case 'T':
13617                 /* Trie stclasses are readonly and can thus be shared
13618                  * without duplication. We free the stclass in pregfree
13619                  * when the corresponding reg_ac_data struct is freed.
13620                  */
13621                 reti->regstclass= ri->regstclass;
13622                 /* Fall through */
13623             case 't':
13624                 OP_REFCNT_LOCK;
13625                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13626                 OP_REFCNT_UNLOCK;
13627                 /* Fall through */
13628             case 'l':
13629             case 'L':
13630             case 'n':
13631                 d->data[i] = ri->data->data[i];
13632                 break;
13633             default:
13634                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13635             }
13636         }
13637
13638         reti->data = d;
13639     }
13640     else
13641         reti->data = NULL;
13642
13643     reti->name_list_idx = ri->name_list_idx;
13644
13645 #ifdef RE_TRACK_PATTERN_OFFSETS
13646     if (ri->u.offsets) {
13647         Newx(reti->u.offsets, 2*len+1, U32);
13648         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13649     }
13650 #else
13651     SetProgLen(reti,len);
13652 #endif
13653
13654     return (void*)reti;
13655 }
13656
13657 #endif    /* USE_ITHREADS */
13658
13659 #ifndef PERL_IN_XSUB_RE
13660
13661 /*
13662  - regnext - dig the "next" pointer out of a node
13663  */
13664 regnode *
13665 Perl_regnext(pTHX_ register regnode *p)
13666 {
13667     dVAR;
13668     register I32 offset;
13669
13670     if (!p)
13671         return(NULL);
13672
13673     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13674         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13675     }
13676
13677     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13678     if (offset == 0)
13679         return(NULL);
13680
13681     return(p+offset);
13682 }
13683 #endif
13684
13685 STATIC void
13686 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13687 {
13688     va_list args;
13689     STRLEN l1 = strlen(pat1);
13690     STRLEN l2 = strlen(pat2);
13691     char buf[512];
13692     SV *msv;
13693     const char *message;
13694
13695     PERL_ARGS_ASSERT_RE_CROAK2;
13696
13697     if (l1 > 510)
13698         l1 = 510;
13699     if (l1 + l2 > 510)
13700         l2 = 510 - l1;
13701     Copy(pat1, buf, l1 , char);
13702     Copy(pat2, buf + l1, l2 , char);
13703     buf[l1 + l2] = '\n';
13704     buf[l1 + l2 + 1] = '\0';
13705 #ifdef I_STDARG
13706     /* ANSI variant takes additional second argument */
13707     va_start(args, pat2);
13708 #else
13709     va_start(args);
13710 #endif
13711     msv = vmess(buf, &args);
13712     va_end(args);
13713     message = SvPV_const(msv,l1);
13714     if (l1 > 512)
13715         l1 = 512;
13716     Copy(message, buf, l1 , char);
13717     buf[l1-1] = '\0';                   /* Overwrite \n */
13718     Perl_croak(aTHX_ "%s", buf);
13719 }
13720
13721 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13722
13723 #ifndef PERL_IN_XSUB_RE
13724 void
13725 Perl_save_re_context(pTHX)
13726 {
13727     dVAR;
13728
13729     struct re_save_state *state;
13730
13731     SAVEVPTR(PL_curcop);
13732     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13733
13734     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13735     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13736     SSPUSHUV(SAVEt_RE_STATE);
13737
13738     Copy(&PL_reg_state, state, 1, struct re_save_state);
13739
13740     PL_reg_start_tmp = 0;
13741     PL_reg_start_tmpl = 0;
13742     PL_reg_oldsaved = NULL;
13743     PL_reg_oldsavedlen = 0;
13744     PL_reg_maxiter = 0;
13745     PL_reg_leftiter = 0;
13746     PL_reg_poscache = NULL;
13747     PL_reg_poscache_size = 0;
13748 #ifdef PERL_OLD_COPY_ON_WRITE
13749     PL_nrs = NULL;
13750 #endif
13751
13752     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13753     if (PL_curpm) {
13754         const REGEXP * const rx = PM_GETRE(PL_curpm);
13755         if (rx) {
13756             U32 i;
13757             for (i = 1; i <= RX_NPARENS(rx); i++) {
13758                 char digits[TYPE_CHARS(long)];
13759                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13760                 GV *const *const gvp
13761                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13762
13763                 if (gvp) {
13764                     GV * const gv = *gvp;
13765                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13766                         save_scalar(gv);
13767                 }
13768             }
13769         }
13770     }
13771 }
13772 #endif
13773
13774 static void
13775 clear_re(pTHX_ void *r)
13776 {
13777     dVAR;
13778     ReREFCNT_dec((REGEXP *)r);
13779 }
13780
13781 #ifdef DEBUGGING
13782
13783 STATIC void
13784 S_put_byte(pTHX_ SV *sv, int c)
13785 {
13786     PERL_ARGS_ASSERT_PUT_BYTE;
13787
13788     /* Our definition of isPRINT() ignores locales, so only bytes that are
13789        not part of UTF-8 are considered printable. I assume that the same
13790        holds for UTF-EBCDIC.
13791        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13792        which Wikipedia says:
13793
13794        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13795        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13796        identical, to the ASCII delete (DEL) or rubout control character.
13797        ) So the old condition can be simplified to !isPRINT(c)  */
13798     if (!isPRINT(c)) {
13799         if (c < 256) {
13800             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13801         }
13802         else {
13803             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13804         }
13805     }
13806     else {
13807         const char string = c;
13808         if (c == '-' || c == ']' || c == '\\' || c == '^')
13809             sv_catpvs(sv, "\\");
13810         sv_catpvn(sv, &string, 1);
13811     }
13812 }
13813
13814
13815 #define CLEAR_OPTSTART \
13816     if (optstart) STMT_START { \
13817             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13818             optstart=NULL; \
13819     } STMT_END
13820
13821 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13822
13823 STATIC const regnode *
13824 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13825             const regnode *last, const regnode *plast,
13826             SV* sv, I32 indent, U32 depth)
13827 {
13828     dVAR;
13829     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13830     register const regnode *next;
13831     const regnode *optstart= NULL;
13832
13833     RXi_GET_DECL(r,ri);
13834     GET_RE_DEBUG_FLAGS_DECL;
13835
13836     PERL_ARGS_ASSERT_DUMPUNTIL;
13837
13838 #ifdef DEBUG_DUMPUNTIL
13839     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13840         last ? last-start : 0,plast ? plast-start : 0);
13841 #endif
13842
13843     if (plast && plast < last)
13844         last= plast;
13845
13846     while (PL_regkind[op] != END && (!last || node < last)) {
13847         /* While that wasn't END last time... */
13848         NODE_ALIGN(node);
13849         op = OP(node);
13850         if (op == CLOSE || op == WHILEM)
13851             indent--;
13852         next = regnext((regnode *)node);
13853
13854         /* Where, what. */
13855         if (OP(node) == OPTIMIZED) {
13856             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13857                 optstart = node;
13858             else
13859                 goto after_print;
13860         } else
13861             CLEAR_OPTSTART;
13862
13863         regprop(r, sv, node);
13864         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13865                       (int)(2*indent + 1), "", SvPVX_const(sv));
13866
13867         if (OP(node) != OPTIMIZED) {
13868             if (next == NULL)           /* Next ptr. */
13869                 PerlIO_printf(Perl_debug_log, " (0)");
13870             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13871                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13872             else
13873                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13874             (void)PerlIO_putc(Perl_debug_log, '\n');
13875         }
13876
13877       after_print:
13878         if (PL_regkind[(U8)op] == BRANCHJ) {
13879             assert(next);
13880             {
13881                 register const regnode *nnode = (OP(next) == LONGJMP
13882                                              ? regnext((regnode *)next)
13883                                              : next);
13884                 if (last && nnode > last)
13885                     nnode = last;
13886                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13887             }
13888         }
13889         else if (PL_regkind[(U8)op] == BRANCH) {
13890             assert(next);
13891             DUMPUNTIL(NEXTOPER(node), next);
13892         }
13893         else if ( PL_regkind[(U8)op]  == TRIE ) {
13894             const regnode *this_trie = node;
13895             const char op = OP(node);
13896             const U32 n = ARG(node);
13897             const reg_ac_data * const ac = op>=AHOCORASICK ?
13898                (reg_ac_data *)ri->data->data[n] :
13899                NULL;
13900             const reg_trie_data * const trie =
13901                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13902 #ifdef DEBUGGING
13903             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13904 #endif
13905             const regnode *nextbranch= NULL;
13906             I32 word_idx;
13907             sv_setpvs(sv, "");
13908             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13909                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13910
13911                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13912                    (int)(2*(indent+3)), "",
13913                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13914                             PL_colors[0], PL_colors[1],
13915                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13916                             PERL_PV_PRETTY_ELLIPSES    |
13917                             PERL_PV_PRETTY_LTGT
13918                             )
13919                             : "???"
13920                 );
13921                 if (trie->jump) {
13922                     U16 dist= trie->jump[word_idx+1];
13923                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13924                                   (UV)((dist ? this_trie + dist : next) - start));
13925                     if (dist) {
13926                         if (!nextbranch)
13927                             nextbranch= this_trie + trie->jump[0];
13928                         DUMPUNTIL(this_trie + dist, nextbranch);
13929                     }
13930                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13931                         nextbranch= regnext((regnode *)nextbranch);
13932                 } else {
13933                     PerlIO_printf(Perl_debug_log, "\n");
13934                 }
13935             }
13936             if (last && next > last)
13937                 node= last;
13938             else
13939                 node= next;
13940         }
13941         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13942             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13943                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13944         }
13945         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13946             assert(next);
13947             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13948         }
13949         else if ( op == PLUS || op == STAR) {
13950             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13951         }
13952         else if (PL_regkind[(U8)op] == ANYOF) {
13953             /* arglen 1 + class block */
13954             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13955                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13956             node = NEXTOPER(node);
13957         }
13958         else if (PL_regkind[(U8)op] == EXACT) {
13959             /* Literal string, where present. */
13960             node += NODE_SZ_STR(node) - 1;
13961             node = NEXTOPER(node);
13962         }
13963         else {
13964             node = NEXTOPER(node);
13965             node += regarglen[(U8)op];
13966         }
13967         if (op == CURLYX || op == OPEN)
13968             indent++;
13969     }
13970     CLEAR_OPTSTART;
13971 #ifdef DEBUG_DUMPUNTIL
13972     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13973 #endif
13974     return node;
13975 }
13976
13977 #endif  /* DEBUGGING */
13978
13979 /*
13980  * Local variables:
13981  * c-indentation-style: bsd
13982  * c-basic-offset: 4
13983  * indent-tabs-mode: nil
13984  * End:
13985  *
13986  * ex: set ts=8 sts=4 sw=4 et:
13987  */