regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145 #if ADD_TO_REGEXEC
 146     char        *starttry;              /* -Dr: where regtry was called. */
 147 #define RExC_starttry   (pRExC_state->starttry)
 148 #endif
 149 #ifdef DEBUGGING
 150     const char  *lastparse;
 151     I32         lastnum;
 152     AV          *paren_name_list;       /* idx -> name */
 153 #define RExC_lastparse  (pRExC_state->lastparse)
 154 #define RExC_lastnum    (pRExC_state->lastnum)
 155 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 156 #endif
 157 } RExC_state_t;
 158
 159 #define RExC_flags      (pRExC_state->flags)
 160 #define RExC_precomp    (pRExC_state->precomp)
 161 #define RExC_rx_sv      (pRExC_state->rx_sv)
 162 #define RExC_rx         (pRExC_state->rx)
 163 #define RExC_rxi        (pRExC_state->rxi)
 164 #define RExC_start      (pRExC_state->start)
 165 #define RExC_end        (pRExC_state->end)
 166 #define RExC_parse      (pRExC_state->parse)
 167 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 168 #ifdef RE_TRACK_PATTERN_OFFSETS
 169 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 170 #endif
 171 #define RExC_emit       (pRExC_state->emit)
 172 #define RExC_emit_start (pRExC_state->emit_start)
 173 #define RExC_emit_bound (pRExC_state->emit_bound)
 174 #define RExC_naughty    (pRExC_state->naughty)
 175 #define RExC_sawback    (pRExC_state->sawback)
 176 #define RExC_seen       (pRExC_state->seen)
 177 #define RExC_size       (pRExC_state->size)
 178 #define RExC_npar       (pRExC_state->npar)
 179 #define RExC_nestroot   (pRExC_state->nestroot)
 180 #define RExC_extralen   (pRExC_state->extralen)
 181 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 182 #define RExC_seen_evals (pRExC_state->seen_evals)
 183 #define RExC_utf8       (pRExC_state->utf8)
 184 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 185 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 186 #define RExC_open_parens        (pRExC_state->open_parens)
 187 #define RExC_close_parens       (pRExC_state->close_parens)
 188 #define RExC_opend      (pRExC_state->opend)
 189 #define RExC_paren_names        (pRExC_state->paren_names)
 190 #define RExC_recurse    (pRExC_state->recurse)
 191 #define RExC_recurse_count      (pRExC_state->recurse_count)
 192 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 193
 194
 195 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 196 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 197         ((*s) == '{' && regcurly(s)))
 198
 199 #ifdef SPSTART
 200 #undef SPSTART          /* dratted cpp namespace... */
 201 #endif
 202 /*
 203  * Flags to be passed up and down.
 204  */
 205 #define WORST           0       /* Worst case. */
 206 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 207
 208 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 209  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 210 #define SIMPLE          0x02
 211 #define SPSTART         0x04    /* Starts with * or +. */
 212 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 213 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 214
 215 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 216
 217 /* whether trie related optimizations are enabled */
 218 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 219 #define TRIE_STUDY_OPT
 220 #define FULL_TRIE_STUDY
 221 #define TRIE_STCLASS
 222 #endif
 223
 224
 225
 226 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 227 #define PBITVAL(paren) (1 << ((paren) & 7))
 228 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 229 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 230 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 231
 232 /* If not already in utf8, do a longjmp back to the beginning */
 233 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 234 #define REQUIRE_UTF8    STMT_START {                                       \
 235                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 236                         } STMT_END
 237
 238 /* About scan_data_t.
 239
 240   During optimisation we recurse through the regexp program performing
 241   various inplace (keyhole style) optimisations. In addition study_chunk
 242   and scan_commit populate this data structure with information about
 243   what strings MUST appear in the pattern. We look for the longest
 244   string that must appear at a fixed location, and we look for the
 245   longest string that may appear at a floating location. So for instance
 246   in the pattern:
 247
 248     /FOO[xX]A.*B[xX]BAR/
 249
 250   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 251   strings (because they follow a .* construct). study_chunk will identify
 252   both FOO and BAR as being the longest fixed and floating strings respectively.
 253
 254   The strings can be composites, for instance
 255
 256      /(f)(o)(o)/
 257
 258   will result in a composite fixed substring 'foo'.
 259
 260   For each string some basic information is maintained:
 261
 262   - offset or min_offset
 263     This is the position the string must appear at, or not before.
 264     It also implicitly (when combined with minlenp) tells us how many
 265     characters must match before the string we are searching for.
 266     Likewise when combined with minlenp and the length of the string it
 267     tells us how many characters must appear after the string we have
 268     found.
 269
 270   - max_offset
 271     Only used for floating strings. This is the rightmost point that
 272     the string can appear at. If set to I32 max it indicates that the
 273     string can occur infinitely far to the right.
 274
 275   - minlenp
 276     A pointer to the minimum length of the pattern that the string
 277     was found inside. This is important as in the case of positive
 278     lookahead or positive lookbehind we can have multiple patterns
 279     involved. Consider
 280
 281     /(?=FOO).*F/
 282
 283     The minimum length of the pattern overall is 3, the minimum length
 284     of the lookahead part is 3, but the minimum length of the part that
 285     will actually match is 1. So 'FOO's minimum length is 3, but the
 286     minimum length for the F is 1. This is important as the minimum length
 287     is used to determine offsets in front of and behind the string being
 288     looked for.  Since strings can be composites this is the length of the
 289     pattern at the time it was committed with a scan_commit. Note that
 290     the length is calculated by study_chunk, so that the minimum lengths
 291     are not known until the full pattern has been compiled, thus the
 292     pointer to the value.
 293
 294   - lookbehind
 295
 296     In the case of lookbehind the string being searched for can be
 297     offset past the start point of the final matching string.
 298     If this value was just blithely removed from the min_offset it would
 299     invalidate some of the calculations for how many chars must match
 300     before or after (as they are derived from min_offset and minlen and
 301     the length of the string being searched for).
 302     When the final pattern is compiled and the data is moved from the
 303     scan_data_t structure into the regexp structure the information
 304     about lookbehind is factored in, with the information that would
 305     have been lost precalculated in the end_shift field for the
 306     associated string.
 307
 308   The fields pos_min and pos_delta are used to store the minimum offset
 309   and the delta to the maximum offset at the current point in the pattern.
 310
 311 */
 312
 313 typedef struct scan_data_t {
 314     /*I32 len_min;      unused */
 315     /*I32 len_delta;    unused */
 316     I32 pos_min;
 317     I32 pos_delta;
 318     SV *last_found;
 319     I32 last_end;           /* min value, <0 unless valid. */
 320     I32 last_start_min;
 321     I32 last_start_max;
 322     SV **longest;           /* Either &l_fixed, or &l_float. */
 323     SV *longest_fixed;      /* longest fixed string found in pattern */
 324     I32 offset_fixed;       /* offset where it starts */
 325     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 326     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 327     SV *longest_float;      /* longest floating string found in pattern */
 328     I32 offset_float_min;   /* earliest point in string it can appear */
 329     I32 offset_float_max;   /* latest point in string it can appear */
 330     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 331     I32 lookbehind_float;   /* is the position of the string modified by LB */
 332     I32 flags;
 333     I32 whilem_c;
 334     I32 *last_closep;
 335     struct regnode_charclass_class *start_class;
 336 } scan_data_t;
 337
 338 /*
 339  * Forward declarations for pregcomp()'s friends.
 340  */
 341
 342 static const scan_data_t zero_scan_data =
 343   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 344
 345 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 346 #define SF_BEFORE_SEOL          0x0001
 347 #define SF_BEFORE_MEOL          0x0002
 348 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 349 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 350
 351 #ifdef NO_UNARY_PLUS
 352 #  define SF_FIX_SHIFT_EOL      (0+2)
 353 #  define SF_FL_SHIFT_EOL               (0+4)
 354 #else
 355 #  define SF_FIX_SHIFT_EOL      (+2)
 356 #  define SF_FL_SHIFT_EOL               (+4)
 357 #endif
 358
 359 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 360 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 361
 362 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 363 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 364 #define SF_IS_INF               0x0040
 365 #define SF_HAS_PAR              0x0080
 366 #define SF_IN_PAR               0x0100
 367 #define SF_HAS_EVAL             0x0200
 368 #define SCF_DO_SUBSTR           0x0400
 369 #define SCF_DO_STCLASS_AND      0x0800
 370 #define SCF_DO_STCLASS_OR       0x1000
 371 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 372 #define SCF_WHILEM_VISITED_POS  0x2000
 373
 374 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 375 #define SCF_SEEN_ACCEPT         0x8000
 376
 377 #define UTF cBOOL(RExC_utf8)
 378 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 379 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 380 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 381 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 382 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 383 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 384 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 385
 386 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 387
 388 #define OOB_UNICODE             12345678
 389 #define OOB_NAMEDCLASS          -1
 390
 391 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 392 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 393
 394
 395 /* length of regex to show in messages that don't mark a position within */
 396 #define RegexLengthToShowInErrorMessages 127
 397
 398 /*
 399  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 400  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 401  * op/pragma/warn/regcomp.
 402  */
 403 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 404 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 405
 406 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 407
 408 /*
 409  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 410  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 411  * "...".
 412  */
 413 #define _FAIL(code) STMT_START {                                        \
 414     const char *ellipses = "";                                          \
 415     IV len = RExC_end - RExC_precomp;                                   \
 416                                                                         \
 417     if (!SIZE_ONLY)                                                     \
 418         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 419     if (len > RegexLengthToShowInErrorMessages) {                       \
 420         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 421         len = RegexLengthToShowInErrorMessages - 10;                    \
 422         ellipses = "...";                                               \
 423     }                                                                   \
 424     code;                                                               \
 425 } STMT_END
 426
 427 #define FAIL(msg) _FAIL(                            \
 428     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 429             msg, (int)len, RExC_precomp, ellipses))
 430
 431 #define FAIL2(msg,arg) _FAIL(                       \
 432     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 433             arg, (int)len, RExC_precomp, ellipses))
 434
 435 /*
 436  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 437  */
 438 #define Simple_vFAIL(m) STMT_START {                                    \
 439     const IV offset = RExC_parse - RExC_precomp;                        \
 440     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 441             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 442 } STMT_END
 443
 444 /*
 445  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 446  */
 447 #define vFAIL(m) STMT_START {                           \
 448     if (!SIZE_ONLY)                                     \
 449         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 450     Simple_vFAIL(m);                                    \
 451 } STMT_END
 452
 453 /*
 454  * Like Simple_vFAIL(), but accepts two arguments.
 455  */
 456 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 457     const IV offset = RExC_parse - RExC_precomp;                        \
 458     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 459             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 460 } STMT_END
 461
 462 /*
 463  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 464  */
 465 #define vFAIL2(m,a1) STMT_START {                       \
 466     if (!SIZE_ONLY)                                     \
 467         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 468     Simple_vFAIL2(m, a1);                               \
 469 } STMT_END
 470
 471
 472 /*
 473  * Like Simple_vFAIL(), but accepts three arguments.
 474  */
 475 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 476     const IV offset = RExC_parse - RExC_precomp;                \
 477     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 478             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 479 } STMT_END
 480
 481 /*
 482  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 483  */
 484 #define vFAIL3(m,a1,a2) STMT_START {                    \
 485     if (!SIZE_ONLY)                                     \
 486         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 487     Simple_vFAIL3(m, a1, a2);                           \
 488 } STMT_END
 489
 490 /*
 491  * Like Simple_vFAIL(), but accepts four arguments.
 492  */
 493 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 494     const IV offset = RExC_parse - RExC_precomp;                \
 495     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 497 } STMT_END
 498
 499 #define ckWARNreg(loc,m) STMT_START {                                   \
 500     const IV offset = loc - RExC_precomp;                               \
 501     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 502             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 503 } STMT_END
 504
 505 #define ckWARNregdep(loc,m) STMT_START {                                \
 506     const IV offset = loc - RExC_precomp;                               \
 507     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 508             m REPORT_LOCATION,                                          \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 515             m REPORT_LOCATION,                                          \
 516             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 517 } STMT_END
 518
 519 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 522             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 523 } STMT_END
 524
 525 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 526     const IV offset = loc - RExC_precomp;                               \
 527     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 528             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 529 } STMT_END
 530
 531 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 534             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 535 } STMT_END
 536
 537 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 538     const IV offset = loc - RExC_precomp;                               \
 539     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 540             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 541 } STMT_END
 542
 543 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 544     const IV offset = loc - RExC_precomp;                               \
 545     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 546             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 547 } STMT_END
 548
 549 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 550     const IV offset = loc - RExC_precomp;                               \
 551     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 552             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 553 } STMT_END
 554
 555
 556 /* Allow for side effects in s */
 557 #define REGC(c,s) STMT_START {                  \
 558     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 559 } STMT_END
 560
 561 /* Macros for recording node offsets.   20001227 mjd@plover.com
 562  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 563  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 564  * Element 0 holds the number n.
 565  * Position is 1 indexed.
 566  */
 567 #ifndef RE_TRACK_PATTERN_OFFSETS
 568 #define Set_Node_Offset_To_R(node,byte)
 569 #define Set_Node_Offset(node,byte)
 570 #define Set_Cur_Node_Offset
 571 #define Set_Node_Length_To_R(node,len)
 572 #define Set_Node_Length(node,len)
 573 #define Set_Node_Cur_Length(node)
 574 #define Node_Offset(n)
 575 #define Node_Length(n)
 576 #define Set_Node_Offset_Length(node,offset,len)
 577 #define ProgLen(ri) ri->u.proglen
 578 #define SetProgLen(ri,x) ri->u.proglen = x
 579 #else
 580 #define ProgLen(ri) ri->u.offsets[0]
 581 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 582 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 583     if (! SIZE_ONLY) {                                                  \
 584         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 585                     __LINE__, (int)(node), (int)(byte)));               \
 586         if((node) < 0) {                                                \
 587             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 588         } else {                                                        \
 589             RExC_offsets[2*(node)-1] = (byte);                          \
 590         }                                                               \
 591     }                                                                   \
 592 } STMT_END
 593
 594 #define Set_Node_Offset(node,byte) \
 595     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 596 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 597
 598 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 599     if (! SIZE_ONLY) {                                                  \
 600         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 601                 __LINE__, (int)(node), (int)(len)));                    \
 602         if((node) < 0) {                                                \
 603             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 604         } else {                                                        \
 605             RExC_offsets[2*(node)] = (len);                             \
 606         }                                                               \
 607     }                                                                   \
 608 } STMT_END
 609
 610 #define Set_Node_Length(node,len) \
 611     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 612 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 613 #define Set_Node_Cur_Length(node) \
 614     Set_Node_Length(node, RExC_parse - parse_start)
 615
 616 /* Get offsets and lengths */
 617 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 618 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 619
 620 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 621     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 622     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 623 } STMT_END
 624 #endif
 625
 626 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 627 #define EXPERIMENTAL_INPLACESCAN
 628 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 629
 630 #define DEBUG_STUDYDATA(str,data,depth)                              \
 631 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 632     PerlIO_printf(Perl_debug_log,                                    \
 633         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 634         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 635         (int)(depth)*2, "",                                          \
 636         (IV)((data)->pos_min),                                       \
 637         (IV)((data)->pos_delta),                                     \
 638         (UV)((data)->flags),                                         \
 639         (IV)((data)->whilem_c),                                      \
 640         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 641         is_inf ? "INF " : ""                                         \
 642     );                                                               \
 643     if ((data)->last_found)                                          \
 644         PerlIO_printf(Perl_debug_log,                                \
 645             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 646             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 647             SvPVX_const((data)->last_found),                         \
 648             (IV)((data)->last_end),                                  \
 649             (IV)((data)->last_start_min),                            \
 650             (IV)((data)->last_start_max),                            \
 651             ((data)->longest &&                                      \
 652              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 653             SvPVX_const((data)->longest_fixed),                      \
 654             (IV)((data)->offset_fixed),                              \
 655             ((data)->longest &&                                      \
 656              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 657             SvPVX_const((data)->longest_float),                      \
 658             (IV)((data)->offset_float_min),                          \
 659             (IV)((data)->offset_float_max)                           \
 660         );                                                           \
 661     PerlIO_printf(Perl_debug_log,"\n");                              \
 662 });
 663
 664 static void clear_re(pTHX_ void *r);
 665
 666 /* Mark that we cannot extend a found fixed substring at this point.
 667    Update the longest found anchored substring and the longest found
 668    floating substrings if needed. */
 669
 670 STATIC void
 671 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 672 {
 673     const STRLEN l = CHR_SVLEN(data->last_found);
 674     const STRLEN old_l = CHR_SVLEN(*data->longest);
 675     GET_RE_DEBUG_FLAGS_DECL;
 676
 677     PERL_ARGS_ASSERT_SCAN_COMMIT;
 678
 679     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 680         SvSetMagicSV(*data->longest, data->last_found);
 681         if (*data->longest == data->longest_fixed) {
 682             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 683             if (data->flags & SF_BEFORE_EOL)
 684                 data->flags
 685                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 686             else
 687                 data->flags &= ~SF_FIX_BEFORE_EOL;
 688             data->minlen_fixed=minlenp;
 689             data->lookbehind_fixed=0;
 690         }
 691         else { /* *data->longest == data->longest_float */
 692             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 693             data->offset_float_max = (l
 694                                       ? data->last_start_max
 695                                       : data->pos_min + data->pos_delta);
 696             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 697                 data->offset_float_max = I32_MAX;
 698             if (data->flags & SF_BEFORE_EOL)
 699                 data->flags
 700                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 701             else
 702                 data->flags &= ~SF_FL_BEFORE_EOL;
 703             data->minlen_float=minlenp;
 704             data->lookbehind_float=0;
 705         }
 706     }
 707     SvCUR_set(data->last_found, 0);
 708     {
 709         SV * const sv = data->last_found;
 710         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 711             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 712             if (mg)
 713                 mg->mg_len = 0;
 714         }
 715     }
 716     data->last_end = -1;
 717     data->flags &= ~SF_BEFORE_EOL;
 718     DEBUG_STUDYDATA("commit: ",data,0);
 719 }
 720
 721 /* Can match anything (initialization) */
 722 STATIC void
 723 S_cl_anything(struct regnode_charclass_class *cl)
 724 {
 725     PERL_ARGS_ASSERT_CL_ANYTHING;
 726
 727     ANYOF_CLASS_ZERO(cl);
 728     ANYOF_BITMAP_SETALL(cl);
 729     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE;
 730 }
 731
 732 /* Can match anything (initialization) */
 733 STATIC int
 734 S_cl_is_anything(const struct regnode_charclass_class *cl)
 735 {
 736     int value;
 737
 738     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 739
 740     for (value = 0; value <= ANYOF_MAX; value += 2)
 741         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 742             return 1;
 743     if (!(cl->flags & ANYOF_UNICODE_ALL))
 744         return 0;
 745     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 746         return 0;
 747     return 1;
 748 }
 749
 750 /* Can match anything (initialization) */
 751 STATIC void
 752 S_cl_init(struct regnode_charclass_class *cl)
 753 {
 754     PERL_ARGS_ASSERT_CL_INIT;
 755
 756     Zero(cl, 1, struct regnode_charclass_class);
 757     cl->type = ANYOF;
 758     cl_anything(cl);
 759     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 760 }
 761
 762 /* These two functions currently do the exact same thing */
 763 #define cl_init_zero            S_cl_init
 764
 765 /* 'And' a given class with another one.  Can create false positives */
 766 /* cl should not be inverted */
 767 STATIC void
 768 S_cl_and(struct regnode_charclass_class *cl,
 769         const struct regnode_charclass_class *and_with)
 770 {
 771     PERL_ARGS_ASSERT_CL_AND;
 772
 773     assert(and_with->type == ANYOF);
 774
 775     /* I (khw) am not sure all these restrictions are necessary XXX */
 776     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 777         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 778         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 779         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 780         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 781         int i;
 782
 783         if (and_with->flags & ANYOF_INVERT)
 784             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 785                 cl->bitmap[i] &= ~and_with->bitmap[i];
 786         else
 787             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 788                 cl->bitmap[i] &= and_with->bitmap[i];
 789     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 790
 791     if (and_with->flags & ANYOF_INVERT) {
 792
 793         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 794          * aren't affected by the inversion.  Those that are affected are
 795          * handled individually below */
 796         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 797         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 798         cl->flags |= affected_flags;
 799
 800         /* We currently don't know how to deal with things that aren't in the
 801          * bitmap, but we know that the intersection is no greater than what
 802          * is already in cl, so let there be false positives that get sorted
 803          * out after the synthetic start class succeeds, and the node is
 804          * matched for real. */
 805
 806         /* The inversion of these two flags indicate that the resulting
 807          * intersection doesn't have them */
 808         if (and_with->flags & ANYOF_UNICODE_ALL) {
 809             cl->flags &= ~ANYOF_UNICODE_ALL;
 810         }
 811         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 812             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 813         }
 814     }
 815     else {   /* and'd node is not inverted */
 816         if (! ANYOF_NONBITMAP(and_with)) {
 817
 818             /* Here 'and_with' doesn't match anything outside the bitmap
 819              * (except possibly ANYOF_UNICODE_ALL), which means the
 820              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 821              * which case we don't know what the intersection is, but it's no
 822              * greater than what cl already has, so can just leave it alone,
 823              * with possible false positives */
 824             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 825                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 826             }
 827         }
 828         else if (! ANYOF_NONBITMAP(cl)) {
 829
 830             /* Here, 'and_with' does match something outside the bitmap, and cl
 831              * doesn't have a list of things to match outside the bitmap.  If
 832              * cl can match all code points above 255, the intersection will
 833              * be those above-255 code points that 'and_with' matches.  There
 834              * may be false positives from code points in 'and_with' that are
 835              * outside the bitmap but below 256, but those get sorted out
 836              * after the synthetic start class succeeds).  If cl can't match
 837              * all Unicode code points, it means here that it can't match *
 838              * anything outside the bitmap, so we leave the bitmap empty */
 839             if (cl->flags & ANYOF_UNICODE_ALL) {
 840                 ARG_SET(cl, ARG(and_with));
 841             }
 842         }
 843         else {
 844             /* Here, both 'and_with' and cl match something outside the
 845              * bitmap.  Currently we do not do the intersection, so just match
 846              * whatever cl had at the beginning.  */
 847         }
 848
 849
 850         /* Take the intersection of the two sets of flags */
 851         cl->flags &= and_with->flags;
 852     }
 853 }
 854
 855 /* 'OR' a given class with another one.  Can create false positives */
 856 /* cl should not be inverted */
 857 STATIC void
 858 S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 859 {
 860     PERL_ARGS_ASSERT_CL_OR;
 861
 862     if (or_with->flags & ANYOF_INVERT) {
 863
 864         /* Here, the or'd node is to be inverted.  This means we take the
 865          * complement of everything not in the bitmap, but currently we don't
 866          * know what that is, so give up and match anything */
 867         if (ANYOF_NONBITMAP(or_with)) {
 868             cl_anything(cl);
 869         }
 870         /* We do not use
 871          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 872          *   <= (B1 | !B2) | (CL1 | !CL2)
 873          * which is wasteful if CL2 is small, but we ignore CL2:
 874          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 875          * XXXX Can we handle case-fold?  Unclear:
 876          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 877          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 878          */
 879         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 880              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 881              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 882             int i;
 883
 884             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 885                 cl->bitmap[i] |= ~or_with->bitmap[i];
 886         } /* XXXX: logic is complicated otherwise */
 887         else {
 888             cl_anything(cl);
 889         }
 890
 891         /* And, we can just take the union of the flags that aren't affected
 892          * by the inversion */
 893         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 894
 895         /* For the remaining flags:
 896             ANYOF_UNICODE_ALL and inverted means to not match anything above
 897                     255, which means that the union with cl should just be
 898                     what cl has in it, so can ignore this flag
 899             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 900                     is 127-255 to match them, but then invert that, so the
 901                     union with cl should just be what cl has in it, so can
 902                     ignore this flag
 903          */
 904     } else {    /* 'or_with' is not inverted */
 905         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 906         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 907              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 908                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 909             int i;
 910
 911             /* OR char bitmap and class bitmap separately */
 912             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 913                 cl->bitmap[i] |= or_with->bitmap[i];
 914             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 915                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 916                     cl->classflags[i] |= or_with->classflags[i];
 917                 cl->flags |= ANYOF_CLASS;
 918             }
 919         }
 920         else { /* XXXX: logic is complicated, leave it along for a moment. */
 921             cl_anything(cl);
 922         }
 923
 924         /* Take the union */
 925         cl->flags |= or_with->flags;
 926
 927         if (ANYOF_NONBITMAP(or_with)) {
 928
 929             /* Use the added node's outside-the-bit-map match if there isn't a
 930              * conflict.  If there is a conflict (both nodes match something
 931              * outside the bitmap, but what they match outside is not the same
 932              * pointer, and hence not easily compared until XXX we extend
 933              * inversion lists this far), give up and allow the start class to
 934              * match everything outside the bitmap */
 935             if (! ANYOF_NONBITMAP(cl)) {
 936                 ARG_SET(cl, ARG(or_with));
 937             }
 938             else if (ARG(cl) != ARG(or_with)) {
 939                 cl->flags |= ANYOF_UNICODE_ALL;
 940             }
 941         }
 942     }
 943 }
 944
 945 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 946 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 947 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 948 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 949
 950
 951 #ifdef DEBUGGING
 952 /*
 953    dump_trie(trie,widecharmap,revcharmap)
 954    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 955    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 956
 957    These routines dump out a trie in a somewhat readable format.
 958    The _interim_ variants are used for debugging the interim
 959    tables that are used to generate the final compressed
 960    representation which is what dump_trie expects.
 961
 962    Part of the reason for their existence is to provide a form
 963    of documentation as to how the different representations function.
 964
 965 */
 966
 967 /*
 968   Dumps the final compressed table form of the trie to Perl_debug_log.
 969   Used for debugging make_trie().
 970 */
 971
 972 STATIC void
 973 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 974             AV *revcharmap, U32 depth)
 975 {
 976     U32 state;
 977     SV *sv=sv_newmortal();
 978     int colwidth= widecharmap ? 6 : 4;
 979     U16 word;
 980     GET_RE_DEBUG_FLAGS_DECL;
 981
 982     PERL_ARGS_ASSERT_DUMP_TRIE;
 983
 984     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 985         (int)depth * 2 + 2,"",
 986         "Match","Base","Ofs" );
 987
 988     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 989         SV ** const tmp = av_fetch( revcharmap, state, 0);
 990         if ( tmp ) {
 991             PerlIO_printf( Perl_debug_log, "%*s",
 992                 colwidth,
 993                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 994                             PL_colors[0], PL_colors[1],
 995                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 996                             PERL_PV_ESCAPE_FIRSTCHAR
 997                 )
 998             );
 999         }
1000     }
1001     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1002         (int)depth * 2 + 2,"");
1003
1004     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1005         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1006     PerlIO_printf( Perl_debug_log, "\n");
1007
1008     for( state = 1 ; state < trie->statecount ; state++ ) {
1009         const U32 base = trie->states[ state ].trans.base;
1010
1011         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1012
1013         if ( trie->states[ state ].wordnum ) {
1014             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1015         } else {
1016             PerlIO_printf( Perl_debug_log, "%6s", "" );
1017         }
1018
1019         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1020
1021         if ( base ) {
1022             U32 ofs = 0;
1023
1024             while( ( base + ofs  < trie->uniquecharcount ) ||
1025                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1026                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1027                     ofs++;
1028
1029             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1030
1031             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1032                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1033                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1034                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1035                 {
1036                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1037                     colwidth,
1038                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1039                 } else {
1040                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1041                 }
1042             }
1043
1044             PerlIO_printf( Perl_debug_log, "]");
1045
1046         }
1047         PerlIO_printf( Perl_debug_log, "\n" );
1048     }
1049     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1050     for (word=1; word <= trie->wordcount; word++) {
1051         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1052             (int)word, (int)(trie->wordinfo[word].prev),
1053             (int)(trie->wordinfo[word].len));
1054     }
1055     PerlIO_printf(Perl_debug_log, "\n" );
1056 }
1057 /*
1058   Dumps a fully constructed but uncompressed trie in list form.
1059   List tries normally only are used for construction when the number of
1060   possible chars (trie->uniquecharcount) is very high.
1061   Used for debugging make_trie().
1062 */
1063 STATIC void
1064 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1065                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1066                          U32 depth)
1067 {
1068     U32 state;
1069     SV *sv=sv_newmortal();
1070     int colwidth= widecharmap ? 6 : 4;
1071     GET_RE_DEBUG_FLAGS_DECL;
1072
1073     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1074
1075     /* print out the table precompression.  */
1076     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1077         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1078         "------:-----+-----------------\n" );
1079
1080     for( state=1 ; state < next_alloc ; state ++ ) {
1081         U16 charid;
1082
1083         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1084             (int)depth * 2 + 2,"", (UV)state  );
1085         if ( ! trie->states[ state ].wordnum ) {
1086             PerlIO_printf( Perl_debug_log, "%5s| ","");
1087         } else {
1088             PerlIO_printf( Perl_debug_log, "W%4x| ",
1089                 trie->states[ state ].wordnum
1090             );
1091         }
1092         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1093             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1094             if ( tmp ) {
1095                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1096                     colwidth,
1097                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1098                             PL_colors[0], PL_colors[1],
1099                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1100                             PERL_PV_ESCAPE_FIRSTCHAR
1101                     ) ,
1102                     TRIE_LIST_ITEM(state,charid).forid,
1103                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1104                 );
1105                 if (!(charid % 10))
1106                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1107                         (int)((depth * 2) + 14), "");
1108             }
1109         }
1110         PerlIO_printf( Perl_debug_log, "\n");
1111     }
1112 }
1113
1114 /*
1115   Dumps a fully constructed but uncompressed trie in table form.
1116   This is the normal DFA style state transition table, with a few
1117   twists to facilitate compression later.
1118   Used for debugging make_trie().
1119 */
1120 STATIC void
1121 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1122                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1123                           U32 depth)
1124 {
1125     U32 state;
1126     U16 charid;
1127     SV *sv=sv_newmortal();
1128     int colwidth= widecharmap ? 6 : 4;
1129     GET_RE_DEBUG_FLAGS_DECL;
1130
1131     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1132
1133     /*
1134        print out the table precompression so that we can do a visual check
1135        that they are identical.
1136      */
1137
1138     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1139
1140     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1141         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1142         if ( tmp ) {
1143             PerlIO_printf( Perl_debug_log, "%*s",
1144                 colwidth,
1145                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1146                             PL_colors[0], PL_colors[1],
1147                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1148                             PERL_PV_ESCAPE_FIRSTCHAR
1149                 )
1150             );
1151         }
1152     }
1153
1154     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1155
1156     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1157         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1158     }
1159
1160     PerlIO_printf( Perl_debug_log, "\n" );
1161
1162     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1163
1164         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1165             (int)depth * 2 + 2,"",
1166             (UV)TRIE_NODENUM( state ) );
1167
1168         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1169             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1170             if (v)
1171                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1172             else
1173                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1174         }
1175         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1176             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1177         } else {
1178             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1179             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1180         }
1181     }
1182 }
1183
1184 #endif
1185
1186
1187 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1188   startbranch: the first branch in the whole branch sequence
1189   first      : start branch of sequence of branch-exact nodes.
1190                May be the same as startbranch
1191   last       : Thing following the last branch.
1192                May be the same as tail.
1193   tail       : item following the branch sequence
1194   count      : words in the sequence
1195   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1196   depth      : indent depth
1197
1198 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1199
1200 A trie is an N'ary tree where the branches are determined by digital
1201 decomposition of the key. IE, at the root node you look up the 1st character and
1202 follow that branch repeat until you find the end of the branches. Nodes can be
1203 marked as "accepting" meaning they represent a complete word. Eg:
1204
1205   /he|she|his|hers/
1206
1207 would convert into the following structure. Numbers represent states, letters
1208 following numbers represent valid transitions on the letter from that state, if
1209 the number is in square brackets it represents an accepting state, otherwise it
1210 will be in parenthesis.
1211
1212       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1213       |    |
1214       |   (2)
1215       |    |
1216      (1)   +-i->(6)-+-s->[7]
1217       |
1218       +-s->(3)-+-h->(4)-+-e->[5]
1219
1220       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1221
1222 This shows that when matching against the string 'hers' we will begin at state 1
1223 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1224 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1225 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1226 single traverse. We store a mapping from accepting to state to which word was
1227 matched, and then when we have multiple possibilities we try to complete the
1228 rest of the regex in the order in which they occured in the alternation.
1229
1230 The only prior NFA like behaviour that would be changed by the TRIE support is
1231 the silent ignoring of duplicate alternations which are of the form:
1232
1233  / (DUPE|DUPE) X? (?{ ... }) Y /x
1234
1235 Thus EVAL blocks following a trie may be called a different number of times with
1236 and without the optimisation. With the optimisations dupes will be silently
1237 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1238 the following demonstrates:
1239
1240  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1241
1242 which prints out 'word' three times, but
1243
1244  'words'=~/(word|word|word)(?{ print $1 })S/
1245
1246 which doesnt print it out at all. This is due to other optimisations kicking in.
1247
1248 Example of what happens on a structural level:
1249
1250 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1251
1252    1: CURLYM[1] {1,32767}(18)
1253    5:   BRANCH(8)
1254    6:     EXACT <ac>(16)
1255    8:   BRANCH(11)
1256    9:     EXACT <ad>(16)
1257   11:   BRANCH(14)
1258   12:     EXACT <ab>(16)
1259   16:   SUCCEED(0)
1260   17:   NOTHING(18)
1261   18: END(0)
1262
1263 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1264 and should turn into:
1265
1266    1: CURLYM[1] {1,32767}(18)
1267    5:   TRIE(16)
1268         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1269           <ac>
1270           <ad>
1271           <ab>
1272   16:   SUCCEED(0)
1273   17:   NOTHING(18)
1274   18: END(0)
1275
1276 Cases where tail != last would be like /(?foo|bar)baz/:
1277
1278    1: BRANCH(4)
1279    2:   EXACT <foo>(8)
1280    4: BRANCH(7)
1281    5:   EXACT <bar>(8)
1282    7: TAIL(8)
1283    8: EXACT <baz>(10)
1284   10: END(0)
1285
1286 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1287 and would end up looking like:
1288
1289     1: TRIE(8)
1290       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1291         <foo>
1292         <bar>
1293    7: TAIL(8)
1294    8: EXACT <baz>(10)
1295   10: END(0)
1296
1297     d = uvuni_to_utf8_flags(d, uv, 0);
1298
1299 is the recommended Unicode-aware way of saying
1300
1301     *(d++) = uv;
1302 */
1303
1304 #define TRIE_STORE_REVCHAR                                                 \
1305     STMT_START {                                                           \
1306         if (UTF) {                                                         \
1307             SV *zlopp = newSV(2);                                          \
1308             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1309             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1310             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1311             SvPOK_on(zlopp);                                               \
1312             SvUTF8_on(zlopp);                                              \
1313             av_push(revcharmap, zlopp);                                    \
1314         } else {                                                           \
1315             char ooooff = (char)uvc;                                               \
1316             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1317         }                                                                  \
1318         } STMT_END
1319
1320 #define TRIE_READ_CHAR STMT_START {                                           \
1321     wordlen++;                                                                \
1322     if ( UTF ) {                                                              \
1323         if ( folder ) {                                                       \
1324             if ( foldlen > 0 ) {                                              \
1325                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1326                foldlen -= len;                                                \
1327                scan += len;                                                   \
1328                len = 0;                                                       \
1329             } else {                                                          \
1330                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1331                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1332                 foldlen -= UNISKIP( uvc );                                    \
1333                 scan = foldbuf + UNISKIP( uvc );                              \
1334             }                                                                 \
1335         } else {                                                              \
1336             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1337         }                                                                     \
1338     } else {                                                                  \
1339         uvc = (U32)*uc;                                                       \
1340         len = 1;                                                              \
1341     }                                                                         \
1342 } STMT_END
1343
1344
1345
1346 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1347     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1348         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1349         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1350     }                                                           \
1351     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1352     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1353     TRIE_LIST_CUR( state )++;                                   \
1354 } STMT_END
1355
1356 #define TRIE_LIST_NEW(state) STMT_START {                       \
1357     Newxz( trie->states[ state ].trans.list,               \
1358         4, reg_trie_trans_le );                                 \
1359      TRIE_LIST_CUR( state ) = 1;                                \
1360      TRIE_LIST_LEN( state ) = 4;                                \
1361 } STMT_END
1362
1363 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1364     U16 dupe= trie->states[ state ].wordnum;                    \
1365     regnode * const noper_next = regnext( noper );              \
1366                                                                 \
1367     DEBUG_r({                                                   \
1368         /* store the word for dumping */                        \
1369         SV* tmp;                                                \
1370         if (OP(noper) != NOTHING)                               \
1371             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1372         else                                                    \
1373             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1374         av_push( trie_words, tmp );                             \
1375     });                                                         \
1376                                                                 \
1377     curword++;                                                  \
1378     trie->wordinfo[curword].prev   = 0;                         \
1379     trie->wordinfo[curword].len    = wordlen;                   \
1380     trie->wordinfo[curword].accept = state;                     \
1381                                                                 \
1382     if ( noper_next < tail ) {                                  \
1383         if (!trie->jump)                                        \
1384             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1385         trie->jump[curword] = (U16)(noper_next - convert);      \
1386         if (!jumper)                                            \
1387             jumper = noper_next;                                \
1388         if (!nextbranch)                                        \
1389             nextbranch= regnext(cur);                           \
1390     }                                                           \
1391                                                                 \
1392     if ( dupe ) {                                               \
1393         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1394         /* chain, so that when the bits of chain are later    */\
1395         /* linked together, the dups appear in the chain      */\
1396         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1397         trie->wordinfo[dupe].prev = curword;                    \
1398     } else {                                                    \
1399         /* we haven't inserted this word yet.                */ \
1400         trie->states[ state ].wordnum = curword;                \
1401     }                                                           \
1402 } STMT_END
1403
1404
1405 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1406      ( ( base + charid >=  ucharcount                                   \
1407          && base + charid < ubound                                      \
1408          && state == trie->trans[ base - ucharcount + charid ].check    \
1409          && trie->trans[ base - ucharcount + charid ].next )            \
1410            ? trie->trans[ base - ucharcount + charid ].next             \
1411            : ( state==1 ? special : 0 )                                 \
1412       )
1413
1414 #define MADE_TRIE       1
1415 #define MADE_JUMP_TRIE  2
1416 #define MADE_EXACT_TRIE 4
1417
1418 STATIC I32
1419 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1420 {
1421     dVAR;
1422     /* first pass, loop through and scan words */
1423     reg_trie_data *trie;
1424     HV *widecharmap = NULL;
1425     AV *revcharmap = newAV();
1426     regnode *cur;
1427     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1428     STRLEN len = 0;
1429     UV uvc = 0;
1430     U16 curword = 0;
1431     U32 next_alloc = 0;
1432     regnode *jumper = NULL;
1433     regnode *nextbranch = NULL;
1434     regnode *convert = NULL;
1435     U32 *prev_states; /* temp array mapping each state to previous one */
1436     /* we just use folder as a flag in utf8 */
1437     const U8 * folder = NULL;
1438
1439 #ifdef DEBUGGING
1440     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1441     AV *trie_words = NULL;
1442     /* along with revcharmap, this only used during construction but both are
1443      * useful during debugging so we store them in the struct when debugging.
1444      */
1445 #else
1446     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1447     STRLEN trie_charcount=0;
1448 #endif
1449     SV *re_trie_maxbuff;
1450     GET_RE_DEBUG_FLAGS_DECL;
1451
1452     PERL_ARGS_ASSERT_MAKE_TRIE;
1453 #ifndef DEBUGGING
1454     PERL_UNUSED_ARG(depth);
1455 #endif
1456
1457     switch (flags) {
1458         case EXACTFA:
1459         case EXACTFU: folder = PL_fold_latin1; break;
1460         case EXACTF:  folder = PL_fold; break;
1461         case EXACTFL: folder = PL_fold_locale; break;
1462     }
1463
1464     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1465     trie->refcount = 1;
1466     trie->startstate = 1;
1467     trie->wordcount = word_count;
1468     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1469     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1470     if (!(UTF && folder))
1471         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1472     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1473                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1474
1475     DEBUG_r({
1476         trie_words = newAV();
1477     });
1478
1479     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1480     if (!SvIOK(re_trie_maxbuff)) {
1481         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1482     }
1483     DEBUG_OPTIMISE_r({
1484                 PerlIO_printf( Perl_debug_log,
1485                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1486                   (int)depth * 2 + 2, "",
1487                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1488                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1489                   (int)depth);
1490     });
1491
1492    /* Find the node we are going to overwrite */
1493     if ( first == startbranch && OP( last ) != BRANCH ) {
1494         /* whole branch chain */
1495         convert = first;
1496     } else {
1497         /* branch sub-chain */
1498         convert = NEXTOPER( first );
1499     }
1500
1501     /*  -- First loop and Setup --
1502
1503        We first traverse the branches and scan each word to determine if it
1504        contains widechars, and how many unique chars there are, this is
1505        important as we have to build a table with at least as many columns as we
1506        have unique chars.
1507
1508        We use an array of integers to represent the character codes 0..255
1509        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1510        native representation of the character value as the key and IV's for the
1511        coded index.
1512
1513        *TODO* If we keep track of how many times each character is used we can
1514        remap the columns so that the table compression later on is more
1515        efficient in terms of memory by ensuring the most common value is in the
1516        middle and the least common are on the outside.  IMO this would be better
1517        than a most to least common mapping as theres a decent chance the most
1518        common letter will share a node with the least common, meaning the node
1519        will not be compressible. With a middle is most common approach the worst
1520        case is when we have the least common nodes twice.
1521
1522      */
1523
1524     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1525         regnode * const noper = NEXTOPER( cur );
1526         const U8 *uc = (U8*)STRING( noper );
1527         const U8 * const e  = uc + STR_LEN( noper );
1528         STRLEN foldlen = 0;
1529         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1530         const U8 *scan = (U8*)NULL;
1531         U32 wordlen      = 0;         /* required init */
1532         STRLEN chars = 0;
1533         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1534
1535         if (OP(noper) == NOTHING) {
1536             trie->minlen= 0;
1537             continue;
1538         }
1539         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1540             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1541                                           regardless of encoding */
1542
1543         for ( ; uc < e ; uc += len ) {
1544             TRIE_CHARCOUNT(trie)++;
1545             TRIE_READ_CHAR;
1546             chars++;
1547             if ( uvc < 256 ) {
1548                 if ( !trie->charmap[ uvc ] ) {
1549                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1550                     if ( folder )
1551                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1552                     TRIE_STORE_REVCHAR;
1553                 }
1554                 if ( set_bit ) {
1555                     /* store the codepoint in the bitmap, and its folded
1556                      * equivalent. */
1557                     TRIE_BITMAP_SET(trie,uvc);
1558
1559                     /* store the folded codepoint */
1560                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1561
1562                     if ( !UTF ) {
1563                         /* store first byte of utf8 representation of
1564                            variant codepoints */
1565                         if (! UNI_IS_INVARIANT(uvc)) {
1566                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1567                         }
1568                     }
1569                     set_bit = 0; /* We've done our bit :-) */
1570                 }
1571             } else {
1572                 SV** svpp;
1573                 if ( !widecharmap )
1574                     widecharmap = newHV();
1575
1576                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1577
1578                 if ( !svpp )
1579                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1580
1581                 if ( !SvTRUE( *svpp ) ) {
1582                     sv_setiv( *svpp, ++trie->uniquecharcount );
1583                     TRIE_STORE_REVCHAR;
1584                 }
1585             }
1586         }
1587         if( cur == first ) {
1588             trie->minlen=chars;
1589             trie->maxlen=chars;
1590         } else if (chars < trie->minlen) {
1591             trie->minlen=chars;
1592         } else if (chars > trie->maxlen) {
1593             trie->maxlen=chars;
1594         }
1595
1596     } /* end first pass */
1597     DEBUG_TRIE_COMPILE_r(
1598         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1599                 (int)depth * 2 + 2,"",
1600                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1601                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1602                 (int)trie->minlen, (int)trie->maxlen )
1603     );
1604
1605     /*
1606         We now know what we are dealing with in terms of unique chars and
1607         string sizes so we can calculate how much memory a naive
1608         representation using a flat table  will take. If it's over a reasonable
1609         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1610         conservative but potentially much slower representation using an array
1611         of lists.
1612
1613         At the end we convert both representations into the same compressed
1614         form that will be used in regexec.c for matching with. The latter
1615         is a form that cannot be used to construct with but has memory
1616         properties similar to the list form and access properties similar
1617         to the table form making it both suitable for fast searches and
1618         small enough that its feasable to store for the duration of a program.
1619
1620         See the comment in the code where the compressed table is produced
1621         inplace from the flat tabe representation for an explanation of how
1622         the compression works.
1623
1624     */
1625
1626
1627     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1628     prev_states[1] = 0;
1629
1630     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1631         /*
1632             Second Pass -- Array Of Lists Representation
1633
1634             Each state will be represented by a list of charid:state records
1635             (reg_trie_trans_le) the first such element holds the CUR and LEN
1636             points of the allocated array. (See defines above).
1637
1638             We build the initial structure using the lists, and then convert
1639             it into the compressed table form which allows faster lookups
1640             (but cant be modified once converted).
1641         */
1642
1643         STRLEN transcount = 1;
1644
1645         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1646             "%*sCompiling trie using list compiler\n",
1647             (int)depth * 2 + 2, ""));
1648
1649         trie->states = (reg_trie_state *)
1650             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1651                                   sizeof(reg_trie_state) );
1652         TRIE_LIST_NEW(1);
1653         next_alloc = 2;
1654
1655         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1656
1657             regnode * const noper = NEXTOPER( cur );
1658             U8 *uc           = (U8*)STRING( noper );
1659             const U8 * const e = uc + STR_LEN( noper );
1660             U32 state        = 1;         /* required init */
1661             U16 charid       = 0;         /* sanity init */
1662             U8 *scan         = (U8*)NULL; /* sanity init */
1663             STRLEN foldlen   = 0;         /* required init */
1664             U32 wordlen      = 0;         /* required init */
1665             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1666
1667             if (OP(noper) != NOTHING) {
1668                 for ( ; uc < e ; uc += len ) {
1669
1670                     TRIE_READ_CHAR;
1671
1672                     if ( uvc < 256 ) {
1673                         charid = trie->charmap[ uvc ];
1674                     } else {
1675                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1676                         if ( !svpp ) {
1677                             charid = 0;
1678                         } else {
1679                             charid=(U16)SvIV( *svpp );
1680                         }
1681                     }
1682                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1683                     if ( charid ) {
1684
1685                         U16 check;
1686                         U32 newstate = 0;
1687
1688                         charid--;
1689                         if ( !trie->states[ state ].trans.list ) {
1690                             TRIE_LIST_NEW( state );
1691                         }
1692                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1693                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1694                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1695                                 break;
1696                             }
1697                         }
1698                         if ( ! newstate ) {
1699                             newstate = next_alloc++;
1700                             prev_states[newstate] = state;
1701                             TRIE_LIST_PUSH( state, charid, newstate );
1702                             transcount++;
1703                         }
1704                         state = newstate;
1705                     } else {
1706                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1707                     }
1708                 }
1709             }
1710             TRIE_HANDLE_WORD(state);
1711
1712         } /* end second pass */
1713
1714         /* next alloc is the NEXT state to be allocated */
1715         trie->statecount = next_alloc;
1716         trie->states = (reg_trie_state *)
1717             PerlMemShared_realloc( trie->states,
1718                                    next_alloc
1719                                    * sizeof(reg_trie_state) );
1720
1721         /* and now dump it out before we compress it */
1722         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1723                                                          revcharmap, next_alloc,
1724                                                          depth+1)
1725         );
1726
1727         trie->trans = (reg_trie_trans *)
1728             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1729         {
1730             U32 state;
1731             U32 tp = 0;
1732             U32 zp = 0;
1733
1734
1735             for( state=1 ; state < next_alloc ; state ++ ) {
1736                 U32 base=0;
1737
1738                 /*
1739                 DEBUG_TRIE_COMPILE_MORE_r(
1740                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1741                 );
1742                 */
1743
1744                 if (trie->states[state].trans.list) {
1745                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1746                     U16 maxid=minid;
1747                     U16 idx;
1748
1749                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1750                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1751                         if ( forid < minid ) {
1752                             minid=forid;
1753                         } else if ( forid > maxid ) {
1754                             maxid=forid;
1755                         }
1756                     }
1757                     if ( transcount < tp + maxid - minid + 1) {
1758                         transcount *= 2;
1759                         trie->trans = (reg_trie_trans *)
1760                             PerlMemShared_realloc( trie->trans,
1761                                                      transcount
1762                                                      * sizeof(reg_trie_trans) );
1763                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1764                     }
1765                     base = trie->uniquecharcount + tp - minid;
1766                     if ( maxid == minid ) {
1767                         U32 set = 0;
1768                         for ( ; zp < tp ; zp++ ) {
1769                             if ( ! trie->trans[ zp ].next ) {
1770                                 base = trie->uniquecharcount + zp - minid;
1771                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1772                                 trie->trans[ zp ].check = state;
1773                                 set = 1;
1774                                 break;
1775                             }
1776                         }
1777                         if ( !set ) {
1778                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1779                             trie->trans[ tp ].check = state;
1780                             tp++;
1781                             zp = tp;
1782                         }
1783                     } else {
1784                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1786                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1787                             trie->trans[ tid ].check = state;
1788                         }
1789                         tp += ( maxid - minid + 1 );
1790                     }
1791                     Safefree(trie->states[ state ].trans.list);
1792                 }
1793                 /*
1794                 DEBUG_TRIE_COMPILE_MORE_r(
1795                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1796                 );
1797                 */
1798                 trie->states[ state ].trans.base=base;
1799             }
1800             trie->lasttrans = tp + 1;
1801         }
1802     } else {
1803         /*
1804            Second Pass -- Flat Table Representation.
1805
1806            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1807            We know that we will need Charcount+1 trans at most to store the data
1808            (one row per char at worst case) So we preallocate both structures
1809            assuming worst case.
1810
1811            We then construct the trie using only the .next slots of the entry
1812            structs.
1813
1814            We use the .check field of the first entry of the node temporarily to
1815            make compression both faster and easier by keeping track of how many non
1816            zero fields are in the node.
1817
1818            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1819            transition.
1820
1821            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1822            number representing the first entry of the node, and state as a
1823            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1824            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1825            are 2 entrys per node. eg:
1826
1827              A B       A B
1828           1. 2 4    1. 3 7
1829           2. 0 3    3. 0 5
1830           3. 0 0    5. 0 0
1831           4. 0 0    7. 0 0
1832
1833            The table is internally in the right hand, idx form. However as we also
1834            have to deal with the states array which is indexed by nodenum we have to
1835            use TRIE_NODENUM() to convert.
1836
1837         */
1838         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1839             "%*sCompiling trie using table compiler\n",
1840             (int)depth * 2 + 2, ""));
1841
1842         trie->trans = (reg_trie_trans *)
1843             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1844                                   * trie->uniquecharcount + 1,
1845                                   sizeof(reg_trie_trans) );
1846         trie->states = (reg_trie_state *)
1847             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1848                                   sizeof(reg_trie_state) );
1849         next_alloc = trie->uniquecharcount + 1;
1850
1851
1852         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1853
1854             regnode * const noper   = NEXTOPER( cur );
1855             const U8 *uc     = (U8*)STRING( noper );
1856             const U8 * const e = uc + STR_LEN( noper );
1857
1858             U32 state        = 1;         /* required init */
1859
1860             U16 charid       = 0;         /* sanity init */
1861             U32 accept_state = 0;         /* sanity init */
1862             U8 *scan         = (U8*)NULL; /* sanity init */
1863
1864             STRLEN foldlen   = 0;         /* required init */
1865             U32 wordlen      = 0;         /* required init */
1866             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1867
1868             if ( OP(noper) != NOTHING ) {
1869                 for ( ; uc < e ; uc += len ) {
1870
1871                     TRIE_READ_CHAR;
1872
1873                     if ( uvc < 256 ) {
1874                         charid = trie->charmap[ uvc ];
1875                     } else {
1876                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1877                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1878                     }
1879                     if ( charid ) {
1880                         charid--;
1881                         if ( !trie->trans[ state + charid ].next ) {
1882                             trie->trans[ state + charid ].next = next_alloc;
1883                             trie->trans[ state ].check++;
1884                             prev_states[TRIE_NODENUM(next_alloc)]
1885                                     = TRIE_NODENUM(state);
1886                             next_alloc += trie->uniquecharcount;
1887                         }
1888                         state = trie->trans[ state + charid ].next;
1889                     } else {
1890                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1891                     }
1892                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1893                 }
1894             }
1895             accept_state = TRIE_NODENUM( state );
1896             TRIE_HANDLE_WORD(accept_state);
1897
1898         } /* end second pass */
1899
1900         /* and now dump it out before we compress it */
1901         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1902                                                           revcharmap,
1903                                                           next_alloc, depth+1));
1904
1905         {
1906         /*
1907            * Inplace compress the table.*
1908
1909            For sparse data sets the table constructed by the trie algorithm will
1910            be mostly 0/FAIL transitions or to put it another way mostly empty.
1911            (Note that leaf nodes will not contain any transitions.)
1912
1913            This algorithm compresses the tables by eliminating most such
1914            transitions, at the cost of a modest bit of extra work during lookup:
1915
1916            - Each states[] entry contains a .base field which indicates the
1917            index in the state[] array wheres its transition data is stored.
1918
1919            - If .base is 0 there are no valid transitions from that node.
1920
1921            - If .base is nonzero then charid is added to it to find an entry in
1922            the trans array.
1923
1924            -If trans[states[state].base+charid].check!=state then the
1925            transition is taken to be a 0/Fail transition. Thus if there are fail
1926            transitions at the front of the node then the .base offset will point
1927            somewhere inside the previous nodes data (or maybe even into a node
1928            even earlier), but the .check field determines if the transition is
1929            valid.
1930
1931            XXX - wrong maybe?
1932            The following process inplace converts the table to the compressed
1933            table: We first do not compress the root node 1,and mark all its
1934            .check pointers as 1 and set its .base pointer as 1 as well. This
1935            allows us to do a DFA construction from the compressed table later,
1936            and ensures that any .base pointers we calculate later are greater
1937            than 0.
1938
1939            - We set 'pos' to indicate the first entry of the second node.
1940
1941            - We then iterate over the columns of the node, finding the first and
1942            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1943            and set the .check pointers accordingly, and advance pos
1944            appropriately and repreat for the next node. Note that when we copy
1945            the next pointers we have to convert them from the original
1946            NODEIDX form to NODENUM form as the former is not valid post
1947            compression.
1948
1949            - If a node has no transitions used we mark its base as 0 and do not
1950            advance the pos pointer.
1951
1952            - If a node only has one transition we use a second pointer into the
1953            structure to fill in allocated fail transitions from other states.
1954            This pointer is independent of the main pointer and scans forward
1955            looking for null transitions that are allocated to a state. When it
1956            finds one it writes the single transition into the "hole".  If the
1957            pointer doesnt find one the single transition is appended as normal.
1958
1959            - Once compressed we can Renew/realloc the structures to release the
1960            excess space.
1961
1962            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1963            specifically Fig 3.47 and the associated pseudocode.
1964
1965            demq
1966         */
1967         const U32 laststate = TRIE_NODENUM( next_alloc );
1968         U32 state, charid;
1969         U32 pos = 0, zp=0;
1970         trie->statecount = laststate;
1971
1972         for ( state = 1 ; state < laststate ; state++ ) {
1973             U8 flag = 0;
1974             const U32 stateidx = TRIE_NODEIDX( state );
1975             const U32 o_used = trie->trans[ stateidx ].check;
1976             U32 used = trie->trans[ stateidx ].check;
1977             trie->trans[ stateidx ].check = 0;
1978
1979             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1980                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1981                     if ( trie->trans[ stateidx + charid ].next ) {
1982                         if (o_used == 1) {
1983                             for ( ; zp < pos ; zp++ ) {
1984                                 if ( ! trie->trans[ zp ].next ) {
1985                                     break;
1986                                 }
1987                             }
1988                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1989                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1990                             trie->trans[ zp ].check = state;
1991                             if ( ++zp > pos ) pos = zp;
1992                             break;
1993                         }
1994                         used--;
1995                     }
1996                     if ( !flag ) {
1997                         flag = 1;
1998                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1999                     }
2000                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2001                     trie->trans[ pos ].check = state;
2002                     pos++;
2003                 }
2004             }
2005         }
2006         trie->lasttrans = pos + 1;
2007         trie->states = (reg_trie_state *)
2008             PerlMemShared_realloc( trie->states, laststate
2009                                    * sizeof(reg_trie_state) );
2010         DEBUG_TRIE_COMPILE_MORE_r(
2011                 PerlIO_printf( Perl_debug_log,
2012                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2013                     (int)depth * 2 + 2,"",
2014                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2015                     (IV)next_alloc,
2016                     (IV)pos,
2017                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2018             );
2019
2020         } /* end table compress */
2021     }
2022     DEBUG_TRIE_COMPILE_MORE_r(
2023             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2024                 (int)depth * 2 + 2, "",
2025                 (UV)trie->statecount,
2026                 (UV)trie->lasttrans)
2027     );
2028     /* resize the trans array to remove unused space */
2029     trie->trans = (reg_trie_trans *)
2030         PerlMemShared_realloc( trie->trans, trie->lasttrans
2031                                * sizeof(reg_trie_trans) );
2032
2033     {   /* Modify the program and insert the new TRIE node */
2034         U8 nodetype =(U8)(flags & 0xFF);
2035         char *str=NULL;
2036
2037 #ifdef DEBUGGING
2038         regnode *optimize = NULL;
2039 #ifdef RE_TRACK_PATTERN_OFFSETS
2040
2041         U32 mjd_offset = 0;
2042         U32 mjd_nodelen = 0;
2043 #endif /* RE_TRACK_PATTERN_OFFSETS */
2044 #endif /* DEBUGGING */
2045         /*
2046            This means we convert either the first branch or the first Exact,
2047            depending on whether the thing following (in 'last') is a branch
2048            or not and whther first is the startbranch (ie is it a sub part of
2049            the alternation or is it the whole thing.)
2050            Assuming its a sub part we convert the EXACT otherwise we convert
2051            the whole branch sequence, including the first.
2052          */
2053         /* Find the node we are going to overwrite */
2054         if ( first != startbranch || OP( last ) == BRANCH ) {
2055             /* branch sub-chain */
2056             NEXT_OFF( first ) = (U16)(last - first);
2057 #ifdef RE_TRACK_PATTERN_OFFSETS
2058             DEBUG_r({
2059                 mjd_offset= Node_Offset((convert));
2060                 mjd_nodelen= Node_Length((convert));
2061             });
2062 #endif
2063             /* whole branch chain */
2064         }
2065 #ifdef RE_TRACK_PATTERN_OFFSETS
2066         else {
2067             DEBUG_r({
2068                 const  regnode *nop = NEXTOPER( convert );
2069                 mjd_offset= Node_Offset((nop));
2070                 mjd_nodelen= Node_Length((nop));
2071             });
2072         }
2073         DEBUG_OPTIMISE_r(
2074             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2075                 (int)depth * 2 + 2, "",
2076                 (UV)mjd_offset, (UV)mjd_nodelen)
2077         );
2078 #endif
2079         /* But first we check to see if there is a common prefix we can
2080            split out as an EXACT and put in front of the TRIE node.  */
2081         trie->startstate= 1;
2082         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2083             U32 state;
2084             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2085                 U32 ofs = 0;
2086                 I32 idx = -1;
2087                 U32 count = 0;
2088                 const U32 base = trie->states[ state ].trans.base;
2089
2090                 if ( trie->states[state].wordnum )
2091                         count = 1;
2092
2093                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2094                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2095                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2096                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2097                     {
2098                         if ( ++count > 1 ) {
2099                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2100                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2101                             if ( state == 1 ) break;
2102                             if ( count == 2 ) {
2103                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2104                                 DEBUG_OPTIMISE_r(
2105                                     PerlIO_printf(Perl_debug_log,
2106                                         "%*sNew Start State=%"UVuf" Class: [",
2107                                         (int)depth * 2 + 2, "",
2108                                         (UV)state));
2109                                 if (idx >= 0) {
2110                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2111                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2112
2113                                     TRIE_BITMAP_SET(trie,*ch);
2114                                     if ( folder )
2115                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2116                                     DEBUG_OPTIMISE_r(
2117                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2118                                     );
2119                                 }
2120                             }
2121                             TRIE_BITMAP_SET(trie,*ch);
2122                             if ( folder )
2123                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2124                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2125                         }
2126                         idx = ofs;
2127                     }
2128                 }
2129                 if ( count == 1 ) {
2130                     SV **tmp = av_fetch( revcharmap, idx, 0);
2131                     STRLEN len;
2132                     char *ch = SvPV( *tmp, len );
2133                     DEBUG_OPTIMISE_r({
2134                         SV *sv=sv_newmortal();
2135                         PerlIO_printf( Perl_debug_log,
2136                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2137                             (int)depth * 2 + 2, "",
2138                             (UV)state, (UV)idx,
2139                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2140                                 PL_colors[0], PL_colors[1],
2141                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2142                                 PERL_PV_ESCAPE_FIRSTCHAR
2143                             )
2144                         );
2145                     });
2146                     if ( state==1 ) {
2147                         OP( convert ) = nodetype;
2148                         str=STRING(convert);
2149                         STR_LEN(convert)=0;
2150                     }
2151                     STR_LEN(convert) += len;
2152                     while (len--)
2153                         *str++ = *ch++;
2154                 } else {
2155 #ifdef DEBUGGING
2156                     if (state>1)
2157                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2158 #endif
2159                     break;
2160                 }
2161             }
2162             trie->prefixlen = (state-1);
2163             if (str) {
2164                 regnode *n = convert+NODE_SZ_STR(convert);
2165                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2166                 trie->startstate = state;
2167                 trie->minlen -= (state - 1);
2168                 trie->maxlen -= (state - 1);
2169 #ifdef DEBUGGING
2170                /* At least the UNICOS C compiler choked on this
2171                 * being argument to DEBUG_r(), so let's just have
2172                 * it right here. */
2173                if (
2174 #ifdef PERL_EXT_RE_BUILD
2175                    1
2176 #else
2177                    DEBUG_r_TEST
2178 #endif
2179                    ) {
2180                    regnode *fix = convert;
2181                    U32 word = trie->wordcount;
2182                    mjd_nodelen++;
2183                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2184                    while( ++fix < n ) {
2185                        Set_Node_Offset_Length(fix, 0, 0);
2186                    }
2187                    while (word--) {
2188                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2189                        if (tmp) {
2190                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2191                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2192                            else
2193                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2194                        }
2195                    }
2196                }
2197 #endif
2198                 if (trie->maxlen) {
2199                     convert = n;
2200                 } else {
2201                     NEXT_OFF(convert) = (U16)(tail - convert);
2202                     DEBUG_r(optimize= n);
2203                 }
2204             }
2205         }
2206         if (!jumper)
2207             jumper = last;
2208         if ( trie->maxlen ) {
2209             NEXT_OFF( convert ) = (U16)(tail - convert);
2210             ARG_SET( convert, data_slot );
2211             /* Store the offset to the first unabsorbed branch in
2212                jump[0], which is otherwise unused by the jump logic.
2213                We use this when dumping a trie and during optimisation. */
2214             if (trie->jump)
2215                 trie->jump[0] = (U16)(nextbranch - convert);
2216
2217             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2218              *   and there is a bitmap
2219              *   and the first "jump target" node we found leaves enough room
2220              * then convert the TRIE node into a TRIEC node, with the bitmap
2221              * embedded inline in the opcode - this is hypothetically faster.
2222              */
2223             if ( !trie->states[trie->startstate].wordnum
2224                  && trie->bitmap
2225                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2226             {
2227                 OP( convert ) = TRIEC;
2228                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2229                 PerlMemShared_free(trie->bitmap);
2230                 trie->bitmap= NULL;
2231             } else
2232                 OP( convert ) = TRIE;
2233
2234             /* store the type in the flags */
2235             convert->flags = nodetype;
2236             DEBUG_r({
2237             optimize = convert
2238                       + NODE_STEP_REGNODE
2239                       + regarglen[ OP( convert ) ];
2240             });
2241             /* XXX We really should free up the resource in trie now,
2242                    as we won't use them - (which resources?) dmq */
2243         }
2244         /* needed for dumping*/
2245         DEBUG_r(if (optimize) {
2246             regnode *opt = convert;
2247
2248             while ( ++opt < optimize) {
2249                 Set_Node_Offset_Length(opt,0,0);
2250             }
2251             /*
2252                 Try to clean up some of the debris left after the
2253                 optimisation.
2254              */
2255             while( optimize < jumper ) {
2256                 mjd_nodelen += Node_Length((optimize));
2257                 OP( optimize ) = OPTIMIZED;
2258                 Set_Node_Offset_Length(optimize,0,0);
2259                 optimize++;
2260             }
2261             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2262         });
2263     } /* end node insert */
2264
2265     /*  Finish populating the prev field of the wordinfo array.  Walk back
2266      *  from each accept state until we find another accept state, and if
2267      *  so, point the first word's .prev field at the second word. If the
2268      *  second already has a .prev field set, stop now. This will be the
2269      *  case either if we've already processed that word's accept state,
2270      *  or that state had multiple words, and the overspill words were
2271      *  already linked up earlier.
2272      */
2273     {
2274         U16 word;
2275         U32 state;
2276         U16 prev;
2277
2278         for (word=1; word <= trie->wordcount; word++) {
2279             prev = 0;
2280             if (trie->wordinfo[word].prev)
2281                 continue;
2282             state = trie->wordinfo[word].accept;
2283             while (state) {
2284                 state = prev_states[state];
2285                 if (!state)
2286                     break;
2287                 prev = trie->states[state].wordnum;
2288                 if (prev)
2289                     break;
2290             }
2291             trie->wordinfo[word].prev = prev;
2292         }
2293         Safefree(prev_states);
2294     }
2295
2296
2297     /* and now dump out the compressed format */
2298     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2299
2300     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2301 #ifdef DEBUGGING
2302     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2303     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2304 #else
2305     SvREFCNT_dec(revcharmap);
2306 #endif
2307     return trie->jump
2308            ? MADE_JUMP_TRIE
2309            : trie->startstate>1
2310              ? MADE_EXACT_TRIE
2311              : MADE_TRIE;
2312 }
2313
2314 STATIC void
2315 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2316 {
2317 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2318
2319    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2320    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2321    ISBN 0-201-10088-6
2322
2323    We find the fail state for each state in the trie, this state is the longest proper
2324    suffix of the current state's 'word' that is also a proper prefix of another word in our
2325    trie. State 1 represents the word '' and is thus the default fail state. This allows
2326    the DFA not to have to restart after its tried and failed a word at a given point, it
2327    simply continues as though it had been matching the other word in the first place.
2328    Consider
2329       'abcdgu'=~/abcdefg|cdgu/
2330    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2331    fail, which would bring us to the state representing 'd' in the second word where we would
2332    try 'g' and succeed, proceeding to match 'cdgu'.
2333  */
2334  /* add a fail transition */
2335     const U32 trie_offset = ARG(source);
2336     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2337     U32 *q;
2338     const U32 ucharcount = trie->uniquecharcount;
2339     const U32 numstates = trie->statecount;
2340     const U32 ubound = trie->lasttrans + ucharcount;
2341     U32 q_read = 0;
2342     U32 q_write = 0;
2343     U32 charid;
2344     U32 base = trie->states[ 1 ].trans.base;
2345     U32 *fail;
2346     reg_ac_data *aho;
2347     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2348     GET_RE_DEBUG_FLAGS_DECL;
2349
2350     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2351 #ifndef DEBUGGING
2352     PERL_UNUSED_ARG(depth);
2353 #endif
2354
2355
2356     ARG_SET( stclass, data_slot );
2357     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2358     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2359     aho->trie=trie_offset;
2360     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2361     Copy( trie->states, aho->states, numstates, reg_trie_state );
2362     Newxz( q, numstates, U32);
2363     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2364     aho->refcount = 1;
2365     fail = aho->fail;
2366     /* initialize fail[0..1] to be 1 so that we always have
2367        a valid final fail state */
2368     fail[ 0 ] = fail[ 1 ] = 1;
2369
2370     for ( charid = 0; charid < ucharcount ; charid++ ) {
2371         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2372         if ( newstate ) {
2373             q[ q_write ] = newstate;
2374             /* set to point at the root */
2375             fail[ q[ q_write++ ] ]=1;
2376         }
2377     }
2378     while ( q_read < q_write) {
2379         const U32 cur = q[ q_read++ % numstates ];
2380         base = trie->states[ cur ].trans.base;
2381
2382         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2383             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2384             if (ch_state) {
2385                 U32 fail_state = cur;
2386                 U32 fail_base;
2387                 do {
2388                     fail_state = fail[ fail_state ];
2389                     fail_base = aho->states[ fail_state ].trans.base;
2390                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2391
2392                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2393                 fail[ ch_state ] = fail_state;
2394                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2395                 {
2396                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2397                 }
2398                 q[ q_write++ % numstates] = ch_state;
2399             }
2400         }
2401     }
2402     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2403        when we fail in state 1, this allows us to use the
2404        charclass scan to find a valid start char. This is based on the principle
2405        that theres a good chance the string being searched contains lots of stuff
2406        that cant be a start char.
2407      */
2408     fail[ 0 ] = fail[ 1 ] = 0;
2409     DEBUG_TRIE_COMPILE_r({
2410         PerlIO_printf(Perl_debug_log,
2411                       "%*sStclass Failtable (%"UVuf" states): 0",
2412                       (int)(depth * 2), "", (UV)numstates
2413         );
2414         for( q_read=1; q_read<numstates; q_read++ ) {
2415             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2416         }
2417         PerlIO_printf(Perl_debug_log, "\n");
2418     });
2419     Safefree(q);
2420     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2421 }
2422
2423
2424 /*
2425  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2426  * These need to be revisited when a newer toolchain becomes available.
2427  */
2428 #if defined(__sparc64__) && defined(__GNUC__)
2429 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2430 #       undef  SPARC64_GCC_WORKAROUND
2431 #       define SPARC64_GCC_WORKAROUND 1
2432 #   endif
2433 #endif
2434
2435 #define DEBUG_PEEP(str,scan,depth) \
2436     DEBUG_OPTIMISE_r({if (scan){ \
2437        SV * const mysv=sv_newmortal(); \
2438        regnode *Next = regnext(scan); \
2439        regprop(RExC_rx, mysv, scan); \
2440        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2441        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2442        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2443    }});
2444
2445
2446
2447
2448
2449 #define JOIN_EXACT(scan,min,flags) \
2450     if (PL_regkind[OP(scan)] == EXACT) \
2451         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2452
2453 STATIC U32
2454 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2455     /* Merge several consecutive EXACTish nodes into one. */
2456     regnode *n = regnext(scan);
2457     U32 stringok = 1;
2458     regnode *next = scan + NODE_SZ_STR(scan);
2459     U32 merged = 0;
2460     U32 stopnow = 0;
2461 #ifdef DEBUGGING
2462     regnode *stop = scan;
2463     GET_RE_DEBUG_FLAGS_DECL;
2464 #else
2465     PERL_UNUSED_ARG(depth);
2466 #endif
2467
2468     PERL_ARGS_ASSERT_JOIN_EXACT;
2469 #ifndef EXPERIMENTAL_INPLACESCAN
2470     PERL_UNUSED_ARG(flags);
2471     PERL_UNUSED_ARG(val);
2472 #endif
2473     DEBUG_PEEP("join",scan,depth);
2474
2475     /* Skip NOTHING, merge EXACT*. */
2476     while (n &&
2477            ( PL_regkind[OP(n)] == NOTHING ||
2478              (stringok && (OP(n) == OP(scan))))
2479            && NEXT_OFF(n)
2480            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2481
2482         if (OP(n) == TAIL || n > next)
2483             stringok = 0;
2484         if (PL_regkind[OP(n)] == NOTHING) {
2485             DEBUG_PEEP("skip:",n,depth);
2486             NEXT_OFF(scan) += NEXT_OFF(n);
2487             next = n + NODE_STEP_REGNODE;
2488 #ifdef DEBUGGING
2489             if (stringok)
2490                 stop = n;
2491 #endif
2492             n = regnext(n);
2493         }
2494         else if (stringok) {
2495             const unsigned int oldl = STR_LEN(scan);
2496             regnode * const nnext = regnext(n);
2497
2498             DEBUG_PEEP("merg",n,depth);
2499
2500             merged++;
2501             if (oldl + STR_LEN(n) > U8_MAX)
2502                 break;
2503             NEXT_OFF(scan) += NEXT_OFF(n);
2504             STR_LEN(scan) += STR_LEN(n);
2505             next = n + NODE_SZ_STR(n);
2506             /* Now we can overwrite *n : */
2507             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2508 #ifdef DEBUGGING
2509             stop = next - 1;
2510 #endif
2511             n = nnext;
2512             if (stopnow) break;
2513         }
2514
2515 #ifdef EXPERIMENTAL_INPLACESCAN
2516         if (flags && !NEXT_OFF(n)) {
2517             DEBUG_PEEP("atch", val, depth);
2518             if (reg_off_by_arg[OP(n)]) {
2519                 ARG_SET(n, val - n);
2520             }
2521             else {
2522                 NEXT_OFF(n) = val - n;
2523             }
2524             stopnow = 1;
2525         }
2526 #endif
2527     }
2528 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2529 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2530 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2531 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2532
2533     if (UTF
2534         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2535         && ( STR_LEN(scan) >= 6 ) )
2536     {
2537     /*
2538     Two problematic code points in Unicode casefolding of EXACT nodes:
2539
2540     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2541     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2542
2543     which casefold to
2544
2545     Unicode                      UTF-8
2546
2547     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2548     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2549
2550     This means that in case-insensitive matching (or "loose matching",
2551     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2552     length of the above casefolded versions) can match a target string
2553     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2554     This would rather mess up the minimum length computation.
2555
2556     What we'll do is to look for the tail four bytes, and then peek
2557     at the preceding two bytes to see whether we need to decrease
2558     the minimum length by four (six minus two).
2559
2560     Thanks to the design of UTF-8, there cannot be false matches:
2561     A sequence of valid UTF-8 bytes cannot be a subsequence of
2562     another valid sequence of UTF-8 bytes.
2563
2564     */
2565          char * const s0 = STRING(scan), *s, *t;
2566          char * const s1 = s0 + STR_LEN(scan) - 1;
2567          char * const s2 = s1 - 4;
2568 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2569          const char t0[] = "\xaf\x49\xaf\x42";
2570 #else
2571          const char t0[] = "\xcc\x88\xcc\x81";
2572 #endif
2573          const char * const t1 = t0 + 3;
2574
2575          for (s = s0 + 2;
2576               s < s2 && (t = ninstr(s, s1, t0, t1));
2577               s = t + 4) {
2578 #ifdef EBCDIC
2579               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2580                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2581 #else
2582               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2583                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2584 #endif
2585                    *min -= 4;
2586          }
2587     }
2588
2589 #ifdef DEBUGGING
2590     /* Allow dumping */
2591     n = scan + NODE_SZ_STR(scan);
2592     while (n <= stop) {
2593         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2594             OP(n) = OPTIMIZED;
2595             NEXT_OFF(n) = 0;
2596         }
2597         n++;
2598     }
2599 #endif
2600     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2601     return stopnow;
2602 }
2603
2604 /* REx optimizer.  Converts nodes into quicker variants "in place".
2605    Finds fixed substrings.  */
2606
2607 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2608    to the position after last scanned or to NULL. */
2609
2610 #define INIT_AND_WITHP \
2611     assert(!and_withp); \
2612     Newx(and_withp,1,struct regnode_charclass_class); \
2613     SAVEFREEPV(and_withp)
2614
2615 /* this is a chain of data about sub patterns we are processing that
2616    need to be handled separately/specially in study_chunk. Its so
2617    we can simulate recursion without losing state.  */
2618 struct scan_frame;
2619 typedef struct scan_frame {
2620     regnode *last;  /* last node to process in this frame */
2621     regnode *next;  /* next node to process when last is reached */
2622     struct scan_frame *prev; /*previous frame*/
2623     I32 stop; /* what stopparen do we use */
2624 } scan_frame;
2625
2626
2627 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2628
2629 #define CASE_SYNST_FNC(nAmE)                                       \
2630 case nAmE:                                                         \
2631     if (flags & SCF_DO_STCLASS_AND) {                              \
2632             for (value = 0; value < 256; value++)                  \
2633                 if (!is_ ## nAmE ## _cp(value))                       \
2634                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2635     }                                                              \
2636     else {                                                         \
2637             for (value = 0; value < 256; value++)                  \
2638                 if (is_ ## nAmE ## _cp(value))                        \
2639                     ANYOF_BITMAP_SET(data->start_class, value);    \
2640     }                                                              \
2641     break;                                                         \
2642 case N ## nAmE:                                                    \
2643     if (flags & SCF_DO_STCLASS_AND) {                              \
2644             for (value = 0; value < 256; value++)                   \
2645                 if (is_ ## nAmE ## _cp(value))                         \
2646                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2647     }                                                               \
2648     else {                                                          \
2649             for (value = 0; value < 256; value++)                   \
2650                 if (!is_ ## nAmE ## _cp(value))                        \
2651                     ANYOF_BITMAP_SET(data->start_class, value);     \
2652     }                                                               \
2653     break
2654
2655
2656
2657 STATIC I32
2658 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2659                         I32 *minlenp, I32 *deltap,
2660                         regnode *last,
2661                         scan_data_t *data,
2662                         I32 stopparen,
2663                         U8* recursed,
2664                         struct regnode_charclass_class *and_withp,
2665                         U32 flags, U32 depth)
2666                         /* scanp: Start here (read-write). */
2667                         /* deltap: Write maxlen-minlen here. */
2668                         /* last: Stop before this one. */
2669                         /* data: string data about the pattern */
2670                         /* stopparen: treat close N as END */
2671                         /* recursed: which subroutines have we recursed into */
2672                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2673 {
2674     dVAR;
2675     I32 min = 0, pars = 0, code;
2676     regnode *scan = *scanp, *next;
2677     I32 delta = 0;
2678     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2679     int is_inf_internal = 0;            /* The studied chunk is infinite */
2680     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2681     scan_data_t data_fake;
2682     SV *re_trie_maxbuff = NULL;
2683     regnode *first_non_open = scan;
2684     I32 stopmin = I32_MAX;
2685     scan_frame *frame = NULL;
2686     GET_RE_DEBUG_FLAGS_DECL;
2687
2688     PERL_ARGS_ASSERT_STUDY_CHUNK;
2689
2690 #ifdef DEBUGGING
2691     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2692 #endif
2693
2694     if ( depth == 0 ) {
2695         while (first_non_open && OP(first_non_open) == OPEN)
2696             first_non_open=regnext(first_non_open);
2697     }
2698
2699
2700   fake_study_recurse:
2701     while ( scan && OP(scan) != END && scan < last ){
2702         /* Peephole optimizer: */
2703         DEBUG_STUDYDATA("Peep:", data,depth);
2704         DEBUG_PEEP("Peep",scan,depth);
2705         JOIN_EXACT(scan,&min,0);
2706
2707         /* Follow the next-chain of the current node and optimize
2708            away all the NOTHINGs from it.  */
2709         if (OP(scan) != CURLYX) {
2710             const int max = (reg_off_by_arg[OP(scan)]
2711                        ? I32_MAX
2712                        /* I32 may be smaller than U16 on CRAYs! */
2713                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2714             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2715             int noff;
2716             regnode *n = scan;
2717
2718             /* Skip NOTHING and LONGJMP. */
2719             while ((n = regnext(n))
2720                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2721                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2722                    && off + noff < max)
2723                 off += noff;
2724             if (reg_off_by_arg[OP(scan)])
2725                 ARG(scan) = off;
2726             else
2727                 NEXT_OFF(scan) = off;
2728         }
2729
2730
2731
2732         /* The principal pseudo-switch.  Cannot be a switch, since we
2733            look into several different things.  */
2734         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2735                    || OP(scan) == IFTHEN) {
2736             next = regnext(scan);
2737             code = OP(scan);
2738             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2739
2740             if (OP(next) == code || code == IFTHEN) {
2741                 /* NOTE - There is similar code to this block below for handling
2742                    TRIE nodes on a re-study.  If you change stuff here check there
2743                    too. */
2744                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2745                 struct regnode_charclass_class accum;
2746                 regnode * const startbranch=scan;
2747
2748                 if (flags & SCF_DO_SUBSTR)
2749                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2750                 if (flags & SCF_DO_STCLASS)
2751                     cl_init_zero(&accum);
2752
2753                 while (OP(scan) == code) {
2754                     I32 deltanext, minnext, f = 0, fake;
2755                     struct regnode_charclass_class this_class;
2756
2757                     num++;
2758                     data_fake.flags = 0;
2759                     if (data) {
2760                         data_fake.whilem_c = data->whilem_c;
2761                         data_fake.last_closep = data->last_closep;
2762                     }
2763                     else
2764                         data_fake.last_closep = &fake;
2765
2766                     data_fake.pos_delta = delta;
2767                     next = regnext(scan);
2768                     scan = NEXTOPER(scan);
2769                     if (code != BRANCH)
2770                         scan = NEXTOPER(scan);
2771                     if (flags & SCF_DO_STCLASS) {
2772                         cl_init(&this_class);
2773                         data_fake.start_class = &this_class;
2774                         f = SCF_DO_STCLASS_AND;
2775                     }
2776                     if (flags & SCF_WHILEM_VISITED_POS)
2777                         f |= SCF_WHILEM_VISITED_POS;
2778
2779                     /* we suppose the run is continuous, last=next...*/
2780                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2781                                           next, &data_fake,
2782                                           stopparen, recursed, NULL, f,depth+1);
2783                     if (min1 > minnext)
2784                         min1 = minnext;
2785                     if (max1 < minnext + deltanext)
2786                         max1 = minnext + deltanext;
2787                     if (deltanext == I32_MAX)
2788                         is_inf = is_inf_internal = 1;
2789                     scan = next;
2790                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2791                         pars++;
2792                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2793                         if ( stopmin > minnext)
2794                             stopmin = min + min1;
2795                         flags &= ~SCF_DO_SUBSTR;
2796                         if (data)
2797                             data->flags |= SCF_SEEN_ACCEPT;
2798                     }
2799                     if (data) {
2800                         if (data_fake.flags & SF_HAS_EVAL)
2801                             data->flags |= SF_HAS_EVAL;
2802                         data->whilem_c = data_fake.whilem_c;
2803                     }
2804                     if (flags & SCF_DO_STCLASS)
2805                         cl_or(&accum, &this_class);
2806                 }
2807                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2808                     min1 = 0;
2809                 if (flags & SCF_DO_SUBSTR) {
2810                     data->pos_min += min1;
2811                     data->pos_delta += max1 - min1;
2812                     if (max1 != min1 || is_inf)
2813                         data->longest = &(data->longest_float);
2814                 }
2815                 min += min1;
2816                 delta += max1 - min1;
2817                 if (flags & SCF_DO_STCLASS_OR) {
2818                     cl_or(data->start_class, &accum);
2819                     if (min1) {
2820                         cl_and(data->start_class, and_withp);
2821                         flags &= ~SCF_DO_STCLASS;
2822                     }
2823                 }
2824                 else if (flags & SCF_DO_STCLASS_AND) {
2825                     if (min1) {
2826                         cl_and(data->start_class, &accum);
2827                         flags &= ~SCF_DO_STCLASS;
2828                     }
2829                     else {
2830                         /* Switch to OR mode: cache the old value of
2831                          * data->start_class */
2832                         INIT_AND_WITHP;
2833                         StructCopy(data->start_class, and_withp,
2834                                    struct regnode_charclass_class);
2835                         flags &= ~SCF_DO_STCLASS_AND;
2836                         StructCopy(&accum, data->start_class,
2837                                    struct regnode_charclass_class);
2838                         flags |= SCF_DO_STCLASS_OR;
2839                         data->start_class->flags |= ANYOF_EOS;
2840                     }
2841                 }
2842
2843                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2844                 /* demq.
2845
2846                    Assuming this was/is a branch we are dealing with: 'scan' now
2847                    points at the item that follows the branch sequence, whatever
2848                    it is. We now start at the beginning of the sequence and look
2849                    for subsequences of
2850
2851                    BRANCH->EXACT=>x1
2852                    BRANCH->EXACT=>x2
2853                    tail
2854
2855                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2856
2857                    If we can find such a subsequence we need to turn the first
2858                    element into a trie and then add the subsequent branch exact
2859                    strings to the trie.
2860
2861                    We have two cases
2862
2863                      1. patterns where the whole set of branches can be converted.
2864
2865                      2. patterns where only a subset can be converted.
2866
2867                    In case 1 we can replace the whole set with a single regop
2868                    for the trie. In case 2 we need to keep the start and end
2869                    branches so
2870
2871                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2872                      becomes BRANCH TRIE; BRANCH X;
2873
2874                   There is an additional case, that being where there is a
2875                   common prefix, which gets split out into an EXACT like node
2876                   preceding the TRIE node.
2877
2878                   If x(1..n)==tail then we can do a simple trie, if not we make
2879                   a "jump" trie, such that when we match the appropriate word
2880                   we "jump" to the appropriate tail node. Essentially we turn
2881                   a nested if into a case structure of sorts.
2882
2883                 */
2884
2885                     int made=0;
2886                     if (!re_trie_maxbuff) {
2887                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2888                         if (!SvIOK(re_trie_maxbuff))
2889                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2890                     }
2891                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2892                         regnode *cur;
2893                         regnode *first = (regnode *)NULL;
2894                         regnode *last = (regnode *)NULL;
2895                         regnode *tail = scan;
2896                         U8 optype = 0;
2897                         U32 count=0;
2898
2899 #ifdef DEBUGGING
2900                         SV * const mysv = sv_newmortal();       /* for dumping */
2901 #endif
2902                         /* var tail is used because there may be a TAIL
2903                            regop in the way. Ie, the exacts will point to the
2904                            thing following the TAIL, but the last branch will
2905                            point at the TAIL. So we advance tail. If we
2906                            have nested (?:) we may have to move through several
2907                            tails.
2908                          */
2909
2910                         while ( OP( tail ) == TAIL ) {
2911                             /* this is the TAIL generated by (?:) */
2912                             tail = regnext( tail );
2913                         }
2914
2915
2916                         DEBUG_OPTIMISE_r({
2917                             regprop(RExC_rx, mysv, tail );
2918                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2919                                 (int)depth * 2 + 2, "",
2920                                 "Looking for TRIE'able sequences. Tail node is: ",
2921                                 SvPV_nolen_const( mysv )
2922                             );
2923                         });
2924
2925                         /*
2926
2927                            step through the branches, cur represents each
2928                            branch, noper is the first thing to be matched
2929                            as part of that branch and noper_next is the
2930                            regnext() of that node. if noper is an EXACT
2931                            and noper_next is the same as scan (our current
2932                            position in the regex) then the EXACT branch is
2933                            a possible optimization target. Once we have
2934                            two or more consecutive such branches we can
2935                            create a trie of the EXACT's contents and stich
2936                            it in place. If the sequence represents all of
2937                            the branches we eliminate the whole thing and
2938                            replace it with a single TRIE. If it is a
2939                            subsequence then we need to stitch it in. This
2940                            means the first branch has to remain, and needs
2941                            to be repointed at the item on the branch chain
2942                            following the last branch optimized. This could
2943                            be either a BRANCH, in which case the
2944                            subsequence is internal, or it could be the
2945                            item following the branch sequence in which
2946                            case the subsequence is at the end.
2947
2948                         */
2949
2950                         /* dont use tail as the end marker for this traverse */
2951                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2952                             regnode * const noper = NEXTOPER( cur );
2953 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2954                             regnode * const noper_next = regnext( noper );
2955 #endif
2956
2957                             DEBUG_OPTIMISE_r({
2958                                 regprop(RExC_rx, mysv, cur);
2959                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2960                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2961
2962                                 regprop(RExC_rx, mysv, noper);
2963                                 PerlIO_printf( Perl_debug_log, " -> %s",
2964                                     SvPV_nolen_const(mysv));
2965
2966                                 if ( noper_next ) {
2967                                   regprop(RExC_rx, mysv, noper_next );
2968                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2969                                     SvPV_nolen_const(mysv));
2970                                 }
2971                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2972                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2973                             });
2974                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2975                                          : PL_regkind[ OP( noper ) ] == EXACT )
2976                                   || OP(noper) == NOTHING )
2977 #ifdef NOJUMPTRIE
2978                                   && noper_next == tail
2979 #endif
2980                                   && count < U16_MAX)
2981                             {
2982                                 count++;
2983                                 if ( !first || optype == NOTHING ) {
2984                                     if (!first) first = cur;
2985                                     optype = OP( noper );
2986                                 } else {
2987                                     last = cur;
2988                                 }
2989                             } else {
2990 /*
2991     Currently we do not believe that the trie logic can
2992     handle case insensitive matching properly when the
2993     pattern is not unicode (thus forcing unicode semantics).
2994
2995     If/when this is fixed the following define can be swapped
2996     in below to fully enable trie logic.
2997
2998     XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
2999     not /aa
3000
3001 #define TRIE_TYPE_IS_SAFE 1
3002
3003 */
3004 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3005
3006                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3007                                     make_trie( pRExC_state,
3008                                             startbranch, first, cur, tail, count,
3009                                             optype, depth+1 );
3010                                 }
3011                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3012 #ifdef NOJUMPTRIE
3013                                      && noper_next == tail
3014 #endif
3015                                 ){
3016                                     count = 1;
3017                                     first = cur;
3018                                     optype = OP( noper );
3019                                 } else {
3020                                     count = 0;
3021                                     first = NULL;
3022                                     optype = 0;
3023                                 }
3024                                 last = NULL;
3025                             }
3026                         }
3027                         DEBUG_OPTIMISE_r({
3028                             regprop(RExC_rx, mysv, cur);
3029                             PerlIO_printf( Perl_debug_log,
3030                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3031                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3032
3033                         });
3034
3035                         if ( last && TRIE_TYPE_IS_SAFE ) {
3036                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3037 #ifdef TRIE_STUDY_OPT
3038                             if ( ((made == MADE_EXACT_TRIE &&
3039                                  startbranch == first)
3040                                  || ( first_non_open == first )) &&
3041                                  depth==0 ) {
3042                                 flags |= SCF_TRIE_RESTUDY;
3043                                 if ( startbranch == first
3044                                      && scan == tail )
3045                                 {
3046                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3047                                 }
3048                             }
3049 #endif
3050                         }
3051                     }
3052
3053                 } /* do trie */
3054
3055             }
3056             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3057                 scan = NEXTOPER(NEXTOPER(scan));
3058             } else                      /* single branch is optimized. */
3059                 scan = NEXTOPER(scan);
3060             continue;
3061         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3062             scan_frame *newframe = NULL;
3063             I32 paren;
3064             regnode *start;
3065             regnode *end;
3066
3067             if (OP(scan) != SUSPEND) {
3068             /* set the pointer */
3069                 if (OP(scan) == GOSUB) {
3070                     paren = ARG(scan);
3071                     RExC_recurse[ARG2L(scan)] = scan;
3072                     start = RExC_open_parens[paren-1];
3073                     end   = RExC_close_parens[paren-1];
3074                 } else {
3075                     paren = 0;
3076                     start = RExC_rxi->program + 1;
3077                     end   = RExC_opend;
3078                 }
3079                 if (!recursed) {
3080                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3081                     SAVEFREEPV(recursed);
3082                 }
3083                 if (!PAREN_TEST(recursed,paren+1)) {
3084                     PAREN_SET(recursed,paren+1);
3085                     Newx(newframe,1,scan_frame);
3086                 } else {
3087                     if (flags & SCF_DO_SUBSTR) {
3088                         SCAN_COMMIT(pRExC_state,data,minlenp);
3089                         data->longest = &(data->longest_float);
3090                     }
3091                     is_inf = is_inf_internal = 1;
3092                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3093                         cl_anything(data->start_class);
3094                     flags &= ~SCF_DO_STCLASS;
3095                 }
3096             } else {
3097                 Newx(newframe,1,scan_frame);
3098                 paren = stopparen;
3099                 start = scan+2;
3100                 end = regnext(scan);
3101             }
3102             if (newframe) {
3103                 assert(start);
3104                 assert(end);
3105                 SAVEFREEPV(newframe);
3106                 newframe->next = regnext(scan);
3107                 newframe->last = last;
3108                 newframe->stop = stopparen;
3109                 newframe->prev = frame;
3110
3111                 frame = newframe;
3112                 scan =  start;
3113                 stopparen = paren;
3114                 last = end;
3115
3116                 continue;
3117             }
3118         }
3119         else if (OP(scan) == EXACT) {
3120             I32 l = STR_LEN(scan);
3121             UV uc;
3122             if (UTF) {
3123                 const U8 * const s = (U8*)STRING(scan);
3124                 l = utf8_length(s, s + l);
3125                 uc = utf8_to_uvchr(s, NULL);
3126             } else {
3127                 uc = *((U8*)STRING(scan));
3128             }
3129             min += l;
3130             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3131                 /* The code below prefers earlier match for fixed
3132                    offset, later match for variable offset.  */
3133                 if (data->last_end == -1) { /* Update the start info. */
3134                     data->last_start_min = data->pos_min;
3135                     data->last_start_max = is_inf
3136                         ? I32_MAX : data->pos_min + data->pos_delta;
3137                 }
3138                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3139                 if (UTF)
3140                     SvUTF8_on(data->last_found);
3141                 {
3142                     SV * const sv = data->last_found;
3143                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3144                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3145                     if (mg && mg->mg_len >= 0)
3146                         mg->mg_len += utf8_length((U8*)STRING(scan),
3147                                                   (U8*)STRING(scan)+STR_LEN(scan));
3148                 }
3149                 data->last_end = data->pos_min + l;
3150                 data->pos_min += l; /* As in the first entry. */
3151                 data->flags &= ~SF_BEFORE_EOL;
3152             }
3153             if (flags & SCF_DO_STCLASS_AND) {
3154                 /* Check whether it is compatible with what we know already! */
3155                 int compat = 1;
3156
3157
3158                 /* If compatible, we or it in below.  It is compatible if is
3159                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3160                  * it's for a locale.  Even if there isn't unicode semantics
3161                  * here, at runtime there may be because of matching against a
3162                  * utf8 string, so accept a possible false positive for
3163                  * latin1-range folds */
3164                 if (uc >= 0x100 ||
3165                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3166                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3167                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3168                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3169                     )
3170                 {
3171                     compat = 0;
3172                 }
3173                 ANYOF_CLASS_ZERO(data->start_class);
3174                 ANYOF_BITMAP_ZERO(data->start_class);
3175                 if (compat)
3176                     ANYOF_BITMAP_SET(data->start_class, uc);
3177                 else if (uc >= 0x100) {
3178                     int i;
3179
3180                     /* Some Unicode code points fold to the Latin1 range; as
3181                      * XXX temporary code, instead of figuring out if this is
3182                      * one, just assume it is and set all the start class bits
3183                      * that could be some such above 255 code point's fold
3184                      * which will generate fals positives.  As the code
3185                      * elsewhere that does compute the fold settles down, it
3186                      * can be extracted out and re-used here */
3187                     for (i = 0; i < 256; i++){
3188                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3189                             ANYOF_BITMAP_SET(data->start_class, i);
3190                         }
3191                     }
3192                 }
3193                 data->start_class->flags &= ~ANYOF_EOS;
3194                 if (uc < 0x100)
3195                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3196             }
3197             else if (flags & SCF_DO_STCLASS_OR) {
3198                 /* false positive possible if the class is case-folded */
3199                 if (uc < 0x100)
3200                     ANYOF_BITMAP_SET(data->start_class, uc);
3201                 else
3202                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3203                 data->start_class->flags &= ~ANYOF_EOS;
3204                 cl_and(data->start_class, and_withp);
3205             }
3206             flags &= ~SCF_DO_STCLASS;
3207         }
3208         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3209             I32 l = STR_LEN(scan);
3210             UV uc = *((U8*)STRING(scan));
3211
3212             /* Search for fixed substrings supports EXACT only. */
3213             if (flags & SCF_DO_SUBSTR) {
3214                 assert(data);
3215                 SCAN_COMMIT(pRExC_state, data, minlenp);
3216             }
3217             if (UTF) {
3218                 const U8 * const s = (U8 *)STRING(scan);
3219                 l = utf8_length(s, s + l);
3220                 uc = utf8_to_uvchr(s, NULL);
3221             }
3222             min += l;
3223             if (flags & SCF_DO_SUBSTR)
3224                 data->pos_min += l;
3225             if (flags & SCF_DO_STCLASS_AND) {
3226                 /* Check whether it is compatible with what we know already! */
3227                 int compat = 1;
3228                 if (uc >= 0x100 ||
3229                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3230                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3231                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3232                 {
3233                     compat = 0;
3234                 }
3235                 ANYOF_CLASS_ZERO(data->start_class);
3236                 ANYOF_BITMAP_ZERO(data->start_class);
3237                 if (compat) {
3238                     ANYOF_BITMAP_SET(data->start_class, uc);
3239                     data->start_class->flags &= ~ANYOF_EOS;
3240                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3241                     if (OP(scan) == EXACTFL) {
3242                         data->start_class->flags |= ANYOF_LOCALE;
3243                     }
3244                     else {
3245
3246                         /* Also set the other member of the fold pair.  In case
3247                          * that unicode semantics is called for at runtime, use
3248                          * the full latin1 fold.  (Can't do this for locale,
3249                          * because not known until runtime */
3250                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3251                     }
3252                 }
3253                 else if (uc >= 0x100) {
3254                     int i;
3255                     for (i = 0; i < 256; i++){
3256                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3257                             ANYOF_BITMAP_SET(data->start_class, i);
3258                         }
3259                     }
3260                 }
3261             }
3262             else if (flags & SCF_DO_STCLASS_OR) {
3263                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3264                     /* false positive possible if the class is case-folded.
3265                        Assume that the locale settings are the same... */
3266                     if (uc < 0x100) {
3267                         ANYOF_BITMAP_SET(data->start_class, uc);
3268                         if (OP(scan) != EXACTFL) {
3269
3270                             /* And set the other member of the fold pair, but
3271                              * can't do that in locale because not known until
3272                              * run-time */
3273                             ANYOF_BITMAP_SET(data->start_class,
3274                                              PL_fold_latin1[uc]);
3275                         }
3276                     }
3277                     data->start_class->flags &= ~ANYOF_EOS;
3278                 }
3279                 cl_and(data->start_class, and_withp);
3280             }
3281             flags &= ~SCF_DO_STCLASS;
3282         }
3283         else if (REGNODE_VARIES(OP(scan))) {
3284             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3285             I32 f = flags, pos_before = 0;
3286             regnode * const oscan = scan;
3287             struct regnode_charclass_class this_class;
3288             struct regnode_charclass_class *oclass = NULL;
3289             I32 next_is_eval = 0;
3290
3291             switch (PL_regkind[OP(scan)]) {
3292             case WHILEM:                /* End of (?:...)* . */
3293                 scan = NEXTOPER(scan);
3294                 goto finish;
3295             case PLUS:
3296                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3297                     next = NEXTOPER(scan);
3298                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3299                         mincount = 1;
3300                         maxcount = REG_INFTY;
3301                         next = regnext(scan);
3302                         scan = NEXTOPER(scan);
3303                         goto do_curly;
3304                     }
3305                 }
3306                 if (flags & SCF_DO_SUBSTR)
3307                     data->pos_min++;
3308                 min++;
3309                 /* Fall through. */
3310             case STAR:
3311                 if (flags & SCF_DO_STCLASS) {
3312                     mincount = 0;
3313                     maxcount = REG_INFTY;
3314                     next = regnext(scan);
3315                     scan = NEXTOPER(scan);
3316                     goto do_curly;
3317                 }
3318                 is_inf = is_inf_internal = 1;
3319                 scan = regnext(scan);
3320                 if (flags & SCF_DO_SUBSTR) {
3321                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3322                     data->longest = &(data->longest_float);
3323                 }
3324                 goto optimize_curly_tail;
3325             case CURLY:
3326                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3327                     && (scan->flags == stopparen))
3328                 {
3329                     mincount = 1;
3330                     maxcount = 1;
3331                 } else {
3332                     mincount = ARG1(scan);
3333                     maxcount = ARG2(scan);
3334                 }
3335                 next = regnext(scan);
3336                 if (OP(scan) == CURLYX) {
3337                     I32 lp = (data ? *(data->last_closep) : 0);
3338                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3339                 }
3340                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3341                 next_is_eval = (OP(scan) == EVAL);
3342               do_curly:
3343                 if (flags & SCF_DO_SUBSTR) {
3344                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3345                     pos_before = data->pos_min;
3346                 }
3347                 if (data) {
3348                     fl = data->flags;
3349                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3350                     if (is_inf)
3351                         data->flags |= SF_IS_INF;
3352                 }
3353                 if (flags & SCF_DO_STCLASS) {
3354                     cl_init(&this_class);
3355                     oclass = data->start_class;
3356                     data->start_class = &this_class;
3357                     f |= SCF_DO_STCLASS_AND;
3358                     f &= ~SCF_DO_STCLASS_OR;
3359                 }
3360                 /* Exclude from super-linear cache processing any {n,m}
3361                    regops for which the combination of input pos and regex
3362                    pos is not enough information to determine if a match
3363                    will be possible.
3364
3365                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3366                    regex pos at the \s*, the prospects for a match depend not
3367                    only on the input position but also on how many (bar\s*)
3368                    repeats into the {4,8} we are. */
3369                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3370                     f &= ~SCF_WHILEM_VISITED_POS;
3371
3372                 /* This will finish on WHILEM, setting scan, or on NULL: */
3373                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3374                                       last, data, stopparen, recursed, NULL,
3375                                       (mincount == 0
3376                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3377
3378                 if (flags & SCF_DO_STCLASS)
3379                     data->start_class = oclass;
3380                 if (mincount == 0 || minnext == 0) {
3381                     if (flags & SCF_DO_STCLASS_OR) {
3382                         cl_or(data->start_class, &this_class);
3383                     }
3384                     else if (flags & SCF_DO_STCLASS_AND) {
3385                         /* Switch to OR mode: cache the old value of
3386                          * data->start_class */
3387                         INIT_AND_WITHP;
3388                         StructCopy(data->start_class, and_withp,
3389                                    struct regnode_charclass_class);
3390                         flags &= ~SCF_DO_STCLASS_AND;
3391                         StructCopy(&this_class, data->start_class,
3392                                    struct regnode_charclass_class);
3393                         flags |= SCF_DO_STCLASS_OR;
3394                         data->start_class->flags |= ANYOF_EOS;
3395                     }
3396                 } else {                /* Non-zero len */
3397                     if (flags & SCF_DO_STCLASS_OR) {
3398                         cl_or(data->start_class, &this_class);
3399                         cl_and(data->start_class, and_withp);
3400                     }
3401                     else if (flags & SCF_DO_STCLASS_AND)
3402                         cl_and(data->start_class, &this_class);
3403                     flags &= ~SCF_DO_STCLASS;
3404                 }
3405                 if (!scan)              /* It was not CURLYX, but CURLY. */
3406                     scan = next;
3407                 if ( /* ? quantifier ok, except for (?{ ... }) */
3408                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3409                     && (minnext == 0) && (deltanext == 0)
3410                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3411                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3412                 {
3413                     ckWARNreg(RExC_parse,
3414                               "Quantifier unexpected on zero-length expression");
3415                 }
3416
3417                 min += minnext * mincount;
3418                 is_inf_internal |= ((maxcount == REG_INFTY
3419                                      && (minnext + deltanext) > 0)
3420                                     || deltanext == I32_MAX);
3421                 is_inf |= is_inf_internal;
3422                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3423
3424                 /* Try powerful optimization CURLYX => CURLYN. */
3425                 if (  OP(oscan) == CURLYX && data
3426                       && data->flags & SF_IN_PAR
3427                       && !(data->flags & SF_HAS_EVAL)
3428                       && !deltanext && minnext == 1 ) {
3429                     /* Try to optimize to CURLYN.  */
3430                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3431                     regnode * const nxt1 = nxt;
3432 #ifdef DEBUGGING
3433                     regnode *nxt2;
3434 #endif
3435
3436                     /* Skip open. */
3437                     nxt = regnext(nxt);
3438                     if (!REGNODE_SIMPLE(OP(nxt))
3439                         && !(PL_regkind[OP(nxt)] == EXACT
3440                              && STR_LEN(nxt) == 1))
3441                         goto nogo;
3442 #ifdef DEBUGGING
3443                     nxt2 = nxt;
3444 #endif
3445                     nxt = regnext(nxt);
3446                     if (OP(nxt) != CLOSE)
3447                         goto nogo;
3448                     if (RExC_open_parens) {
3449                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3450                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3451                     }
3452                     /* Now we know that nxt2 is the only contents: */
3453                     oscan->flags = (U8)ARG(nxt);
3454                     OP(oscan) = CURLYN;
3455                     OP(nxt1) = NOTHING; /* was OPEN. */
3456
3457 #ifdef DEBUGGING
3458                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3459                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3460                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3461                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3462                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3463                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3464 #endif
3465                 }
3466               nogo:
3467
3468                 /* Try optimization CURLYX => CURLYM. */
3469                 if (  OP(oscan) == CURLYX && data
3470                       && !(data->flags & SF_HAS_PAR)
3471                       && !(data->flags & SF_HAS_EVAL)
3472                       && !deltanext     /* atom is fixed width */
3473                       && minnext != 0   /* CURLYM can't handle zero width */
3474                 ) {
3475                     /* XXXX How to optimize if data == 0? */
3476                     /* Optimize to a simpler form.  */
3477                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3478                     regnode *nxt2;
3479
3480                     OP(oscan) = CURLYM;
3481                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3482                             && (OP(nxt2) != WHILEM))
3483                         nxt = nxt2;
3484                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3485                     /* Need to optimize away parenths. */
3486                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3487                         /* Set the parenth number.  */
3488                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3489
3490                         oscan->flags = (U8)ARG(nxt);
3491                         if (RExC_open_parens) {
3492                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3493                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3494                         }
3495                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3496                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3497
3498 #ifdef DEBUGGING
3499                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3500                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3501                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3502                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3503 #endif
3504 #if 0
3505                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3506                             regnode *nnxt = regnext(nxt1);
3507                             if (nnxt == nxt) {
3508                                 if (reg_off_by_arg[OP(nxt1)])
3509                                     ARG_SET(nxt1, nxt2 - nxt1);
3510                                 else if (nxt2 - nxt1 < U16_MAX)
3511                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3512                                 else
3513                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3514                             }
3515                             nxt1 = nnxt;
3516                         }
3517 #endif
3518                         /* Optimize again: */
3519                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3520                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3521                     }
3522                     else
3523                         oscan->flags = 0;
3524                 }
3525                 else if ((OP(oscan) == CURLYX)
3526                          && (flags & SCF_WHILEM_VISITED_POS)
3527                          /* See the comment on a similar expression above.
3528                             However, this time it's not a subexpression
3529                             we care about, but the expression itself. */
3530                          && (maxcount == REG_INFTY)
3531                          && data && ++data->whilem_c < 16) {
3532                     /* This stays as CURLYX, we can put the count/of pair. */
3533                     /* Find WHILEM (as in regexec.c) */
3534                     regnode *nxt = oscan + NEXT_OFF(oscan);
3535
3536                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3537                         nxt += ARG(nxt);
3538                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3539                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3540                 }
3541                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3542                     pars++;
3543                 if (flags & SCF_DO_SUBSTR) {
3544                     SV *last_str = NULL;
3545                     int counted = mincount != 0;
3546
3547                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3548 #if defined(SPARC64_GCC_WORKAROUND)
3549                         I32 b = 0;
3550                         STRLEN l = 0;
3551                         const char *s = NULL;
3552                         I32 old = 0;
3553
3554                         if (pos_before >= data->last_start_min)
3555                             b = pos_before;
3556                         else
3557                             b = data->last_start_min;
3558
3559                         l = 0;
3560                         s = SvPV_const(data->last_found, l);
3561                         old = b - data->last_start_min;
3562
3563 #else
3564                         I32 b = pos_before >= data->last_start_min
3565                             ? pos_before : data->last_start_min;
3566                         STRLEN l;
3567                         const char * const s = SvPV_const(data->last_found, l);
3568                         I32 old = b - data->last_start_min;
3569 #endif
3570
3571                         if (UTF)
3572                             old = utf8_hop((U8*)s, old) - (U8*)s;
3573                         l -= old;
3574                         /* Get the added string: */
3575                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3576                         if (deltanext == 0 && pos_before == b) {
3577                             /* What was added is a constant string */
3578                             if (mincount > 1) {
3579                                 SvGROW(last_str, (mincount * l) + 1);
3580                                 repeatcpy(SvPVX(last_str) + l,
3581                                           SvPVX_const(last_str), l, mincount - 1);
3582                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3583                                 /* Add additional parts. */
3584                                 SvCUR_set(data->last_found,
3585                                           SvCUR(data->last_found) - l);
3586                                 sv_catsv(data->last_found, last_str);
3587                                 {
3588                                     SV * sv = data->last_found;
3589                                     MAGIC *mg =
3590                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3591                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3592                                     if (mg && mg->mg_len >= 0)
3593                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3594                                 }
3595                                 data->last_end += l * (mincount - 1);
3596                             }
3597                         } else {
3598                             /* start offset must point into the last copy */
3599                             data->last_start_min += minnext * (mincount - 1);
3600                             data->last_start_max += is_inf ? I32_MAX
3601                                 : (maxcount - 1) * (minnext + data->pos_delta);
3602                         }
3603                     }
3604                     /* It is counted once already... */
3605                     data->pos_min += minnext * (mincount - counted);
3606                     data->pos_delta += - counted * deltanext +
3607                         (minnext + deltanext) * maxcount - minnext * mincount;
3608                     if (mincount != maxcount) {
3609                          /* Cannot extend fixed substrings found inside
3610                             the group.  */
3611                         SCAN_COMMIT(pRExC_state,data,minlenp);
3612                         if (mincount && last_str) {
3613                             SV * const sv = data->last_found;
3614                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3615                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3616
3617                             if (mg)
3618                                 mg->mg_len = -1;
3619                             sv_setsv(sv, last_str);
3620                             data->last_end = data->pos_min;
3621                             data->last_start_min =
3622                                 data->pos_min - CHR_SVLEN(last_str);
3623                             data->last_start_max = is_inf
3624                                 ? I32_MAX
3625                                 : data->pos_min + data->pos_delta
3626                                 - CHR_SVLEN(last_str);
3627                         }
3628                         data->longest = &(data->longest_float);
3629                     }
3630                     SvREFCNT_dec(last_str);
3631                 }
3632                 if (data && (fl & SF_HAS_EVAL))
3633                     data->flags |= SF_HAS_EVAL;
3634               optimize_curly_tail:
3635                 if (OP(oscan) != CURLYX) {
3636                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3637                            && NEXT_OFF(next))
3638                         NEXT_OFF(oscan) += NEXT_OFF(next);
3639                 }
3640                 continue;
3641             default:                    /* REF, ANYOFV, and CLUMP only? */
3642                 if (flags & SCF_DO_SUBSTR) {
3643                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3644                     data->longest = &(data->longest_float);
3645                 }
3646                 is_inf = is_inf_internal = 1;
3647                 if (flags & SCF_DO_STCLASS_OR)
3648                     cl_anything(data->start_class);
3649                 flags &= ~SCF_DO_STCLASS;
3650                 break;
3651             }
3652         }
3653         else if (OP(scan) == LNBREAK) {
3654             if (flags & SCF_DO_STCLASS) {
3655                 int value = 0;
3656                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3657                 if (flags & SCF_DO_STCLASS_AND) {
3658                     for (value = 0; value < 256; value++)
3659                         if (!is_VERTWS_cp(value))
3660                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3661                 }
3662                 else {
3663                     for (value = 0; value < 256; value++)
3664                         if (is_VERTWS_cp(value))
3665                             ANYOF_BITMAP_SET(data->start_class, value);
3666                 }
3667                 if (flags & SCF_DO_STCLASS_OR)
3668                     cl_and(data->start_class, and_withp);
3669                 flags &= ~SCF_DO_STCLASS;
3670             }
3671             min += 1;
3672             delta += 1;
3673             if (flags & SCF_DO_SUBSTR) {
3674                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3675                 data->pos_min += 1;
3676                 data->pos_delta += 1;
3677                 data->longest = &(data->longest_float);
3678             }
3679         }
3680         else if (OP(scan) == FOLDCHAR) {
3681             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3682             flags &= ~SCF_DO_STCLASS;
3683             min += 1;
3684             delta += d;
3685             if (flags & SCF_DO_SUBSTR) {
3686                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3687                 data->pos_min += 1;
3688                 data->pos_delta += d;
3689                 data->longest = &(data->longest_float);
3690             }
3691         }
3692         else if (REGNODE_SIMPLE(OP(scan))) {
3693             int value = 0;
3694
3695             if (flags & SCF_DO_SUBSTR) {
3696                 SCAN_COMMIT(pRExC_state,data,minlenp);
3697                 data->pos_min++;
3698             }
3699             min++;
3700             if (flags & SCF_DO_STCLASS) {
3701                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3702
3703                 /* Some of the logic below assumes that switching
3704                    locale on will only add false positives. */
3705                 switch (PL_regkind[OP(scan)]) {
3706                 case SANY:
3707                 default:
3708                   do_default:
3709                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3710                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3711                         cl_anything(data->start_class);
3712                     break;
3713                 case REG_ANY:
3714                     if (OP(scan) == SANY)
3715                         goto do_default;
3716                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3717                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3718                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3719                         cl_anything(data->start_class);
3720                     }
3721                     if (flags & SCF_DO_STCLASS_AND || !value)
3722                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3723                     break;
3724                 case ANYOF:
3725                     if (flags & SCF_DO_STCLASS_AND)
3726                         cl_and(data->start_class,
3727                                (struct regnode_charclass_class*)scan);
3728                     else
3729                         cl_or(data->start_class,
3730                               (struct regnode_charclass_class*)scan);
3731                     break;
3732                 case ALNUM:
3733                     if (flags & SCF_DO_STCLASS_AND) {
3734                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3735                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3736                             if (OP(scan) == ALNUMU) {
3737                                 for (value = 0; value < 256; value++) {
3738                                     if (!isWORDCHAR_L1(value)) {
3739                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3740                                     }
3741                                 }
3742                             } else {
3743                                 for (value = 0; value < 256; value++) {
3744                                     if (!isALNUM(value)) {
3745                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3746                                     }
3747                                 }
3748                             }
3749                         }
3750                     }
3751                     else {
3752                         if (data->start_class->flags & ANYOF_LOCALE)
3753                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3754                         else if (OP(scan) == ALNUMU) {
3755                             for (value = 0; value < 256; value++) {
3756                                 if (isWORDCHAR_L1(value)) {
3757                                     ANYOF_BITMAP_SET(data->start_class, value);
3758                                 }
3759                             }
3760                         } else {
3761                             for (value = 0; value < 256; value++) {
3762                                 if (isALNUM(value)) {
3763                                     ANYOF_BITMAP_SET(data->start_class, value);
3764                                 }
3765                             }
3766                         }
3767                     }
3768                     break;
3769                 case NALNUM:
3770                     if (flags & SCF_DO_STCLASS_AND) {
3771                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3772                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3773                             if (OP(scan) == NALNUMU) {
3774                                 for (value = 0; value < 256; value++) {
3775                                     if (isWORDCHAR_L1(value)) {
3776                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3777                                     }
3778                                 }
3779                             } else {
3780                                 for (value = 0; value < 256; value++) {
3781                                     if (isALNUM(value)) {
3782                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3783                                     }
3784                                 }
3785                             }
3786                         }
3787                     }
3788                     else {
3789                         if (data->start_class->flags & ANYOF_LOCALE)
3790                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3791                         else {
3792                             if (OP(scan) == NALNUMU) {
3793                                 for (value = 0; value < 256; value++) {
3794                                     if (! isWORDCHAR_L1(value)) {
3795                                         ANYOF_BITMAP_SET(data->start_class, value);
3796                                     }
3797                                 }
3798                             } else {
3799                                 for (value = 0; value < 256; value++) {
3800                                     if (! isALNUM(value)) {
3801                                         ANYOF_BITMAP_SET(data->start_class, value);
3802                                     }
3803                                 }
3804                             }
3805                         }
3806                     }
3807                     break;
3808                 case SPACE:
3809                     if (flags & SCF_DO_STCLASS_AND) {
3810                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3811                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3812                             if (OP(scan) == SPACEU) {
3813                                 for (value = 0; value < 256; value++) {
3814                                     if (!isSPACE_L1(value)) {
3815                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3816                                     }
3817                                 }
3818                             } else {
3819                                 for (value = 0; value < 256; value++) {
3820                                     if (!isSPACE(value)) {
3821                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3822                                     }
3823                                 }
3824                             }
3825                         }
3826                     }
3827                     else {
3828                         if (data->start_class->flags & ANYOF_LOCALE) {
3829                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3830                         }
3831                         else if (OP(scan) == SPACEU) {
3832                             for (value = 0; value < 256; value++) {
3833                                 if (isSPACE_L1(value)) {
3834                                     ANYOF_BITMAP_SET(data->start_class, value);
3835                                 }
3836                             }
3837                         } else {
3838                             for (value = 0; value < 256; value++) {
3839                                 if (isSPACE(value)) {
3840                                     ANYOF_BITMAP_SET(data->start_class, value);
3841                                 }
3842                             }
3843                         }
3844                     }
3845                     break;
3846                 case NSPACE:
3847                     if (flags & SCF_DO_STCLASS_AND) {
3848                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3849                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3850                             if (OP(scan) == NSPACEU) {
3851                                 for (value = 0; value < 256; value++) {
3852                                     if (isSPACE_L1(value)) {
3853                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3854                                     }
3855                                 }
3856                             } else {
3857                                 for (value = 0; value < 256; value++) {
3858                                     if (isSPACE(value)) {
3859                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3860                                     }
3861                                 }
3862                             }
3863                         }
3864                     }
3865                     else {
3866                         if (data->start_class->flags & ANYOF_LOCALE)
3867                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3868                         else if (OP(scan) == NSPACEU) {
3869                             for (value = 0; value < 256; value++) {
3870                                 if (!isSPACE_L1(value)) {
3871                                     ANYOF_BITMAP_SET(data->start_class, value);
3872                                 }
3873                             }
3874                         }
3875                         else {
3876                             for (value = 0; value < 256; value++) {
3877                                 if (!isSPACE(value)) {
3878                                     ANYOF_BITMAP_SET(data->start_class, value);
3879                                 }
3880                             }
3881                         }
3882                     }
3883                     break;
3884                 case DIGIT:
3885                     if (flags & SCF_DO_STCLASS_AND) {
3886                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3887                         for (value = 0; value < 256; value++)
3888                             if (!isDIGIT(value))
3889                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3890                     }
3891                     else {
3892                         if (data->start_class->flags & ANYOF_LOCALE)
3893                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3894                         else {
3895                             for (value = 0; value < 256; value++)
3896                                 if (isDIGIT(value))
3897                                     ANYOF_BITMAP_SET(data->start_class, value);
3898                         }
3899                     }
3900                     break;
3901                 case NDIGIT:
3902                     if (flags & SCF_DO_STCLASS_AND) {
3903                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3904                         for (value = 0; value < 256; value++)
3905                             if (isDIGIT(value))
3906                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3907                     }
3908                     else {
3909                         if (data->start_class->flags & ANYOF_LOCALE)
3910                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3911                         else {
3912                             for (value = 0; value < 256; value++)
3913                                 if (!isDIGIT(value))
3914                                     ANYOF_BITMAP_SET(data->start_class, value);
3915                         }
3916                     }
3917                     break;
3918                 CASE_SYNST_FNC(VERTWS);
3919                 CASE_SYNST_FNC(HORIZWS);
3920
3921                 }
3922                 if (flags & SCF_DO_STCLASS_OR)
3923                     cl_and(data->start_class, and_withp);
3924                 flags &= ~SCF_DO_STCLASS;
3925             }
3926         }
3927         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3928             data->flags |= (OP(scan) == MEOL
3929                             ? SF_BEFORE_MEOL
3930                             : SF_BEFORE_SEOL);
3931         }
3932         else if (  PL_regkind[OP(scan)] == BRANCHJ
3933                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3934                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3935                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3936             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3937                 || OP(scan) == UNLESSM )
3938             {
3939                 /* Negative Lookahead/lookbehind
3940                    In this case we can't do fixed string optimisation.
3941                 */
3942
3943                 I32 deltanext, minnext, fake = 0;
3944                 regnode *nscan;
3945                 struct regnode_charclass_class intrnl;
3946                 int f = 0;
3947
3948                 data_fake.flags = 0;
3949                 if (data) {
3950                     data_fake.whilem_c = data->whilem_c;
3951                     data_fake.last_closep = data->last_closep;
3952                 }
3953                 else
3954                     data_fake.last_closep = &fake;
3955                 data_fake.pos_delta = delta;
3956                 if ( flags & SCF_DO_STCLASS && !scan->flags
3957                      && OP(scan) == IFMATCH ) { /* Lookahead */
3958                     cl_init(&intrnl);
3959                     data_fake.start_class = &intrnl;
3960                     f |= SCF_DO_STCLASS_AND;
3961                 }
3962                 if (flags & SCF_WHILEM_VISITED_POS)
3963                     f |= SCF_WHILEM_VISITED_POS;
3964                 next = regnext(scan);
3965                 nscan = NEXTOPER(NEXTOPER(scan));
3966                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3967                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3968                 if (scan->flags) {
3969                     if (deltanext) {
3970                         FAIL("Variable length lookbehind not implemented");
3971                     }
3972                     else if (minnext > (I32)U8_MAX) {
3973                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3974                     }
3975                     scan->flags = (U8)minnext;
3976                 }
3977                 if (data) {
3978                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3979                         pars++;
3980                     if (data_fake.flags & SF_HAS_EVAL)
3981                         data->flags |= SF_HAS_EVAL;
3982                     data->whilem_c = data_fake.whilem_c;
3983                 }
3984                 if (f & SCF_DO_STCLASS_AND) {
3985                     if (flags & SCF_DO_STCLASS_OR) {
3986                         /* OR before, AND after: ideally we would recurse with
3987                          * data_fake to get the AND applied by study of the
3988                          * remainder of the pattern, and then derecurse;
3989                          * *** HACK *** for now just treat as "no information".
3990                          * See [perl #56690].
3991                          */
3992                         cl_init(data->start_class);
3993                     }  else {
3994                         /* AND before and after: combine and continue */
3995                         const int was = (data->start_class->flags & ANYOF_EOS);
3996
3997                         cl_and(data->start_class, &intrnl);
3998                         if (was)
3999                             data->start_class->flags |= ANYOF_EOS;
4000                     }
4001                 }
4002             }
4003 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4004             else {
4005                 /* Positive Lookahead/lookbehind
4006                    In this case we can do fixed string optimisation,
4007                    but we must be careful about it. Note in the case of
4008                    lookbehind the positions will be offset by the minimum
4009                    length of the pattern, something we won't know about
4010                    until after the recurse.
4011                 */
4012                 I32 deltanext, fake = 0;
4013                 regnode *nscan;
4014                 struct regnode_charclass_class intrnl;
4015                 int f = 0;
4016                 /* We use SAVEFREEPV so that when the full compile
4017                     is finished perl will clean up the allocated
4018                     minlens when it's all done. This way we don't
4019                     have to worry about freeing them when we know
4020                     they wont be used, which would be a pain.
4021                  */
4022                 I32 *minnextp;
4023                 Newx( minnextp, 1, I32 );
4024                 SAVEFREEPV(minnextp);
4025
4026                 if (data) {
4027                     StructCopy(data, &data_fake, scan_data_t);
4028                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4029                         f |= SCF_DO_SUBSTR;
4030                         if (scan->flags)
4031                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4032                         data_fake.last_found=newSVsv(data->last_found);
4033                     }
4034                 }
4035                 else
4036                     data_fake.last_closep = &fake;
4037                 data_fake.flags = 0;
4038                 data_fake.pos_delta = delta;
4039                 if (is_inf)
4040                     data_fake.flags |= SF_IS_INF;
4041                 if ( flags & SCF_DO_STCLASS && !scan->flags
4042                      && OP(scan) == IFMATCH ) { /* Lookahead */
4043                     cl_init(&intrnl);
4044                     data_fake.start_class = &intrnl;
4045                     f |= SCF_DO_STCLASS_AND;
4046                 }
4047                 if (flags & SCF_WHILEM_VISITED_POS)
4048                     f |= SCF_WHILEM_VISITED_POS;
4049                 next = regnext(scan);
4050                 nscan = NEXTOPER(NEXTOPER(scan));
4051
4052                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4053                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4054                 if (scan->flags) {
4055                     if (deltanext) {
4056                         FAIL("Variable length lookbehind not implemented");
4057                     }
4058                     else if (*minnextp > (I32)U8_MAX) {
4059                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4060                     }
4061                     scan->flags = (U8)*minnextp;
4062                 }
4063
4064                 *minnextp += min;
4065
4066                 if (f & SCF_DO_STCLASS_AND) {
4067                     const int was = (data->start_class->flags & ANYOF_EOS);
4068
4069                     cl_and(data->start_class, &intrnl);
4070                     if (was)
4071                         data->start_class->flags |= ANYOF_EOS;
4072                 }
4073                 if (data) {
4074                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4075                         pars++;
4076                     if (data_fake.flags & SF_HAS_EVAL)
4077                         data->flags |= SF_HAS_EVAL;
4078                     data->whilem_c = data_fake.whilem_c;
4079                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4080                         if (RExC_rx->minlen<*minnextp)
4081                             RExC_rx->minlen=*minnextp;
4082                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4083                         SvREFCNT_dec(data_fake.last_found);
4084
4085                         if ( data_fake.minlen_fixed != minlenp )
4086                         {
4087                             data->offset_fixed= data_fake.offset_fixed;
4088                             data->minlen_fixed= data_fake.minlen_fixed;
4089                             data->lookbehind_fixed+= scan->flags;
4090                         }
4091                         if ( data_fake.minlen_float != minlenp )
4092                         {
4093                             data->minlen_float= data_fake.minlen_float;
4094                             data->offset_float_min=data_fake.offset_float_min;
4095                             data->offset_float_max=data_fake.offset_float_max;
4096                             data->lookbehind_float+= scan->flags;
4097                         }
4098                     }
4099                 }
4100
4101
4102             }
4103 #endif
4104         }
4105         else if (OP(scan) == OPEN) {
4106             if (stopparen != (I32)ARG(scan))
4107                 pars++;
4108         }
4109         else if (OP(scan) == CLOSE) {
4110             if (stopparen == (I32)ARG(scan)) {
4111                 break;
4112             }
4113             if ((I32)ARG(scan) == is_par) {
4114                 next = regnext(scan);
4115
4116                 if ( next && (OP(next) != WHILEM) && next < last)
4117                     is_par = 0;         /* Disable optimization */
4118             }
4119             if (data)
4120                 *(data->last_closep) = ARG(scan);
4121         }
4122         else if (OP(scan) == EVAL) {
4123                 if (data)
4124                     data->flags |= SF_HAS_EVAL;
4125         }
4126         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4127             if (flags & SCF_DO_SUBSTR) {
4128                 SCAN_COMMIT(pRExC_state,data,minlenp);
4129                 flags &= ~SCF_DO_SUBSTR;
4130             }
4131             if (data && OP(scan)==ACCEPT) {
4132                 data->flags |= SCF_SEEN_ACCEPT;
4133                 if (stopmin > min)
4134                     stopmin = min;
4135             }
4136         }
4137         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4138         {
4139                 if (flags & SCF_DO_SUBSTR) {
4140                     SCAN_COMMIT(pRExC_state,data,minlenp);
4141                     data->longest = &(data->longest_float);
4142                 }
4143                 is_inf = is_inf_internal = 1;
4144                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4145                     cl_anything(data->start_class);
4146                 flags &= ~SCF_DO_STCLASS;
4147         }
4148         else if (OP(scan) == GPOS) {
4149             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4150                 !(delta || is_inf || (data && data->pos_delta)))
4151             {
4152                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4153                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4154                 if (RExC_rx->gofs < (U32)min)
4155                     RExC_rx->gofs = min;
4156             } else {
4157                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4158                 RExC_rx->gofs = 0;
4159             }
4160         }
4161 #ifdef TRIE_STUDY_OPT
4162 #ifdef FULL_TRIE_STUDY
4163         else if (PL_regkind[OP(scan)] == TRIE) {
4164             /* NOTE - There is similar code to this block above for handling
4165                BRANCH nodes on the initial study.  If you change stuff here
4166                check there too. */
4167             regnode *trie_node= scan;
4168             regnode *tail= regnext(scan);
4169             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4170             I32 max1 = 0, min1 = I32_MAX;
4171             struct regnode_charclass_class accum;
4172
4173             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4174                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4175             if (flags & SCF_DO_STCLASS)
4176                 cl_init_zero(&accum);
4177
4178             if (!trie->jump) {
4179                 min1= trie->minlen;
4180                 max1= trie->maxlen;
4181             } else {
4182                 const regnode *nextbranch= NULL;
4183                 U32 word;
4184
4185                 for ( word=1 ; word <= trie->wordcount ; word++)
4186                 {
4187                     I32 deltanext=0, minnext=0, f = 0, fake;
4188                     struct regnode_charclass_class this_class;
4189
4190                     data_fake.flags = 0;
4191                     if (data) {
4192                         data_fake.whilem_c = data->whilem_c;
4193                         data_fake.last_closep = data->last_closep;
4194                     }
4195                     else
4196                         data_fake.last_closep = &fake;
4197                     data_fake.pos_delta = delta;
4198                     if (flags & SCF_DO_STCLASS) {
4199                         cl_init(&this_class);
4200                         data_fake.start_class = &this_class;
4201                         f = SCF_DO_STCLASS_AND;
4202                     }
4203                     if (flags & SCF_WHILEM_VISITED_POS)
4204                         f |= SCF_WHILEM_VISITED_POS;
4205
4206                     if (trie->jump[word]) {
4207                         if (!nextbranch)
4208                             nextbranch = trie_node + trie->jump[0];
4209                         scan= trie_node + trie->jump[word];
4210                         /* We go from the jump point to the branch that follows
4211                            it. Note this means we need the vestigal unused branches
4212                            even though they arent otherwise used.
4213                          */
4214                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4215                             &deltanext, (regnode *)nextbranch, &data_fake,
4216                             stopparen, recursed, NULL, f,depth+1);
4217                     }
4218                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4219                         nextbranch= regnext((regnode*)nextbranch);
4220
4221                     if (min1 > (I32)(minnext + trie->minlen))
4222                         min1 = minnext + trie->minlen;
4223                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4224                         max1 = minnext + deltanext + trie->maxlen;
4225                     if (deltanext == I32_MAX)
4226                         is_inf = is_inf_internal = 1;
4227
4228                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4229                         pars++;
4230                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4231                         if ( stopmin > min + min1)
4232                             stopmin = min + min1;
4233                         flags &= ~SCF_DO_SUBSTR;
4234                         if (data)
4235                             data->flags |= SCF_SEEN_ACCEPT;
4236                     }
4237                     if (data) {
4238                         if (data_fake.flags & SF_HAS_EVAL)
4239                             data->flags |= SF_HAS_EVAL;
4240                         data->whilem_c = data_fake.whilem_c;
4241                     }
4242                     if (flags & SCF_DO_STCLASS)
4243                         cl_or(&accum, &this_class);
4244                 }
4245             }
4246             if (flags & SCF_DO_SUBSTR) {
4247                 data->pos_min += min1;
4248                 data->pos_delta += max1 - min1;
4249                 if (max1 != min1 || is_inf)
4250                     data->longest = &(data->longest_float);
4251             }
4252             min += min1;
4253             delta += max1 - min1;
4254             if (flags & SCF_DO_STCLASS_OR) {
4255                 cl_or(data->start_class, &accum);
4256                 if (min1) {
4257                     cl_and(data->start_class, and_withp);
4258                     flags &= ~SCF_DO_STCLASS;
4259                 }
4260             }
4261             else if (flags & SCF_DO_STCLASS_AND) {
4262                 if (min1) {
4263                     cl_and(data->start_class, &accum);
4264                     flags &= ~SCF_DO_STCLASS;
4265                 }
4266                 else {
4267                     /* Switch to OR mode: cache the old value of
4268                      * data->start_class */
4269                     INIT_AND_WITHP;
4270                     StructCopy(data->start_class, and_withp,
4271                                struct regnode_charclass_class);
4272                     flags &= ~SCF_DO_STCLASS_AND;
4273                     StructCopy(&accum, data->start_class,
4274                                struct regnode_charclass_class);
4275                     flags |= SCF_DO_STCLASS_OR;
4276                     data->start_class->flags |= ANYOF_EOS;
4277                 }
4278             }
4279             scan= tail;
4280             continue;
4281         }
4282 #else
4283         else if (PL_regkind[OP(scan)] == TRIE) {
4284             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4285             U8*bang=NULL;
4286
4287             min += trie->minlen;
4288             delta += (trie->maxlen - trie->minlen);
4289             flags &= ~SCF_DO_STCLASS; /* xxx */
4290             if (flags & SCF_DO_SUBSTR) {
4291                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4292                 data->pos_min += trie->minlen;
4293                 data->pos_delta += (trie->maxlen - trie->minlen);
4294                 if (trie->maxlen != trie->minlen)
4295                     data->longest = &(data->longest_float);
4296             }
4297             if (trie->jump) /* no more substrings -- for now /grr*/
4298                 flags &= ~SCF_DO_SUBSTR;
4299         }
4300 #endif /* old or new */
4301 #endif /* TRIE_STUDY_OPT */
4302
4303         /* Else: zero-length, ignore. */
4304         scan = regnext(scan);
4305     }
4306     if (frame) {
4307         last = frame->last;
4308         scan = frame->next;
4309         stopparen = frame->stop;
4310         frame = frame->prev;
4311         goto fake_study_recurse;
4312     }
4313
4314   finish:
4315     assert(!frame);
4316     DEBUG_STUDYDATA("pre-fin:",data,depth);
4317
4318     *scanp = scan;
4319     *deltap = is_inf_internal ? I32_MAX : delta;
4320     if (flags & SCF_DO_SUBSTR && is_inf)
4321         data->pos_delta = I32_MAX - data->pos_min;
4322     if (is_par > (I32)U8_MAX)
4323         is_par = 0;
4324     if (is_par && pars==1 && data) {
4325         data->flags |= SF_IN_PAR;
4326         data->flags &= ~SF_HAS_PAR;
4327     }
4328     else if (pars && data) {
4329         data->flags |= SF_HAS_PAR;
4330         data->flags &= ~SF_IN_PAR;
4331     }
4332     if (flags & SCF_DO_STCLASS_OR)
4333         cl_and(data->start_class, and_withp);
4334     if (flags & SCF_TRIE_RESTUDY)
4335         data->flags |=  SCF_TRIE_RESTUDY;
4336
4337     DEBUG_STUDYDATA("post-fin:",data,depth);
4338
4339     return min < stopmin ? min : stopmin;
4340 }
4341
4342 STATIC U32
4343 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4344 {
4345     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4346
4347     PERL_ARGS_ASSERT_ADD_DATA;
4348
4349     Renewc(RExC_rxi->data,
4350            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4351            char, struct reg_data);
4352     if(count)
4353         Renew(RExC_rxi->data->what, count + n, U8);
4354     else
4355         Newx(RExC_rxi->data->what, n, U8);
4356     RExC_rxi->data->count = count + n;
4357     Copy(s, RExC_rxi->data->what + count, n, U8);
4358     return count;
4359 }
4360
4361 /*XXX: todo make this not included in a non debugging perl */
4362 #ifndef PERL_IN_XSUB_RE
4363 void
4364 Perl_reginitcolors(pTHX)
4365 {
4366     dVAR;
4367     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4368     if (s) {
4369         char *t = savepv(s);
4370         int i = 0;
4371         PL_colors[0] = t;
4372         while (++i < 6) {
4373             t = strchr(t, '\t');
4374             if (t) {
4375                 *t = '\0';
4376                 PL_colors[i] = ++t;
4377             }
4378             else
4379                 PL_colors[i] = t = (char *)"";
4380         }
4381     } else {
4382         int i = 0;
4383         while (i < 6)
4384             PL_colors[i++] = (char *)"";
4385     }
4386     PL_colorset = 1;
4387 }
4388 #endif
4389
4390
4391 #ifdef TRIE_STUDY_OPT
4392 #define CHECK_RESTUDY_GOTO                                  \
4393         if (                                                \
4394               (data.flags & SCF_TRIE_RESTUDY)               \
4395               && ! restudied++                              \
4396         )     goto reStudy
4397 #else
4398 #define CHECK_RESTUDY_GOTO
4399 #endif
4400
4401 /*
4402  - pregcomp - compile a regular expression into internal code
4403  *
4404  * We can't allocate space until we know how big the compiled form will be,
4405  * but we can't compile it (and thus know how big it is) until we've got a
4406  * place to put the code.  So we cheat:  we compile it twice, once with code
4407  * generation turned off and size counting turned on, and once "for real".
4408  * This also means that we don't allocate space until we are sure that the
4409  * thing really will compile successfully, and we never have to move the
4410  * code and thus invalidate pointers into it.  (Note that it has to be in
4411  * one piece because free() must be able to free it all.) [NB: not true in perl]
4412  *
4413  * Beware that the optimization-preparation code in here knows about some
4414  * of the structure of the compiled regexp.  [I'll say.]
4415  */
4416
4417
4418
4419 #ifndef PERL_IN_XSUB_RE
4420 #define RE_ENGINE_PTR &PL_core_reg_engine
4421 #else
4422 extern const struct regexp_engine my_reg_engine;
4423 #define RE_ENGINE_PTR &my_reg_engine
4424 #endif
4425
4426 #ifndef PERL_IN_XSUB_RE
4427 REGEXP *
4428 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4429 {
4430     dVAR;
4431     HV * const table = GvHV(PL_hintgv);
4432
4433     PERL_ARGS_ASSERT_PREGCOMP;
4434
4435     /* Dispatch a request to compile a regexp to correct
4436        regexp engine. */
4437     if (table) {
4438         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4439         GET_RE_DEBUG_FLAGS_DECL;
4440         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4441             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4442             DEBUG_COMPILE_r({
4443                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4444                     SvIV(*ptr));
4445             });
4446             return CALLREGCOMP_ENG(eng, pattern, flags);
4447         }
4448     }
4449     return Perl_re_compile(aTHX_ pattern, flags);
4450 }
4451 #endif
4452
4453 REGEXP *
4454 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4455 {
4456     dVAR;
4457     REGEXP *rx;
4458     struct regexp *r;
4459     register regexp_internal *ri;
4460     STRLEN plen;
4461     char  *exp;
4462     char* xend;
4463     regnode *scan;
4464     I32 flags;
4465     I32 minlen = 0;
4466     U32 pm_flags;
4467
4468     /* these are all flags - maybe they should be turned
4469      * into a single int with different bit masks */
4470     I32 sawlookahead = 0;
4471     I32 sawplus = 0;
4472     I32 sawopen = 0;
4473     bool used_setjump = FALSE;
4474
4475     U8 jump_ret = 0;
4476     dJMPENV;
4477     scan_data_t data;
4478     RExC_state_t RExC_state;
4479     RExC_state_t * const pRExC_state = &RExC_state;
4480 #ifdef TRIE_STUDY_OPT
4481     int restudied;
4482     RExC_state_t copyRExC_state;
4483 #endif
4484     GET_RE_DEBUG_FLAGS_DECL;
4485
4486     PERL_ARGS_ASSERT_RE_COMPILE;
4487
4488     DEBUG_r(if (!PL_colorset) reginitcolors());
4489
4490     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4491     RExC_uni_semantics = 0;
4492
4493     /****************** LONG JUMP TARGET HERE***********************/
4494     /* Longjmp back to here if have to switch in midstream to utf8 */
4495     if (! RExC_orig_utf8) {
4496         JMPENV_PUSH(jump_ret);
4497         used_setjump = TRUE;
4498     }
4499
4500     if (jump_ret == 0) {    /* First time through */
4501         exp = SvPV(pattern, plen);
4502         xend = exp + plen;
4503         /* ignore the utf8ness if the pattern is 0 length */
4504         if (plen == 0) {
4505             RExC_utf8 = RExC_orig_utf8 = 0;
4506         }
4507
4508         DEBUG_COMPILE_r({
4509             SV *dsv= sv_newmortal();
4510             RE_PV_QUOTED_DECL(s, RExC_utf8,
4511                 dsv, exp, plen, 60);
4512             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4513                            PL_colors[4],PL_colors[5],s);
4514         });
4515     }
4516     else {  /* longjumped back */
4517         STRLEN len = plen;
4518
4519         /* If the cause for the longjmp was other than changing to utf8, pop
4520          * our own setjmp, and longjmp to the correct handler */
4521         if (jump_ret != UTF8_LONGJMP) {
4522             JMPENV_POP;
4523             JMPENV_JUMP(jump_ret);
4524         }
4525
4526         GET_RE_DEBUG_FLAGS;
4527
4528         /* It's possible to write a regexp in ascii that represents Unicode
4529         codepoints outside of the byte range, such as via \x{100}. If we
4530         detect such a sequence we have to convert the entire pattern to utf8
4531         and then recompile, as our sizing calculation will have been based
4532         on 1 byte == 1 character, but we will need to use utf8 to encode
4533         at least some part of the pattern, and therefore must convert the whole
4534         thing.
4535         -- dmq */
4536         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4537             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4538         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4539         xend = exp + len;
4540         RExC_orig_utf8 = RExC_utf8 = 1;
4541         SAVEFREEPV(exp);
4542     }
4543
4544 #ifdef TRIE_STUDY_OPT
4545     restudied = 0;
4546 #endif
4547
4548     /* Set to use unicode semantics if the pattern is in utf8 and has the
4549      * 'depends' charset specified, as it means unicode when utf8  */
4550     pm_flags = orig_pm_flags;
4551
4552     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4553         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4554     }
4555
4556     RExC_precomp = exp;
4557     RExC_flags = pm_flags;
4558     RExC_sawback = 0;
4559
4560     RExC_seen = 0;
4561     RExC_in_lookbehind = 0;
4562     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4563     RExC_seen_evals = 0;
4564     RExC_extralen = 0;
4565
4566     /* First pass: determine size, legality. */
4567     RExC_parse = exp;
4568     RExC_start = exp;
4569     RExC_end = xend;
4570     RExC_naughty = 0;
4571     RExC_npar = 1;
4572     RExC_nestroot = 0;
4573     RExC_size = 0L;
4574     RExC_emit = &PL_regdummy;
4575     RExC_whilem_seen = 0;
4576     RExC_open_parens = NULL;
4577     RExC_close_parens = NULL;
4578     RExC_opend = NULL;
4579     RExC_paren_names = NULL;
4580 #ifdef DEBUGGING
4581     RExC_paren_name_list = NULL;
4582 #endif
4583     RExC_recurse = NULL;
4584     RExC_recurse_count = 0;
4585
4586 #if 0 /* REGC() is (currently) a NOP at the first pass.
4587        * Clever compilers notice this and complain. --jhi */
4588     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4589 #endif
4590     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4591     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4592         RExC_precomp = NULL;
4593         return(NULL);
4594     }
4595
4596     /* Here, finished first pass.  Get rid of any added setjmp */
4597     if (used_setjump) {
4598         JMPENV_POP;
4599     }
4600
4601     DEBUG_PARSE_r({
4602         PerlIO_printf(Perl_debug_log,
4603             "Required size %"IVdf" nodes\n"
4604             "Starting second pass (creation)\n",
4605             (IV)RExC_size);
4606         RExC_lastnum=0;
4607         RExC_lastparse=NULL;
4608     });
4609
4610     /* The first pass could have found things that force Unicode semantics */
4611     if ((RExC_utf8 || RExC_uni_semantics)
4612          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4613     {
4614         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4615     }
4616
4617     /* Small enough for pointer-storage convention?
4618        If extralen==0, this means that we will not need long jumps. */
4619     if (RExC_size >= 0x10000L && RExC_extralen)
4620         RExC_size += RExC_extralen;
4621     else
4622         RExC_extralen = 0;
4623     if (RExC_whilem_seen > 15)
4624         RExC_whilem_seen = 15;
4625
4626     /* Allocate space and zero-initialize. Note, the two step process
4627        of zeroing when in debug mode, thus anything assigned has to
4628        happen after that */
4629     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4630     r = (struct regexp*)SvANY(rx);
4631     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4632          char, regexp_internal);
4633     if ( r == NULL || ri == NULL )
4634         FAIL("Regexp out of space");
4635 #ifdef DEBUGGING
4636     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4637     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4638 #else
4639     /* bulk initialize base fields with 0. */
4640     Zero(ri, sizeof(regexp_internal), char);
4641 #endif
4642
4643     /* non-zero initialization begins here */
4644     RXi_SET( r, ri );
4645     r->engine= RE_ENGINE_PTR;
4646     r->extflags = pm_flags;
4647     {
4648         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4649         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4650
4651         /* The caret is output if there are any defaults: if not all the STD
4652          * flags are set, or if no character set specifier is needed */
4653         bool has_default =
4654                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4655                     || ! has_charset);
4656         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4657         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4658                             >> RXf_PMf_STD_PMMOD_SHIFT);
4659         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4660         char *p;
4661         /* Allocate for the worst case, which is all the std flags are turned
4662          * on.  If more precision is desired, we could do a population count of
4663          * the flags set.  This could be done with a small lookup table, or by
4664          * shifting, masking and adding, or even, when available, assembly
4665          * language for a machine-language population count.
4666          * We never output a minus, as all those are defaults, so are
4667          * covered by the caret */
4668         const STRLEN wraplen = plen + has_p + has_runon
4669             + has_default       /* If needs a caret */
4670
4671                 /* If needs a character set specifier */
4672             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4673             + (sizeof(STD_PAT_MODS) - 1)
4674             + (sizeof("(?:)") - 1);
4675
4676         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4677         SvPOK_on(rx);
4678         SvFLAGS(rx) |= SvUTF8(pattern);
4679         *p++='('; *p++='?';
4680
4681         /* If a default, cover it using the caret */
4682         if (has_default) {
4683             *p++= DEFAULT_PAT_MOD;
4684         }
4685         if (has_charset) {
4686             STRLEN len;
4687             const char* const name = get_regex_charset_name(r->extflags, &len);
4688             Copy(name, p, len, char);
4689             p += len;
4690         }
4691         if (has_p)
4692             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4693         {
4694             char ch;
4695             while((ch = *fptr++)) {
4696                 if(reganch & 1)
4697                     *p++ = ch;
4698                 reganch >>= 1;
4699             }
4700         }
4701
4702         *p++ = ':';
4703         Copy(RExC_precomp, p, plen, char);
4704         assert ((RX_WRAPPED(rx) - p) < 16);
4705         r->pre_prefix = p - RX_WRAPPED(rx);
4706         p += plen;
4707         if (has_runon)
4708             *p++ = '\n';
4709         *p++ = ')';
4710         *p = 0;
4711         SvCUR_set(rx, p - SvPVX_const(rx));
4712     }
4713
4714     r->intflags = 0;
4715     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4716
4717     if (RExC_seen & REG_SEEN_RECURSE) {
4718         Newxz(RExC_open_parens, RExC_npar,regnode *);
4719         SAVEFREEPV(RExC_open_parens);
4720         Newxz(RExC_close_parens,RExC_npar,regnode *);
4721         SAVEFREEPV(RExC_close_parens);
4722     }
4723
4724     /* Useful during FAIL. */
4725 #ifdef RE_TRACK_PATTERN_OFFSETS
4726     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4727     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4728                           "%s %"UVuf" bytes for offset annotations.\n",
4729                           ri->u.offsets ? "Got" : "Couldn't get",
4730                           (UV)((2*RExC_size+1) * sizeof(U32))));
4731 #endif
4732     SetProgLen(ri,RExC_size);
4733     RExC_rx_sv = rx;
4734     RExC_rx = r;
4735     RExC_rxi = ri;
4736
4737     /* Second pass: emit code. */
4738     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4739     RExC_parse = exp;
4740     RExC_end = xend;
4741     RExC_naughty = 0;
4742     RExC_npar = 1;
4743     RExC_emit_start = ri->program;
4744     RExC_emit = ri->program;
4745     RExC_emit_bound = ri->program + RExC_size + 1;
4746
4747     /* Store the count of eval-groups for security checks: */
4748     RExC_rx->seen_evals = RExC_seen_evals;
4749     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4750     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4751         ReREFCNT_dec(rx);
4752         return(NULL);
4753     }
4754     /* XXXX To minimize changes to RE engine we always allocate
4755        3-units-long substrs field. */
4756     Newx(r->substrs, 1, struct reg_substr_data);
4757     if (RExC_recurse_count) {
4758         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4759         SAVEFREEPV(RExC_recurse);
4760     }
4761
4762 reStudy:
4763     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4764     Zero(r->substrs, 1, struct reg_substr_data);
4765
4766 #ifdef TRIE_STUDY_OPT
4767     if (!restudied) {
4768         StructCopy(&zero_scan_data, &data, scan_data_t);
4769         copyRExC_state = RExC_state;
4770     } else {
4771         U32 seen=RExC_seen;
4772         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4773
4774         RExC_state = copyRExC_state;
4775         if (seen & REG_TOP_LEVEL_BRANCHES)
4776             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4777         else
4778             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4779         if (data.last_found) {
4780             SvREFCNT_dec(data.longest_fixed);
4781             SvREFCNT_dec(data.longest_float);
4782             SvREFCNT_dec(data.last_found);
4783         }
4784         StructCopy(&zero_scan_data, &data, scan_data_t);
4785     }
4786 #else
4787     StructCopy(&zero_scan_data, &data, scan_data_t);
4788 #endif
4789
4790     /* Dig out information for optimizations. */
4791     r->extflags = RExC_flags; /* was pm_op */
4792     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4793
4794     if (UTF)
4795         SvUTF8_on(rx);  /* Unicode in it? */
4796     ri->regstclass = NULL;
4797     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4798         r->intflags |= PREGf_NAUGHTY;
4799     scan = ri->program + 1;             /* First BRANCH. */
4800
4801     /* testing for BRANCH here tells us whether there is "must appear"
4802        data in the pattern. If there is then we can use it for optimisations */
4803     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4804         I32 fake;
4805         STRLEN longest_float_length, longest_fixed_length;
4806         struct regnode_charclass_class ch_class; /* pointed to by data */
4807         int stclass_flag;
4808         I32 last_close = 0; /* pointed to by data */
4809         regnode *first= scan;
4810         regnode *first_next= regnext(first);
4811         /*
4812          * Skip introductions and multiplicators >= 1
4813          * so that we can extract the 'meat' of the pattern that must
4814          * match in the large if() sequence following.
4815          * NOTE that EXACT is NOT covered here, as it is normally
4816          * picked up by the optimiser separately.
4817          *
4818          * This is unfortunate as the optimiser isnt handling lookahead
4819          * properly currently.
4820          *
4821          */
4822         while ((OP(first) == OPEN && (sawopen = 1)) ||
4823                /* An OR of *one* alternative - should not happen now. */
4824             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4825             /* for now we can't handle lookbehind IFMATCH*/
4826             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4827             (OP(first) == PLUS) ||
4828             (OP(first) == MINMOD) ||
4829                /* An {n,m} with n>0 */
4830             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4831             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4832         {
4833                 /*
4834                  * the only op that could be a regnode is PLUS, all the rest
4835                  * will be regnode_1 or regnode_2.
4836                  *
4837                  */
4838                 if (OP(first) == PLUS)
4839                     sawplus = 1;
4840                 else
4841                     first += regarglen[OP(first)];
4842
4843                 first = NEXTOPER(first);
4844                 first_next= regnext(first);
4845         }
4846
4847         /* Starting-point info. */
4848       again:
4849         DEBUG_PEEP("first:",first,0);
4850         /* Ignore EXACT as we deal with it later. */
4851         if (PL_regkind[OP(first)] == EXACT) {
4852             if (OP(first) == EXACT)
4853                 NOOP;   /* Empty, get anchored substr later. */
4854             else
4855                 ri->regstclass = first;
4856         }
4857 #ifdef TRIE_STCLASS
4858         else if (PL_regkind[OP(first)] == TRIE &&
4859                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4860         {
4861             regnode *trie_op;
4862             /* this can happen only on restudy */
4863             if ( OP(first) == TRIE ) {
4864                 struct regnode_1 *trieop = (struct regnode_1 *)
4865                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4866                 StructCopy(first,trieop,struct regnode_1);
4867                 trie_op=(regnode *)trieop;
4868             } else {
4869                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4870                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4871                 StructCopy(first,trieop,struct regnode_charclass);
4872                 trie_op=(regnode *)trieop;
4873             }
4874             OP(trie_op)+=2;
4875             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4876             ri->regstclass = trie_op;
4877         }
4878 #endif
4879         else if (REGNODE_SIMPLE(OP(first)))
4880             ri->regstclass = first;
4881         else if (PL_regkind[OP(first)] == BOUND ||
4882                  PL_regkind[OP(first)] == NBOUND)
4883             ri->regstclass = first;
4884         else if (PL_regkind[OP(first)] == BOL) {
4885             r->extflags |= (OP(first) == MBOL
4886                            ? RXf_ANCH_MBOL
4887                            : (OP(first) == SBOL
4888                               ? RXf_ANCH_SBOL
4889                               : RXf_ANCH_BOL));
4890             first = NEXTOPER(first);
4891             goto again;
4892         }
4893         else if (OP(first) == GPOS) {
4894             r->extflags |= RXf_ANCH_GPOS;
4895             first = NEXTOPER(first);
4896             goto again;
4897         }
4898         else if ((!sawopen || !RExC_sawback) &&
4899             (OP(first) == STAR &&
4900             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4901             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4902         {
4903             /* turn .* into ^.* with an implied $*=1 */
4904             const int type =
4905                 (OP(NEXTOPER(first)) == REG_ANY)
4906                     ? RXf_ANCH_MBOL
4907                     : RXf_ANCH_SBOL;
4908             r->extflags |= type;
4909             r->intflags |= PREGf_IMPLICIT;
4910             first = NEXTOPER(first);
4911             goto again;
4912         }
4913         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4914             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4915             /* x+ must match at the 1st pos of run of x's */
4916             r->intflags |= PREGf_SKIP;
4917
4918         /* Scan is after the zeroth branch, first is atomic matcher. */
4919 #ifdef TRIE_STUDY_OPT
4920         DEBUG_PARSE_r(
4921             if (!restudied)
4922                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4923                               (IV)(first - scan + 1))
4924         );
4925 #else
4926         DEBUG_PARSE_r(
4927             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4928                 (IV)(first - scan + 1))
4929         );
4930 #endif
4931
4932
4933         /*
4934         * If there's something expensive in the r.e., find the
4935         * longest literal string that must appear and make it the
4936         * regmust.  Resolve ties in favor of later strings, since
4937         * the regstart check works with the beginning of the r.e.
4938         * and avoiding duplication strengthens checking.  Not a
4939         * strong reason, but sufficient in the absence of others.
4940         * [Now we resolve ties in favor of the earlier string if
4941         * it happens that c_offset_min has been invalidated, since the
4942         * earlier string may buy us something the later one won't.]
4943         */
4944
4945         data.longest_fixed = newSVpvs("");
4946         data.longest_float = newSVpvs("");
4947         data.last_found = newSVpvs("");
4948         data.longest = &(data.longest_fixed);
4949         first = scan;
4950         if (!ri->regstclass) {
4951             cl_init(&ch_class);
4952             data.start_class = &ch_class;
4953             stclass_flag = SCF_DO_STCLASS_AND;
4954         } else                          /* XXXX Check for BOUND? */
4955             stclass_flag = 0;
4956         data.last_closep = &last_close;
4957
4958         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4959             &data, -1, NULL, NULL,
4960             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4961
4962
4963         CHECK_RESTUDY_GOTO;
4964
4965
4966         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4967              && data.last_start_min == 0 && data.last_end > 0
4968              && !RExC_seen_zerolen
4969              && !(RExC_seen & REG_SEEN_VERBARG)
4970              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4971             r->extflags |= RXf_CHECK_ALL;
4972         scan_commit(pRExC_state, &data,&minlen,0);
4973         SvREFCNT_dec(data.last_found);
4974
4975         /* Note that code very similar to this but for anchored string
4976            follows immediately below, changes may need to be made to both.
4977            Be careful.
4978          */
4979         longest_float_length = CHR_SVLEN(data.longest_float);
4980         if (longest_float_length
4981             || (data.flags & SF_FL_BEFORE_EOL
4982                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4983                     || (RExC_flags & RXf_PMf_MULTILINE))))
4984         {
4985             I32 t,ml;
4986
4987             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4988                 && data.offset_fixed == data.offset_float_min
4989                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4990                     goto remove_float;          /* As in (a)+. */
4991
4992             /* copy the information about the longest float from the reg_scan_data
4993                over to the program. */
4994             if (SvUTF8(data.longest_float)) {
4995                 r->float_utf8 = data.longest_float;
4996                 r->float_substr = NULL;
4997             } else {
4998                 r->float_substr = data.longest_float;
4999                 r->float_utf8 = NULL;
5000             }
5001             /* float_end_shift is how many chars that must be matched that
5002                follow this item. We calculate it ahead of time as once the
5003                lookbehind offset is added in we lose the ability to correctly
5004                calculate it.*/
5005             ml = data.minlen_float ? *(data.minlen_float)
5006                                    : (I32)longest_float_length;
5007             r->float_end_shift = ml - data.offset_float_min
5008                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5009                 + data.lookbehind_float;
5010             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5011             r->float_max_offset = data.offset_float_max;
5012             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5013                 r->float_max_offset -= data.lookbehind_float;
5014
5015             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5016                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5017                            || (RExC_flags & RXf_PMf_MULTILINE)));
5018             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5019         }
5020         else {
5021           remove_float:
5022             r->float_substr = r->float_utf8 = NULL;
5023             SvREFCNT_dec(data.longest_float);
5024             longest_float_length = 0;
5025         }
5026
5027         /* Note that code very similar to this but for floating string
5028            is immediately above, changes may need to be made to both.
5029            Be careful.
5030          */
5031         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5032         if (longest_fixed_length
5033             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5034                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5035                     || (RExC_flags & RXf_PMf_MULTILINE))))
5036         {
5037             I32 t,ml;
5038
5039             /* copy the information about the longest fixed
5040                from the reg_scan_data over to the program. */
5041             if (SvUTF8(data.longest_fixed)) {
5042                 r->anchored_utf8 = data.longest_fixed;
5043                 r->anchored_substr = NULL;
5044             } else {
5045                 r->anchored_substr = data.longest_fixed;
5046                 r->anchored_utf8 = NULL;
5047             }
5048             /* fixed_end_shift is how many chars that must be matched that
5049                follow this item. We calculate it ahead of time as once the
5050                lookbehind offset is added in we lose the ability to correctly
5051                calculate it.*/
5052             ml = data.minlen_fixed ? *(data.minlen_fixed)
5053                                    : (I32)longest_fixed_length;
5054             r->anchored_end_shift = ml - data.offset_fixed
5055                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5056                 + data.lookbehind_fixed;
5057             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5058
5059             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5060                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5061                      || (RExC_flags & RXf_PMf_MULTILINE)));
5062             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5063         }
5064         else {
5065             r->anchored_substr = r->anchored_utf8 = NULL;
5066             SvREFCNT_dec(data.longest_fixed);
5067             longest_fixed_length = 0;
5068         }
5069         if (ri->regstclass
5070             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5071             ri->regstclass = NULL;
5072
5073         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5074             && stclass_flag
5075             && !(data.start_class->flags & ANYOF_EOS)
5076             && !cl_is_anything(data.start_class))
5077         {
5078             const U32 n = add_data(pRExC_state, 1, "f");
5079             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5080
5081             Newx(RExC_rxi->data->data[n], 1,
5082                 struct regnode_charclass_class);
5083             StructCopy(data.start_class,
5084                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5085                        struct regnode_charclass_class);
5086             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5087             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5088             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5089                       regprop(r, sv, (regnode*)data.start_class);
5090                       PerlIO_printf(Perl_debug_log,
5091                                     "synthetic stclass \"%s\".\n",
5092                                     SvPVX_const(sv));});
5093         }
5094
5095         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5096         if (longest_fixed_length > longest_float_length) {
5097             r->check_end_shift = r->anchored_end_shift;
5098             r->check_substr = r->anchored_substr;
5099             r->check_utf8 = r->anchored_utf8;
5100             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5101             if (r->extflags & RXf_ANCH_SINGLE)
5102                 r->extflags |= RXf_NOSCAN;
5103         }
5104         else {
5105             r->check_end_shift = r->float_end_shift;
5106             r->check_substr = r->float_substr;
5107             r->check_utf8 = r->float_utf8;
5108             r->check_offset_min = r->float_min_offset;
5109             r->check_offset_max = r->float_max_offset;
5110         }
5111         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5112            This should be changed ASAP!  */
5113         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5114             r->extflags |= RXf_USE_INTUIT;
5115             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5116                 r->extflags |= RXf_INTUIT_TAIL;
5117         }
5118         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5119         if ( (STRLEN)minlen < longest_float_length )
5120             minlen= longest_float_length;
5121         if ( (STRLEN)minlen < longest_fixed_length )
5122             minlen= longest_fixed_length;
5123         */
5124     }
5125     else {
5126         /* Several toplevels. Best we can is to set minlen. */
5127         I32 fake;
5128         struct regnode_charclass_class ch_class;
5129         I32 last_close = 0;
5130
5131         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5132
5133         scan = ri->program + 1;
5134         cl_init(&ch_class);
5135         data.start_class = &ch_class;
5136         data.last_closep = &last_close;
5137
5138
5139         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5140             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5141
5142         CHECK_RESTUDY_GOTO;
5143
5144         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5145                 = r->float_substr = r->float_utf8 = NULL;
5146
5147         if (!(data.start_class->flags & ANYOF_EOS)
5148             && !cl_is_anything(data.start_class))
5149         {
5150             const U32 n = add_data(pRExC_state, 1, "f");
5151             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5152
5153             Newx(RExC_rxi->data->data[n], 1,
5154                 struct regnode_charclass_class);
5155             StructCopy(data.start_class,
5156                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5157                        struct regnode_charclass_class);
5158             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5159             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5160             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5161                       regprop(r, sv, (regnode*)data.start_class);
5162                       PerlIO_printf(Perl_debug_log,
5163                                     "synthetic stclass \"%s\".\n",
5164                                     SvPVX_const(sv));});
5165         }
5166     }
5167
5168     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5169        the "real" pattern. */
5170     DEBUG_OPTIMISE_r({
5171         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5172                       (IV)minlen, (IV)r->minlen);
5173     });
5174     r->minlenret = minlen;
5175     if (r->minlen < minlen)
5176         r->minlen = minlen;
5177
5178     if (RExC_seen & REG_SEEN_GPOS)
5179         r->extflags |= RXf_GPOS_SEEN;
5180     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5181         r->extflags |= RXf_LOOKBEHIND_SEEN;
5182     if (RExC_seen & REG_SEEN_EVAL)
5183         r->extflags |= RXf_EVAL_SEEN;
5184     if (RExC_seen & REG_SEEN_CANY)
5185         r->extflags |= RXf_CANY_SEEN;
5186     if (RExC_seen & REG_SEEN_VERBARG)
5187         r->intflags |= PREGf_VERBARG_SEEN;
5188     if (RExC_seen & REG_SEEN_CUTGROUP)
5189         r->intflags |= PREGf_CUTGROUP_SEEN;
5190     if (RExC_paren_names)
5191         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5192     else
5193         RXp_PAREN_NAMES(r) = NULL;
5194
5195 #ifdef STUPID_PATTERN_CHECKS
5196     if (RX_PRELEN(rx) == 0)
5197         r->extflags |= RXf_NULL;
5198     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5199         /* XXX: this should happen BEFORE we compile */
5200         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5201     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5202         r->extflags |= RXf_WHITE;
5203     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5204         r->extflags |= RXf_START_ONLY;
5205 #else
5206     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5207             /* XXX: this should happen BEFORE we compile */
5208             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5209     else {
5210         regnode *first = ri->program + 1;
5211         U8 fop = OP(first);
5212
5213         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5214             r->extflags |= RXf_NULL;
5215         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5216             r->extflags |= RXf_START_ONLY;
5217         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5218                              && OP(regnext(first)) == END)
5219             r->extflags |= RXf_WHITE;
5220     }
5221 #endif
5222 #ifdef DEBUGGING
5223     if (RExC_paren_names) {
5224         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5225         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5226     } else
5227 #endif
5228         ri->name_list_idx = 0;
5229
5230     if (RExC_recurse_count) {
5231         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5232             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5233             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5234         }
5235     }
5236     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5237     /* assume we don't need to swap parens around before we match */
5238
5239     DEBUG_DUMP_r({
5240         PerlIO_printf(Perl_debug_log,"Final program:\n");
5241         regdump(r);
5242     });
5243 #ifdef RE_TRACK_PATTERN_OFFSETS
5244     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5245         const U32 len = ri->u.offsets[0];
5246         U32 i;
5247         GET_RE_DEBUG_FLAGS_DECL;
5248         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5249         for (i = 1; i <= len; i++) {
5250             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5251                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5252                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5253             }
5254         PerlIO_printf(Perl_debug_log, "\n");
5255     });
5256 #endif
5257     return rx;
5258 }
5259
5260 #undef RE_ENGINE_PTR
5261
5262
5263 SV*
5264 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5265                     const U32 flags)
5266 {
5267     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5268
5269     PERL_UNUSED_ARG(value);
5270
5271     if (flags & RXapif_FETCH) {
5272         return reg_named_buff_fetch(rx, key, flags);
5273     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5274         Perl_croak_no_modify(aTHX);
5275         return NULL;
5276     } else if (flags & RXapif_EXISTS) {
5277         return reg_named_buff_exists(rx, key, flags)
5278             ? &PL_sv_yes
5279             : &PL_sv_no;
5280     } else if (flags & RXapif_REGNAMES) {
5281         return reg_named_buff_all(rx, flags);
5282     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5283         return reg_named_buff_scalar(rx, flags);
5284     } else {
5285         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5286         return NULL;
5287     }
5288 }
5289
5290 SV*
5291 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5292                          const U32 flags)
5293 {
5294     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5295     PERL_UNUSED_ARG(lastkey);
5296
5297     if (flags & RXapif_FIRSTKEY)
5298         return reg_named_buff_firstkey(rx, flags);
5299     else if (flags & RXapif_NEXTKEY)
5300         return reg_named_buff_nextkey(rx, flags);
5301     else {
5302         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5303         return NULL;
5304     }
5305 }
5306
5307 SV*
5308 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5309                           const U32 flags)
5310 {
5311     AV *retarray = NULL;
5312     SV *ret;
5313     struct regexp *const rx = (struct regexp *)SvANY(r);
5314
5315     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5316
5317     if (flags & RXapif_ALL)
5318         retarray=newAV();
5319
5320     if (rx && RXp_PAREN_NAMES(rx)) {
5321         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5322         if (he_str) {
5323             IV i;
5324             SV* sv_dat=HeVAL(he_str);
5325             I32 *nums=(I32*)SvPVX(sv_dat);
5326             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5327                 if ((I32)(rx->nparens) >= nums[i]
5328                     && rx->offs[nums[i]].start != -1
5329                     && rx->offs[nums[i]].end != -1)
5330                 {
5331                     ret = newSVpvs("");
5332                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5333                     if (!retarray)
5334                         return ret;
5335                 } else {
5336                     ret = newSVsv(&PL_sv_undef);
5337                 }
5338                 if (retarray)
5339                     av_push(retarray, ret);
5340             }
5341             if (retarray)
5342                 return newRV_noinc(MUTABLE_SV(retarray));
5343         }
5344     }
5345     return NULL;
5346 }
5347
5348 bool
5349 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5350                            const U32 flags)
5351 {
5352     struct regexp *const rx = (struct regexp *)SvANY(r);
5353
5354     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5355
5356     if (rx && RXp_PAREN_NAMES(rx)) {
5357         if (flags & RXapif_ALL) {
5358             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5359         } else {
5360             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5361             if (sv) {
5362                 SvREFCNT_dec(sv);
5363                 return TRUE;
5364             } else {
5365                 return FALSE;
5366             }
5367         }
5368     } else {
5369         return FALSE;
5370     }
5371 }
5372
5373 SV*
5374 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5375 {
5376     struct regexp *const rx = (struct regexp *)SvANY(r);
5377
5378     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5379
5380     if ( rx && RXp_PAREN_NAMES(rx) ) {
5381         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5382
5383         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5384     } else {
5385         return FALSE;
5386     }
5387 }
5388
5389 SV*
5390 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5391 {
5392     struct regexp *const rx = (struct regexp *)SvANY(r);
5393     GET_RE_DEBUG_FLAGS_DECL;
5394
5395     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5396
5397     if (rx && RXp_PAREN_NAMES(rx)) {
5398         HV *hv = RXp_PAREN_NAMES(rx);
5399         HE *temphe;
5400         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5401             IV i;
5402             IV parno = 0;
5403             SV* sv_dat = HeVAL(temphe);
5404             I32 *nums = (I32*)SvPVX(sv_dat);
5405             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5406                 if ((I32)(rx->lastparen) >= nums[i] &&
5407                     rx->offs[nums[i]].start != -1 &&
5408                     rx->offs[nums[i]].end != -1)
5409                 {
5410                     parno = nums[i];
5411                     break;
5412                 }
5413             }
5414             if (parno || flags & RXapif_ALL) {
5415                 return newSVhek(HeKEY_hek(temphe));
5416             }
5417         }
5418     }
5419     return NULL;
5420 }
5421
5422 SV*
5423 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5424 {
5425     SV *ret;
5426     AV *av;
5427     I32 length;
5428     struct regexp *const rx = (struct regexp *)SvANY(r);
5429
5430     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5431
5432     if (rx && RXp_PAREN_NAMES(rx)) {
5433         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5434             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5435         } else if (flags & RXapif_ONE) {
5436             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5437             av = MUTABLE_AV(SvRV(ret));
5438             length = av_len(av);
5439             SvREFCNT_dec(ret);
5440             return newSViv(length + 1);
5441         } else {
5442             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5443             return NULL;
5444         }
5445     }
5446     return &PL_sv_undef;
5447 }
5448
5449 SV*
5450 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5451 {
5452     struct regexp *const rx = (struct regexp *)SvANY(r);
5453     AV *av = newAV();
5454
5455     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5456
5457     if (rx && RXp_PAREN_NAMES(rx)) {
5458         HV *hv= RXp_PAREN_NAMES(rx);
5459         HE *temphe;
5460         (void)hv_iterinit(hv);
5461         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5462             IV i;
5463             IV parno = 0;
5464             SV* sv_dat = HeVAL(temphe);
5465             I32 *nums = (I32*)SvPVX(sv_dat);
5466             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5467                 if ((I32)(rx->lastparen) >= nums[i] &&
5468                     rx->offs[nums[i]].start != -1 &&
5469                     rx->offs[nums[i]].end != -1)
5470                 {
5471                     parno = nums[i];
5472                     break;
5473                 }
5474             }
5475             if (parno || flags & RXapif_ALL) {
5476                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5477             }
5478         }
5479     }
5480
5481     return newRV_noinc(MUTABLE_SV(av));
5482 }
5483
5484 void
5485 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5486                              SV * const sv)
5487 {
5488     struct regexp *const rx = (struct regexp *)SvANY(r);
5489     char *s = NULL;
5490     I32 i = 0;
5491     I32 s1, t1;
5492
5493     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5494
5495     if (!rx->subbeg) {
5496         sv_setsv(sv,&PL_sv_undef);
5497         return;
5498     }
5499     else
5500     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5501         /* $` */
5502         i = rx->offs[0].start;
5503         s = rx->subbeg;
5504     }
5505     else
5506     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5507         /* $' */
5508         s = rx->subbeg + rx->offs[0].end;
5509         i = rx->sublen - rx->offs[0].end;
5510     }
5511     else
5512     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5513         (s1 = rx->offs[paren].start) != -1 &&
5514         (t1 = rx->offs[paren].end) != -1)
5515     {
5516         /* $& $1 ... */
5517         i = t1 - s1;
5518         s = rx->subbeg + s1;
5519     } else {
5520         sv_setsv(sv,&PL_sv_undef);
5521         return;
5522     }
5523     assert(rx->sublen >= (s - rx->subbeg) + i );
5524     if (i >= 0) {
5525         const int oldtainted = PL_tainted;
5526         TAINT_NOT;
5527         sv_setpvn(sv, s, i);
5528         PL_tainted = oldtainted;
5529         if ( (rx->extflags & RXf_CANY_SEEN)
5530             ? (RXp_MATCH_UTF8(rx)
5531                         && (!i || is_utf8_string((U8*)s, i)))
5532             : (RXp_MATCH_UTF8(rx)) )
5533         {
5534             SvUTF8_on(sv);
5535         }
5536         else
5537             SvUTF8_off(sv);
5538         if (PL_tainting) {
5539             if (RXp_MATCH_TAINTED(rx)) {
5540                 if (SvTYPE(sv) >= SVt_PVMG) {
5541                     MAGIC* const mg = SvMAGIC(sv);
5542                     MAGIC* mgt;
5543                     PL_tainted = 1;
5544                     SvMAGIC_set(sv, mg->mg_moremagic);
5545                     SvTAINT(sv);
5546                     if ((mgt = SvMAGIC(sv))) {
5547                         mg->mg_moremagic = mgt;
5548                         SvMAGIC_set(sv, mg);
5549                     }
5550                 } else {
5551                     PL_tainted = 1;
5552                     SvTAINT(sv);
5553                 }
5554             } else
5555                 SvTAINTED_off(sv);
5556         }
5557     } else {
5558         sv_setsv(sv,&PL_sv_undef);
5559         return;
5560     }
5561 }
5562
5563 void
5564 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5565                                                          SV const * const value)
5566 {
5567     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5568
5569     PERL_UNUSED_ARG(rx);
5570     PERL_UNUSED_ARG(paren);
5571     PERL_UNUSED_ARG(value);
5572
5573     if (!PL_localizing)
5574         Perl_croak_no_modify(aTHX);
5575 }
5576
5577 I32
5578 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5579                               const I32 paren)
5580 {
5581     struct regexp *const rx = (struct regexp *)SvANY(r);
5582     I32 i;
5583     I32 s1, t1;
5584
5585     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5586
5587     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5588         switch (paren) {
5589       /* $` / ${^PREMATCH} */
5590       case RX_BUFF_IDX_PREMATCH:
5591         if (rx->offs[0].start != -1) {
5592                         i = rx->offs[0].start;
5593                         if (i > 0) {
5594                                 s1 = 0;
5595                                 t1 = i;
5596                                 goto getlen;
5597                         }
5598             }
5599         return 0;
5600       /* $' / ${^POSTMATCH} */
5601       case RX_BUFF_IDX_POSTMATCH:
5602             if (rx->offs[0].end != -1) {
5603                         i = rx->sublen - rx->offs[0].end;
5604                         if (i > 0) {
5605                                 s1 = rx->offs[0].end;
5606                                 t1 = rx->sublen;
5607                                 goto getlen;
5608                         }
5609             }
5610         return 0;
5611       /* $& / ${^MATCH}, $1, $2, ... */
5612       default:
5613             if (paren <= (I32)rx->nparens &&
5614             (s1 = rx->offs[paren].start) != -1 &&
5615             (t1 = rx->offs[paren].end) != -1)
5616             {
5617             i = t1 - s1;
5618             goto getlen;
5619         } else {
5620             if (ckWARN(WARN_UNINITIALIZED))
5621                 report_uninit((const SV *)sv);
5622             return 0;
5623         }
5624     }
5625   getlen:
5626     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5627         const char * const s = rx->subbeg + s1;
5628         const U8 *ep;
5629         STRLEN el;
5630
5631         i = t1 - s1;
5632         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5633                         i = el;
5634     }
5635     return i;
5636 }
5637
5638 SV*
5639 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5640 {
5641     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5642         PERL_UNUSED_ARG(rx);
5643         if (0)
5644             return NULL;
5645         else
5646             return newSVpvs("Regexp");
5647 }
5648
5649 /* Scans the name of a named buffer from the pattern.
5650  * If flags is REG_RSN_RETURN_NULL returns null.
5651  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5652  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5653  * to the parsed name as looked up in the RExC_paren_names hash.
5654  * If there is an error throws a vFAIL().. type exception.
5655  */
5656
5657 #define REG_RSN_RETURN_NULL    0
5658 #define REG_RSN_RETURN_NAME    1
5659 #define REG_RSN_RETURN_DATA    2
5660
5661 STATIC SV*
5662 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5663 {
5664     char *name_start = RExC_parse;
5665
5666     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5667
5668     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5669          /* skip IDFIRST by using do...while */
5670         if (UTF)
5671             do {
5672                 RExC_parse += UTF8SKIP(RExC_parse);
5673             } while (isALNUM_utf8((U8*)RExC_parse));
5674         else
5675             do {
5676                 RExC_parse++;
5677             } while (isALNUM(*RExC_parse));
5678     }
5679
5680     if ( flags ) {
5681         SV* sv_name
5682             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5683                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5684         if ( flags == REG_RSN_RETURN_NAME)
5685             return sv_name;
5686         else if (flags==REG_RSN_RETURN_DATA) {
5687             HE *he_str = NULL;
5688             SV *sv_dat = NULL;
5689             if ( ! sv_name )      /* should not happen*/
5690                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5691             if (RExC_paren_names)
5692                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5693             if ( he_str )
5694                 sv_dat = HeVAL(he_str);
5695             if ( ! sv_dat )
5696                 vFAIL("Reference to nonexistent named group");
5697             return sv_dat;
5698         }
5699         else {
5700             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5701         }
5702         /* NOT REACHED */
5703     }
5704     return NULL;
5705 }
5706
5707 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5708     int rem=(int)(RExC_end - RExC_parse);                       \
5709     int cut;                                                    \
5710     int num;                                                    \
5711     int iscut=0;                                                \
5712     if (rem>10) {                                               \
5713         rem=10;                                                 \
5714         iscut=1;                                                \
5715     }                                                           \
5716     cut=10-rem;                                                 \
5717     if (RExC_lastparse!=RExC_parse)                             \
5718         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5719             rem, RExC_parse,                                    \
5720             cut + 4,                                            \
5721             iscut ? "..." : "<"                                 \
5722         );                                                      \
5723     else                                                        \
5724         PerlIO_printf(Perl_debug_log,"%16s","");                \
5725                                                                 \
5726     if (SIZE_ONLY)                                              \
5727        num = RExC_size + 1;                                     \
5728     else                                                        \
5729        num=REG_NODE_NUM(RExC_emit);                             \
5730     if (RExC_lastnum!=num)                                      \
5731        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5732     else                                                        \
5733        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5734     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5735         (int)((depth*2)), "",                                   \
5736         (funcname)                                              \
5737     );                                                          \
5738     RExC_lastnum=num;                                           \
5739     RExC_lastparse=RExC_parse;                                  \
5740 })
5741
5742
5743
5744 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5745     DEBUG_PARSE_MSG((funcname));                            \
5746     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5747 })
5748 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5749     DEBUG_PARSE_MSG((funcname));                            \
5750     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5751 })
5752
5753 /* This section of code defines the inversion list object and its methods.  The
5754  * interfaces are highly subject to change, so as much as possible is static to
5755  * this file.  An inversion list is here implemented as a malloc'd C array with
5756  * some added info.  More will be coming when functionality is added later.
5757  *
5758  * Some of the methods should always be private to the implementation, and some
5759  * should eventually be made public */
5760
5761 #define INVLIST_INITIAL_LEN 10
5762 #define INVLIST_ARRAY_KEY "array"
5763 #define INVLIST_MAX_KEY "max"
5764 #define INVLIST_LEN_KEY "len"
5765
5766 PERL_STATIC_INLINE UV*
5767 S_invlist_array(pTHX_ HV* const invlist)
5768 {
5769     /* Returns the pointer to the inversion list's array.  Every time the
5770      * length changes, this needs to be called in case malloc or realloc moved
5771      * it */
5772
5773     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5774
5775     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5776
5777     if (list_ptr == NULL) {
5778         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5779                                                             INVLIST_ARRAY_KEY);
5780     }
5781
5782     return INT2PTR(UV *, SvUV(*list_ptr));
5783 }
5784
5785 PERL_STATIC_INLINE void
5786 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5787 {
5788     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5789
5790     /* Sets the array stored in the inversion list to the memory beginning with
5791      * the parameter */
5792
5793     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5794         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5795                                                             INVLIST_ARRAY_KEY);
5796     }
5797 }
5798
5799 PERL_STATIC_INLINE UV
5800 S_invlist_len(pTHX_ HV* const invlist)
5801 {
5802     /* Returns the current number of elements in the inversion list's array */
5803
5804     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5805
5806     PERL_ARGS_ASSERT_INVLIST_LEN;
5807
5808     if (len_ptr == NULL) {
5809         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5810                                                             INVLIST_LEN_KEY);
5811     }
5812
5813     return SvUV(*len_ptr);
5814 }
5815
5816 PERL_STATIC_INLINE UV
5817 S_invlist_max(pTHX_ HV* const invlist)
5818 {
5819     /* Returns the maximum number of elements storable in the inversion list's
5820      * array, without having to realloc() */
5821
5822     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5823
5824     PERL_ARGS_ASSERT_INVLIST_MAX;
5825
5826     if (max_ptr == NULL) {
5827         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5828                                                             INVLIST_MAX_KEY);
5829     }
5830
5831     return SvUV(*max_ptr);
5832 }
5833
5834 PERL_STATIC_INLINE void
5835 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5836 {
5837     /* Sets the current number of elements stored in the inversion list */
5838
5839     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5840
5841     if (len != 0 && len > invlist_max(invlist)) {
5842         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5843     }
5844
5845     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5846         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5847                                                             INVLIST_LEN_KEY);
5848     }
5849 }
5850
5851 PERL_STATIC_INLINE void
5852 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5853 {
5854
5855     /* Sets the maximum number of elements storable in the inversion list
5856      * without having to realloc() */
5857
5858     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5859
5860     if (max < invlist_len(invlist)) {
5861         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5862     }
5863
5864     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5865         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5866                                                             INVLIST_LEN_KEY);
5867     }
5868 }
5869
5870 #ifndef PERL_IN_XSUB_RE
5871 HV*
5872 Perl__new_invlist(pTHX_ IV initial_size)
5873 {
5874
5875     /* Return a pointer to a newly constructed inversion list, with enough
5876      * space to store 'initial_size' elements.  If that number is negative, a
5877      * system default is used instead */
5878
5879     HV* invlist = newHV();
5880     UV* list;
5881
5882     if (initial_size < 0) {
5883         initial_size = INVLIST_INITIAL_LEN;
5884     }
5885
5886     /* Allocate the initial space */
5887     Newx(list, initial_size, UV);
5888     invlist_set_array(invlist, list);
5889
5890     /* set_len has to come before set_max, as the latter inspects the len */
5891     invlist_set_len(invlist, 0);
5892     invlist_set_max(invlist, initial_size);
5893
5894     return invlist;
5895 }
5896 #endif
5897
5898 PERL_STATIC_INLINE void
5899 S_invlist_destroy(pTHX_ HV* const invlist)
5900 {
5901    /* Inversion list destructor */
5902
5903     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5904
5905     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5906
5907     if (list_ptr != NULL) {
5908         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5909         Safefree(list);
5910     }
5911 }
5912
5913 STATIC void
5914 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5915 {
5916     /* Change the maximum size of an inversion list (up or down) */
5917
5918     UV* orig_array;
5919     UV* array;
5920     const UV old_max = invlist_max(invlist);
5921
5922     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5923
5924     if (old_max == new_max) {   /* If a no-op */
5925         return;
5926     }
5927
5928     array = orig_array = invlist_array(invlist);
5929     Renew(array, new_max, UV);
5930
5931     /* If the size change moved the list in memory, set the new one */
5932     if (array != orig_array) {
5933         invlist_set_array(invlist, array);
5934     }
5935
5936     invlist_set_max(invlist, new_max);
5937
5938 }
5939
5940 PERL_STATIC_INLINE void
5941 S_invlist_trim(pTHX_ HV* const invlist)
5942 {
5943     PERL_ARGS_ASSERT_INVLIST_TRIM;
5944
5945     /* Change the length of the inversion list to how many entries it currently
5946      * has */
5947
5948     invlist_extend(invlist, invlist_len(invlist));
5949 }
5950
5951 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5952  * etc */
5953
5954 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5955
5956 #ifndef PERL_IN_XSUB_RE
5957 void
5958 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5959 {
5960    /* Subject to change or removal.  Append the range from 'start' to 'end' at
5961     * the end of the inversion list.  The range must be above any existing
5962     * ones. */
5963
5964     UV* array = invlist_array(invlist);
5965     UV max = invlist_max(invlist);
5966     UV len = invlist_len(invlist);
5967
5968     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5969
5970     if (len > 0) {
5971
5972         /* Here, the existing list is non-empty. The current max entry in the
5973          * list is generally the first value not in the set, except when the
5974          * set extends to the end of permissible values, in which case it is
5975          * the first entry in that final set, and so this call is an attempt to
5976          * append out-of-order */
5977
5978         UV final_element = len - 1;
5979         if (array[final_element] > start
5980             || ELEMENT_IN_INVLIST_SET(final_element))
5981         {
5982             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5983         }
5984
5985         /* Here, it is a legal append.  If the new range begins with the first
5986          * value not in the set, it is extending the set, so the new first
5987          * value not in the set is one greater than the newly extended range.
5988          * */
5989         if (array[final_element] == start) {
5990             if (end != UV_MAX) {
5991                 array[final_element] = end + 1;
5992             }
5993             else {
5994                 /* But if the end is the maximum representable on the machine,
5995                  * just let the range that this would extend have no end */
5996                 invlist_set_len(invlist, len - 1);
5997             }
5998             return;
5999         }
6000     }
6001
6002     /* Here the new range doesn't extend any existing set.  Add it */
6003
6004     len += 2;   /* Includes an element each for the start and end of range */
6005
6006     /* If overflows the existing space, extend, which may cause the array to be
6007      * moved */
6008     if (max < len) {
6009         invlist_extend(invlist, len);
6010         array = invlist_array(invlist);
6011     }
6012
6013     invlist_set_len(invlist, len);
6014
6015     /* The next item on the list starts the range, the one after that is
6016      * one past the new range.  */
6017     array[len - 2] = start;
6018     if (end != UV_MAX) {
6019         array[len - 1] = end + 1;
6020     }
6021     else {
6022         /* But if the end is the maximum representable on the machine, just let
6023          * the range have no end */
6024         invlist_set_len(invlist, len - 1);
6025     }
6026 }
6027 #endif
6028
6029 STATIC HV*
6030 S_invlist_union(pTHX_ HV* const a, HV* const b)
6031 {
6032     /* Return a new inversion list which is the union of two inversion lists.
6033      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6034      * Richard Gillam, published by Addison-Wesley, and explained at some
6035      * length there.  The preface says to incorporate its examples into your
6036      * code at your own risk.
6037      *
6038      * The algorithm is like a merge sort.
6039      *
6040      * XXX A potential performance improvement is to keep track as we go along
6041      * if only one of the inputs contributes to the result, meaning the other
6042      * is a subset of that one.  In that case, we can skip the final copy and
6043      * return the larger of the input lists */
6044
6045     UV* array_a = invlist_array(a);   /* a's array */
6046     UV* array_b = invlist_array(b);
6047     UV len_a = invlist_len(a);  /* length of a's array */
6048     UV len_b = invlist_len(b);
6049
6050     HV* u;                      /* the resulting union */
6051     UV* array_u;
6052     UV len_u;
6053
6054     UV i_a = 0;             /* current index into a's array */
6055     UV i_b = 0;
6056     UV i_u = 0;
6057
6058     /* running count, as explained in the algorithm source book; items are
6059      * stopped accumulating and are output when the count changes to/from 0.
6060      * The count is incremented when we start a range that's in the set, and
6061      * decremented when we start a range that's not in the set.  So its range
6062      * is 0 to 2.  Only when the count is zero is something not in the set.
6063      */
6064     UV count = 0;
6065
6066     PERL_ARGS_ASSERT_INVLIST_UNION;
6067
6068     /* Size the union for the worst case: that the sets are completely
6069      * disjoint */
6070     u = _new_invlist(len_a + len_b);
6071     array_u = invlist_array(u);
6072
6073     /* Go through each list item by item, stopping when exhausted one of
6074      * them */
6075     while (i_a < len_a && i_b < len_b) {
6076         UV cp;      /* The element to potentially add to the union's array */
6077         bool cp_in_set;   /* is it in the the input list's set or not */
6078
6079         /* We need to take one or the other of the two inputs for the union.
6080          * Since we are merging two sorted lists, we take the smaller of the
6081          * next items.  In case of a tie, we take the one that is in its set
6082          * first.  If we took one not in the set first, it would decrement the
6083          * count, possibly to 0 which would cause it to be output as ending the
6084          * range, and the next time through we would take the same number, and
6085          * output it again as beginning the next range.  By doing it the
6086          * opposite way, there is no possibility that the count will be
6087          * momentarily decremented to 0, and thus the two adjoining ranges will
6088          * be seamlessly merged.  (In a tie and both are in the set or both not
6089          * in the set, it doesn't matter which we take first.) */
6090         if (array_a[i_a] < array_b[i_b]
6091             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6092         {
6093             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6094             cp= array_a[i_a++];
6095         }
6096         else {
6097             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6098             cp= array_b[i_b++];
6099         }
6100
6101         /* Here, have chosen which of the two inputs to look at.  Only output
6102          * if the running count changes to/from 0, which marks the
6103          * beginning/end of a range in that's in the set */
6104         if (cp_in_set) {
6105             if (count == 0) {
6106                 array_u[i_u++] = cp;
6107             }
6108             count++;
6109         }
6110         else {
6111             count--;
6112             if (count == 0) {
6113                 array_u[i_u++] = cp;
6114             }
6115         }
6116     }
6117
6118     /* Here, we are finished going through at least one of the lists, which
6119      * means there is something remaining in at most one.  We check if the list
6120      * that hasn't been exhausted is positioned such that we are in the middle
6121      * of a range in its set or not.  (We are in the set if the next item in
6122      * the array marks the beginning of something not in the set)   If in the
6123      * set, we decrement 'count'; if 0, there is potentially more to output.
6124      * There are four cases:
6125      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6126      *     in the union is entirely from the non-exhausted set.
6127      *  2) Both were in their sets, count is 2.  Nothing further should
6128      *     be output, as everything that remains will be in the exhausted
6129      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6130      *     that
6131      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6132      *     Nothing further should be output because the union includes
6133      *     everything from the exhausted set.  Not decrementing insures that.
6134      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6135      *     decrementing to 0 insures that we look at the remainder of the
6136      *     non-exhausted set */
6137     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6138         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6139     {
6140         count--;
6141     }
6142
6143     /* The final length is what we've output so far, plus what else is about to
6144      * be output.  (If 'count' is non-zero, then the input list we exhausted
6145      * has everything remaining up to the machine's limit in its set, and hence
6146      * in the union, so there will be no further output. */
6147     len_u = i_u;
6148     if (count == 0) {
6149         /* At most one of the subexpressions will be non-zero */
6150         len_u += (len_a - i_a) + (len_b - i_b);
6151     }
6152
6153     /* Set result to final length, which can change the pointer to array_u, so
6154      * re-find it */
6155     if (len_u != invlist_len(u)) {
6156         invlist_set_len(u, len_u);
6157         invlist_trim(u);
6158         array_u = invlist_array(u);
6159     }
6160
6161     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6162      * the other) ended with everything above it not in its set.  That means
6163      * that the remaining part of the union is precisely the same as the
6164      * non-exhausted list, so can just copy it unchanged.  (If both list were
6165      * exhausted at the same time, then the operations below will be both 0.)
6166      */
6167     if (count == 0) {
6168         IV copy_count; /* At most one will have a non-zero copy count */
6169         if ((copy_count = len_a - i_a) > 0) {
6170             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6171         }
6172         else if ((copy_count = len_b - i_b) > 0) {
6173             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6174         }
6175     }
6176
6177     return u;
6178 }
6179
6180 STATIC HV*
6181 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6182 {
6183     /* Return the intersection of two inversion lists.  The basis for this
6184      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6185      * by Addison-Wesley, and explained at some length there.  The preface says
6186      * to incorporate its examples into your code at your own risk.
6187      *
6188      * The algorithm is like a merge sort, and is essentially the same as the
6189      * union above
6190      */
6191
6192     UV* array_a = invlist_array(a);   /* a's array */
6193     UV* array_b = invlist_array(b);
6194     UV len_a = invlist_len(a);  /* length of a's array */
6195     UV len_b = invlist_len(b);
6196
6197     HV* r;                   /* the resulting intersection */
6198     UV* array_r;
6199     UV len_r;
6200
6201     UV i_a = 0;             /* current index into a's array */
6202     UV i_b = 0;
6203     UV i_r = 0;
6204
6205     /* running count, as explained in the algorithm source book; items are
6206      * stopped accumulating and are output when the count changes to/from 2.
6207      * The count is incremented when we start a range that's in the set, and
6208      * decremented when we start a range that's not in the set.  So its range
6209      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6210      */
6211     UV count = 0;
6212
6213     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6214
6215     /* Size the intersection for the worst case: that the intersection ends up
6216      * fragmenting everything to be completely disjoint */
6217     r= _new_invlist(len_a + len_b);
6218     array_r = invlist_array(r);
6219
6220     /* Go through each list item by item, stopping when exhausted one of
6221      * them */
6222     while (i_a < len_a && i_b < len_b) {
6223         UV cp;      /* The element to potentially add to the intersection's
6224                        array */
6225         bool cp_in_set; /* Is it in the input list's set or not */
6226
6227         /* We need to take one or the other of the two inputs for the union.
6228          * Since we are merging two sorted lists, we take the smaller of the
6229          * next items.  In case of a tie, we take the one that is not in its
6230          * set first (a difference from the union algorithm).  If we took one
6231          * in the set first, it would increment the count, possibly to 2 which
6232          * would cause it to be output as starting a range in the intersection,
6233          * and the next time through we would take that same number, and output
6234          * it again as ending the set.  By doing it the opposite of this, we
6235          * there is no possibility that the count will be momentarily
6236          * incremented to 2.  (In a tie and both are in the set or both not in
6237          * the set, it doesn't matter which we take first.) */
6238         if (array_a[i_a] < array_b[i_b]
6239             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6240         {
6241             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6242             cp= array_a[i_a++];
6243         }
6244         else {
6245             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6246             cp= array_b[i_b++];
6247         }
6248
6249         /* Here, have chosen which of the two inputs to look at.  Only output
6250          * if the running count changes to/from 2, which marks the
6251          * beginning/end of a range that's in the intersection */
6252         if (cp_in_set) {
6253             count++;
6254             if (count == 2) {
6255                 array_r[i_r++] = cp;
6256             }
6257         }
6258         else {
6259             if (count == 2) {
6260                 array_r[i_r++] = cp;
6261             }
6262             count--;
6263         }
6264     }
6265
6266     /* Here, we are finished going through at least one of the sets, which
6267      * means there is something remaining in at most one.  See the comments in
6268      * the union code */
6269     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6270         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6271     {
6272         count--;
6273     }
6274
6275     /* The final length is what we've output so far plus what else is in the
6276      * intersection.  Only one of the subexpressions below will be non-zero */
6277     len_r = i_r;
6278     if (count == 2) {
6279         len_r += (len_a - i_a) + (len_b - i_b);
6280     }
6281
6282     /* Set result to final length, which can change the pointer to array_r, so
6283      * re-find it */
6284     if (len_r != invlist_len(r)) {
6285         invlist_set_len(r, len_r);
6286         invlist_trim(r);
6287         array_r = invlist_array(r);
6288     }
6289
6290     /* Finish outputting any remaining */
6291     if (count == 2) { /* Only one of will have a non-zero copy count */
6292         IV copy_count;
6293         if ((copy_count = len_a - i_a) > 0) {
6294             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6295         }
6296         else if ((copy_count = len_b - i_b) > 0) {
6297             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6298         }
6299     }
6300
6301     return r;
6302 }
6303
6304 STATIC HV*
6305 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6306 {
6307     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6308      * set.  A pointer to the inversion list is returned.  This may actually be
6309      * a new list, in which case the passed in one has been destroyed.  The
6310      * passed in inversion list can be NULL, in which case a new one is created
6311      * with just the one range in it */
6312
6313     HV* range_invlist;
6314     HV* added_invlist;
6315     UV len;
6316
6317     if (invlist == NULL) {
6318         invlist = _new_invlist(2);
6319         len = 0;
6320     }
6321     else {
6322         len = invlist_len(invlist);
6323     }
6324
6325     /* If comes after the final entry, can just append it to the end */
6326     if (len == 0
6327         || start >= invlist_array(invlist)
6328                                     [invlist_len(invlist) - 1])
6329     {
6330         _append_range_to_invlist(invlist, start, end);
6331         return invlist;
6332     }
6333
6334     /* Here, can't just append things, create and return a new inversion list
6335      * which is the union of this range and the existing inversion list */
6336     range_invlist = _new_invlist(2);
6337     _append_range_to_invlist(range_invlist, start, end);
6338
6339     added_invlist = invlist_union(invlist, range_invlist);
6340
6341     /* The passed in list can be freed, as well as our temporary */
6342     invlist_destroy(range_invlist);
6343     if (invlist != added_invlist) {
6344         invlist_destroy(invlist);
6345     }
6346
6347     return added_invlist;
6348 }
6349
6350 PERL_STATIC_INLINE HV*
6351 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6352     return add_range_to_invlist(invlist, cp, cp);
6353 }
6354
6355 /* End of inversion list object */
6356
6357 /*
6358  - reg - regular expression, i.e. main body or parenthesized thing
6359  *
6360  * Caller must absorb opening parenthesis.
6361  *
6362  * Combining parenthesis handling with the base level of regular expression
6363  * is a trifle forced, but the need to tie the tails of the branches to what
6364  * follows makes it hard to avoid.
6365  */
6366 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6367 #ifdef DEBUGGING
6368 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6369 #else
6370 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6371 #endif
6372
6373 STATIC regnode *
6374 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6375     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6376 {
6377     dVAR;
6378     register regnode *ret;              /* Will be the head of the group. */
6379     register regnode *br;
6380     register regnode *lastbr;
6381     register regnode *ender = NULL;
6382     register I32 parno = 0;
6383     I32 flags;
6384     U32 oregflags = RExC_flags;
6385     bool have_branch = 0;
6386     bool is_open = 0;
6387     I32 freeze_paren = 0;
6388     I32 after_freeze = 0;
6389
6390     /* for (?g), (?gc), and (?o) warnings; warning
6391        about (?c) will warn about (?g) -- japhy    */
6392
6393 #define WASTED_O  0x01
6394 #define WASTED_G  0x02
6395 #define WASTED_C  0x04
6396 #define WASTED_GC (0x02|0x04)
6397     I32 wastedflags = 0x00;
6398
6399     char * parse_start = RExC_parse; /* MJD */
6400     char * const oregcomp_parse = RExC_parse;
6401
6402     GET_RE_DEBUG_FLAGS_DECL;
6403
6404     PERL_ARGS_ASSERT_REG;
6405     DEBUG_PARSE("reg ");
6406
6407     *flagp = 0;                         /* Tentatively. */
6408
6409
6410     /* Make an OPEN node, if parenthesized. */
6411     if (paren) {
6412         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6413             char *start_verb = RExC_parse;
6414             STRLEN verb_len = 0;
6415             char *start_arg = NULL;
6416             unsigned char op = 0;
6417             int argok = 1;
6418             int internal_argval = 0; /* internal_argval is only useful if !argok */
6419             while ( *RExC_parse && *RExC_parse != ')' ) {
6420                 if ( *RExC_parse == ':' ) {
6421                     start_arg = RExC_parse + 1;
6422                     break;
6423                 }
6424                 RExC_parse++;
6425             }
6426             ++start_verb;
6427             verb_len = RExC_parse - start_verb;
6428             if ( start_arg ) {
6429                 RExC_parse++;
6430                 while ( *RExC_parse && *RExC_parse != ')' )
6431                     RExC_parse++;
6432                 if ( *RExC_parse != ')' )
6433                     vFAIL("Unterminated verb pattern argument");
6434                 if ( RExC_parse == start_arg )
6435                     start_arg = NULL;
6436             } else {
6437                 if ( *RExC_parse != ')' )
6438                     vFAIL("Unterminated verb pattern");
6439             }
6440
6441             switch ( *start_verb ) {
6442             case 'A':  /* (*ACCEPT) */
6443                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6444                     op = ACCEPT;
6445                     internal_argval = RExC_nestroot;
6446                 }
6447                 break;
6448             case 'C':  /* (*COMMIT) */
6449                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6450                     op = COMMIT;
6451                 break;
6452             case 'F':  /* (*FAIL) */
6453                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6454                     op = OPFAIL;
6455                     argok = 0;
6456                 }
6457                 break;
6458             case ':':  /* (*:NAME) */
6459             case 'M':  /* (*MARK:NAME) */
6460                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6461                     op = MARKPOINT;
6462                     argok = -1;
6463                 }
6464                 break;
6465             case 'P':  /* (*PRUNE) */
6466                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6467                     op = PRUNE;
6468                 break;
6469             case 'S':   /* (*SKIP) */
6470                 if ( memEQs(start_verb,verb_len,"SKIP") )
6471                     op = SKIP;
6472                 break;
6473             case 'T':  /* (*THEN) */
6474                 /* [19:06] <TimToady> :: is then */
6475                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6476                     op = CUTGROUP;
6477                     RExC_seen |= REG_SEEN_CUTGROUP;
6478                 }
6479                 break;
6480             }
6481             if ( ! op ) {
6482                 RExC_parse++;
6483                 vFAIL3("Unknown verb pattern '%.*s'",
6484                     verb_len, start_verb);
6485             }
6486             if ( argok ) {
6487                 if ( start_arg && internal_argval ) {
6488                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6489                         verb_len, start_verb);
6490                 } else if ( argok < 0 && !start_arg ) {
6491                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6492                         verb_len, start_verb);
6493                 } else {
6494                     ret = reganode(pRExC_state, op, internal_argval);
6495                     if ( ! internal_argval && ! SIZE_ONLY ) {
6496                         if (start_arg) {
6497                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6498                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6499                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6500                             ret->flags = 0;
6501                         } else {
6502                             ret->flags = 1;
6503                         }
6504                     }
6505                 }
6506                 if (!internal_argval)
6507                     RExC_seen |= REG_SEEN_VERBARG;
6508             } else if ( start_arg ) {
6509                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6510                         verb_len, start_verb);
6511             } else {
6512                 ret = reg_node(pRExC_state, op);
6513             }
6514             nextchar(pRExC_state);
6515             return ret;
6516         } else
6517         if (*RExC_parse == '?') { /* (?...) */
6518             bool is_logical = 0;
6519             const char * const seqstart = RExC_parse;
6520             bool has_use_defaults = FALSE;
6521
6522             RExC_parse++;
6523             paren = *RExC_parse++;
6524             ret = NULL;                 /* For look-ahead/behind. */
6525             switch (paren) {
6526
6527             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6528                 paren = *RExC_parse++;
6529                 if ( paren == '<')         /* (?P<...>) named capture */
6530                     goto named_capture;
6531                 else if (paren == '>') {   /* (?P>name) named recursion */
6532                     goto named_recursion;
6533                 }
6534                 else if (paren == '=') {   /* (?P=...)  named backref */
6535                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6536                        you change this make sure you change that */
6537                     char* name_start = RExC_parse;
6538                     U32 num = 0;
6539                     SV *sv_dat = reg_scan_name(pRExC_state,
6540                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6541                     if (RExC_parse == name_start || *RExC_parse != ')')
6542                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6543
6544                     if (!SIZE_ONLY) {
6545                         num = add_data( pRExC_state, 1, "S" );
6546                         RExC_rxi->data->data[num]=(void*)sv_dat;
6547                         SvREFCNT_inc_simple_void(sv_dat);
6548                     }
6549                     RExC_sawback = 1;
6550                     ret = reganode(pRExC_state,
6551                                    ((! FOLD)
6552                                      ? NREF
6553                                      : (MORE_ASCII_RESTRICTED)
6554                                        ? NREFFA
6555                                        : (AT_LEAST_UNI_SEMANTICS)
6556                                          ? NREFFU
6557                                          : (LOC)
6558                                            ? NREFFL
6559                                            : NREFF),
6560                                     num);
6561                     *flagp |= HASWIDTH;
6562
6563                     Set_Node_Offset(ret, parse_start+1);
6564                     Set_Node_Cur_Length(ret); /* MJD */
6565
6566                     nextchar(pRExC_state);
6567                     return ret;
6568                 }
6569                 RExC_parse++;
6570                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6571                 /*NOTREACHED*/
6572             case '<':           /* (?<...) */
6573                 if (*RExC_parse == '!')
6574                     paren = ',';
6575                 else if (*RExC_parse != '=')
6576               named_capture:
6577                 {               /* (?<...>) */
6578                     char *name_start;
6579                     SV *svname;
6580                     paren= '>';
6581             case '\'':          /* (?'...') */
6582                     name_start= RExC_parse;
6583                     svname = reg_scan_name(pRExC_state,
6584                         SIZE_ONLY ?  /* reverse test from the others */
6585                         REG_RSN_RETURN_NAME :
6586                         REG_RSN_RETURN_NULL);
6587                     if (RExC_parse == name_start) {
6588                         RExC_parse++;
6589                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6590                         /*NOTREACHED*/
6591                     }
6592                     if (*RExC_parse != paren)
6593                         vFAIL2("Sequence (?%c... not terminated",
6594                             paren=='>' ? '<' : paren);
6595                     if (SIZE_ONLY) {
6596                         HE *he_str;
6597                         SV *sv_dat = NULL;
6598                         if (!svname) /* shouldn't happen */
6599                             Perl_croak(aTHX_
6600                                 "panic: reg_scan_name returned NULL");
6601                         if (!RExC_paren_names) {
6602                             RExC_paren_names= newHV();
6603                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6604 #ifdef DEBUGGING
6605                             RExC_paren_name_list= newAV();
6606                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6607 #endif
6608                         }
6609                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6610                         if ( he_str )
6611                             sv_dat = HeVAL(he_str);
6612                         if ( ! sv_dat ) {
6613                             /* croak baby croak */
6614                             Perl_croak(aTHX_
6615                                 "panic: paren_name hash element allocation failed");
6616                         } else if ( SvPOK(sv_dat) ) {
6617                             /* (?|...) can mean we have dupes so scan to check
6618                                its already been stored. Maybe a flag indicating
6619                                we are inside such a construct would be useful,
6620                                but the arrays are likely to be quite small, so
6621                                for now we punt -- dmq */
6622                             IV count = SvIV(sv_dat);
6623                             I32 *pv = (I32*)SvPVX(sv_dat);
6624                             IV i;
6625                             for ( i = 0 ; i < count ; i++ ) {
6626                                 if ( pv[i] == RExC_npar ) {
6627                                     count = 0;
6628                                     break;
6629                                 }
6630                             }
6631                             if ( count ) {
6632                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6633                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6634                                 pv[count] = RExC_npar;
6635                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6636                             }
6637                         } else {
6638                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6639                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6640                             SvIOK_on(sv_dat);
6641                             SvIV_set(sv_dat, 1);
6642                         }
6643 #ifdef DEBUGGING
6644                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6645                             SvREFCNT_dec(svname);
6646 #endif
6647
6648                         /*sv_dump(sv_dat);*/
6649                     }
6650                     nextchar(pRExC_state);
6651                     paren = 1;
6652                     goto capturing_parens;
6653                 }
6654                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6655                 RExC_in_lookbehind++;
6656                 RExC_parse++;
6657             case '=':           /* (?=...) */
6658                 RExC_seen_zerolen++;
6659                 break;
6660             case '!':           /* (?!...) */
6661                 RExC_seen_zerolen++;
6662                 if (*RExC_parse == ')') {
6663                     ret=reg_node(pRExC_state, OPFAIL);
6664                     nextchar(pRExC_state);
6665                     return ret;
6666                 }
6667                 break;
6668             case '|':           /* (?|...) */
6669                 /* branch reset, behave like a (?:...) except that
6670                    buffers in alternations share the same numbers */
6671                 paren = ':';
6672                 after_freeze = freeze_paren = RExC_npar;
6673                 break;
6674             case ':':           /* (?:...) */
6675             case '>':           /* (?>...) */
6676                 break;
6677             case '$':           /* (?$...) */
6678             case '@':           /* (?@...) */
6679                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6680                 break;
6681             case '#':           /* (?#...) */
6682                 while (*RExC_parse && *RExC_parse != ')')
6683                     RExC_parse++;
6684                 if (*RExC_parse != ')')
6685                     FAIL("Sequence (?#... not terminated");
6686                 nextchar(pRExC_state);
6687                 *flagp = TRYAGAIN;
6688                 return NULL;
6689             case '0' :           /* (?0) */
6690             case 'R' :           /* (?R) */
6691                 if (*RExC_parse != ')')
6692                     FAIL("Sequence (?R) not terminated");
6693                 ret = reg_node(pRExC_state, GOSTART);
6694                 *flagp |= POSTPONED;
6695                 nextchar(pRExC_state);
6696                 return ret;
6697                 /*notreached*/
6698             { /* named and numeric backreferences */
6699                 I32 num;
6700             case '&':            /* (?&NAME) */
6701                 parse_start = RExC_parse - 1;
6702               named_recursion:
6703                 {
6704                     SV *sv_dat = reg_scan_name(pRExC_state,
6705                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6706                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6707                 }
6708                 goto gen_recurse_regop;
6709                 /* NOT REACHED */
6710             case '+':
6711                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6712                     RExC_parse++;
6713                     vFAIL("Illegal pattern");
6714                 }
6715                 goto parse_recursion;
6716                 /* NOT REACHED*/
6717             case '-': /* (?-1) */
6718                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6719                     RExC_parse--; /* rewind to let it be handled later */
6720                     goto parse_flags;
6721                 }
6722                 /*FALLTHROUGH */
6723             case '1': case '2': case '3': case '4': /* (?1) */
6724             case '5': case '6': case '7': case '8': case '9':
6725                 RExC_parse--;
6726               parse_recursion:
6727                 num = atoi(RExC_parse);
6728                 parse_start = RExC_parse - 1; /* MJD */
6729                 if (*RExC_parse == '-')
6730                     RExC_parse++;
6731                 while (isDIGIT(*RExC_parse))
6732                         RExC_parse++;
6733                 if (*RExC_parse!=')')
6734                     vFAIL("Expecting close bracket");
6735
6736               gen_recurse_regop:
6737                 if ( paren == '-' ) {
6738                     /*
6739                     Diagram of capture buffer numbering.
6740                     Top line is the normal capture buffer numbers
6741                     Bottom line is the negative indexing as from
6742                     the X (the (?-2))
6743
6744                     +   1 2    3 4 5 X          6 7
6745                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6746                     -   5 4    3 2 1 X          x x
6747
6748                     */
6749                     num = RExC_npar + num;
6750                     if (num < 1)  {
6751                         RExC_parse++;
6752                         vFAIL("Reference to nonexistent group");
6753                     }
6754                 } else if ( paren == '+' ) {
6755                     num = RExC_npar + num - 1;
6756                 }
6757
6758                 ret = reganode(pRExC_state, GOSUB, num);
6759                 if (!SIZE_ONLY) {
6760                     if (num > (I32)RExC_rx->nparens) {
6761                         RExC_parse++;
6762                         vFAIL("Reference to nonexistent group");
6763                     }
6764                     ARG2L_SET( ret, RExC_recurse_count++);
6765                     RExC_emit++;
6766                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6767                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6768                 } else {
6769                     RExC_size++;
6770                 }
6771                 RExC_seen |= REG_SEEN_RECURSE;
6772                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6773                 Set_Node_Offset(ret, parse_start); /* MJD */
6774
6775                 *flagp |= POSTPONED;
6776                 nextchar(pRExC_state);
6777                 return ret;
6778             } /* named and numeric backreferences */
6779             /* NOT REACHED */
6780
6781             case '?':           /* (??...) */
6782                 is_logical = 1;
6783                 if (*RExC_parse != '{') {
6784                     RExC_parse++;
6785                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6786                     /*NOTREACHED*/
6787                 }
6788                 *flagp |= POSTPONED;
6789                 paren = *RExC_parse++;
6790                 /* FALL THROUGH */
6791             case '{':           /* (?{...}) */
6792             {
6793                 I32 count = 1;
6794                 U32 n = 0;
6795                 char c;
6796                 char *s = RExC_parse;
6797
6798                 RExC_seen_zerolen++;
6799                 RExC_seen |= REG_SEEN_EVAL;
6800                 while (count && (c = *RExC_parse)) {
6801                     if (c == '\\') {
6802                         if (RExC_parse[1])
6803                             RExC_parse++;
6804                     }
6805                     else if (c == '{')
6806                         count++;
6807                     else if (c == '}')
6808                         count--;
6809                     RExC_parse++;
6810                 }
6811                 if (*RExC_parse != ')') {
6812                     RExC_parse = s;
6813                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6814                 }
6815                 if (!SIZE_ONLY) {
6816                     PAD *pad;
6817                     OP_4tree *sop, *rop;
6818                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6819
6820                     ENTER;
6821                     Perl_save_re_context(aTHX);
6822                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6823                     sop->op_private |= OPpREFCOUNTED;
6824                     /* re_dup will OpREFCNT_inc */
6825                     OpREFCNT_set(sop, 1);
6826                     LEAVE;
6827
6828                     n = add_data(pRExC_state, 3, "nop");
6829                     RExC_rxi->data->data[n] = (void*)rop;
6830                     RExC_rxi->data->data[n+1] = (void*)sop;
6831                     RExC_rxi->data->data[n+2] = (void*)pad;
6832                     SvREFCNT_dec(sv);
6833                 }
6834                 else {                                          /* First pass */
6835                     if (PL_reginterp_cnt < ++RExC_seen_evals
6836                         && IN_PERL_RUNTIME)
6837                         /* No compiled RE interpolated, has runtime
6838                            components ===> unsafe.  */
6839                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6840                     if (PL_tainting && PL_tainted)
6841                         FAIL("Eval-group in insecure regular expression");
6842 #if PERL_VERSION > 8
6843                     if (IN_PERL_COMPILETIME)
6844                         PL_cv_has_eval = 1;
6845 #endif
6846                 }
6847
6848                 nextchar(pRExC_state);
6849                 if (is_logical) {
6850                     ret = reg_node(pRExC_state, LOGICAL);
6851                     if (!SIZE_ONLY)
6852                         ret->flags = 2;
6853                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6854                     /* deal with the length of this later - MJD */
6855                     return ret;
6856                 }
6857                 ret = reganode(pRExC_state, EVAL, n);
6858                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6859                 Set_Node_Offset(ret, parse_start);
6860                 return ret;
6861             }
6862             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6863             {
6864                 int is_define= 0;
6865                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6866                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6867                         || RExC_parse[1] == '<'
6868                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6869                         I32 flag;
6870
6871                         ret = reg_node(pRExC_state, LOGICAL);
6872                         if (!SIZE_ONLY)
6873                             ret->flags = 1;
6874                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6875                         goto insert_if;
6876                     }
6877                 }
6878                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6879                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6880                 {
6881                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6882                     char *name_start= RExC_parse++;
6883                     U32 num = 0;
6884                     SV *sv_dat=reg_scan_name(pRExC_state,
6885                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6886                     if (RExC_parse == name_start || *RExC_parse != ch)
6887                         vFAIL2("Sequence (?(%c... not terminated",
6888                             (ch == '>' ? '<' : ch));
6889                     RExC_parse++;
6890                     if (!SIZE_ONLY) {
6891                         num = add_data( pRExC_state, 1, "S" );
6892                         RExC_rxi->data->data[num]=(void*)sv_dat;
6893                         SvREFCNT_inc_simple_void(sv_dat);
6894                     }
6895                     ret = reganode(pRExC_state,NGROUPP,num);
6896                     goto insert_if_check_paren;
6897                 }
6898                 else if (RExC_parse[0] == 'D' &&
6899                          RExC_parse[1] == 'E' &&
6900                          RExC_parse[2] == 'F' &&
6901                          RExC_parse[3] == 'I' &&
6902                          RExC_parse[4] == 'N' &&
6903                          RExC_parse[5] == 'E')
6904                 {
6905                     ret = reganode(pRExC_state,DEFINEP,0);
6906                     RExC_parse +=6 ;
6907                     is_define = 1;
6908                     goto insert_if_check_paren;
6909                 }
6910                 else if (RExC_parse[0] == 'R') {
6911                     RExC_parse++;
6912                     parno = 0;
6913                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6914                         parno = atoi(RExC_parse++);
6915                         while (isDIGIT(*RExC_parse))
6916                             RExC_parse++;
6917                     } else if (RExC_parse[0] == '&') {
6918                         SV *sv_dat;
6919                         RExC_parse++;
6920                         sv_dat = reg_scan_name(pRExC_state,
6921                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6922                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6923                     }
6924                     ret = reganode(pRExC_state,INSUBP,parno);
6925                     goto insert_if_check_paren;
6926                 }
6927                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6928                     /* (?(1)...) */
6929                     char c;
6930                     parno = atoi(RExC_parse++);
6931
6932                     while (isDIGIT(*RExC_parse))
6933                         RExC_parse++;
6934                     ret = reganode(pRExC_state, GROUPP, parno);
6935
6936                  insert_if_check_paren:
6937                     if ((c = *nextchar(pRExC_state)) != ')')
6938                         vFAIL("Switch condition not recognized");
6939                   insert_if:
6940                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6941                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6942                     if (br == NULL)
6943                         br = reganode(pRExC_state, LONGJMP, 0);
6944                     else
6945                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6946                     c = *nextchar(pRExC_state);
6947                     if (flags&HASWIDTH)
6948                         *flagp |= HASWIDTH;
6949                     if (c == '|') {
6950                         if (is_define)
6951                             vFAIL("(?(DEFINE)....) does not allow branches");
6952                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6953                         regbranch(pRExC_state, &flags, 1,depth+1);
6954                         REGTAIL(pRExC_state, ret, lastbr);
6955                         if (flags&HASWIDTH)
6956                             *flagp |= HASWIDTH;
6957                         c = *nextchar(pRExC_state);
6958                     }
6959                     else
6960                         lastbr = NULL;
6961                     if (c != ')')
6962                         vFAIL("Switch (?(condition)... contains too many branches");
6963                     ender = reg_node(pRExC_state, TAIL);
6964                     REGTAIL(pRExC_state, br, ender);
6965                     if (lastbr) {
6966                         REGTAIL(pRExC_state, lastbr, ender);
6967                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6968                     }
6969                     else
6970                         REGTAIL(pRExC_state, ret, ender);
6971                     RExC_size++; /* XXX WHY do we need this?!!
6972                                     For large programs it seems to be required
6973                                     but I can't figure out why. -- dmq*/
6974                     return ret;
6975                 }
6976                 else {
6977                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6978                 }
6979             }
6980             case 0:
6981                 RExC_parse--; /* for vFAIL to print correctly */
6982                 vFAIL("Sequence (? incomplete");
6983                 break;
6984             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6985                                        that follow */
6986                 has_use_defaults = TRUE;
6987                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6988                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6989                                                 ? REGEX_UNICODE_CHARSET
6990                                                 : REGEX_DEPENDS_CHARSET);
6991                 goto parse_flags;
6992             default:
6993                 --RExC_parse;
6994                 parse_flags:      /* (?i) */
6995             {
6996                 U32 posflags = 0, negflags = 0;
6997                 U32 *flagsp = &posflags;
6998                 bool has_charset_modifier = 0;
6999                 regex_charset cs = REGEX_DEPENDS_CHARSET;
7000
7001                 while (*RExC_parse) {
7002                     /* && strchr("iogcmsx", *RExC_parse) */
7003                     /* (?g), (?gc) and (?o) are useless here
7004                        and must be globally applied -- japhy */
7005                     switch (*RExC_parse) {
7006                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7007                     case LOCALE_PAT_MOD:
7008                         if (has_charset_modifier || flagsp == &negflags) {
7009                             goto fail_modifiers;
7010                         }
7011                         cs = REGEX_LOCALE_CHARSET;
7012                         has_charset_modifier = 1;
7013                         break;
7014                     case UNICODE_PAT_MOD:
7015                         if (has_charset_modifier || flagsp == &negflags) {
7016                             goto fail_modifiers;
7017                         }
7018                         cs = REGEX_UNICODE_CHARSET;
7019                         has_charset_modifier = 1;
7020                         break;
7021                     case ASCII_RESTRICT_PAT_MOD:
7022                         if (has_charset_modifier || flagsp == &negflags) {
7023                             goto fail_modifiers;
7024                         }
7025                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7026                             /* Doubled modifier implies more restricted */
7027                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7028                             RExC_parse++;
7029                         }
7030                         else {
7031                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7032                         }
7033                         has_charset_modifier = 1;
7034                         break;
7035                     case DEPENDS_PAT_MOD:
7036                         if (has_use_defaults
7037                             || has_charset_modifier
7038                             || flagsp == &negflags)
7039                         {
7040                             goto fail_modifiers;
7041                         }
7042
7043                         /* The dual charset means unicode semantics if the
7044                          * pattern (or target, not known until runtime) are
7045                          * utf8, or something in the pattern indicates unicode
7046                          * semantics */
7047                         cs = (RExC_utf8 || RExC_uni_semantics)
7048                              ? REGEX_UNICODE_CHARSET
7049                              : REGEX_DEPENDS_CHARSET;
7050                         has_charset_modifier = 1;
7051                         break;
7052                     case ONCE_PAT_MOD: /* 'o' */
7053                     case GLOBAL_PAT_MOD: /* 'g' */
7054                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7055                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7056                             if (! (wastedflags & wflagbit) ) {
7057                                 wastedflags |= wflagbit;
7058                                 vWARN5(
7059                                     RExC_parse + 1,
7060                                     "Useless (%s%c) - %suse /%c modifier",
7061                                     flagsp == &negflags ? "?-" : "?",
7062                                     *RExC_parse,
7063                                     flagsp == &negflags ? "don't " : "",
7064                                     *RExC_parse
7065                                 );
7066                             }
7067                         }
7068                         break;
7069
7070                     case CONTINUE_PAT_MOD: /* 'c' */
7071                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7072                             if (! (wastedflags & WASTED_C) ) {
7073                                 wastedflags |= WASTED_GC;
7074                                 vWARN3(
7075                                     RExC_parse + 1,
7076                                     "Useless (%sc) - %suse /gc modifier",
7077                                     flagsp == &negflags ? "?-" : "?",
7078                                     flagsp == &negflags ? "don't " : ""
7079                                 );
7080                             }
7081                         }
7082                         break;
7083                     case KEEPCOPY_PAT_MOD: /* 'p' */
7084                         if (flagsp == &negflags) {
7085                             if (SIZE_ONLY)
7086                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7087                         } else {
7088                             *flagsp |= RXf_PMf_KEEPCOPY;
7089                         }
7090                         break;
7091                     case '-':
7092                         /* A flag is a default iff it is following a minus, so
7093                          * if there is a minus, it means will be trying to
7094                          * re-specify a default which is an error */
7095                         if (has_use_defaults || flagsp == &negflags) {
7096             fail_modifiers:
7097                             RExC_parse++;
7098                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7099                             /*NOTREACHED*/
7100                         }
7101                         flagsp = &negflags;
7102                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7103                         break;
7104                     case ':':
7105                         paren = ':';
7106                         /*FALLTHROUGH*/
7107                     case ')':
7108                         RExC_flags |= posflags;
7109                         RExC_flags &= ~negflags;
7110                         set_regex_charset(&RExC_flags, cs);
7111                         if (paren != ':') {
7112                             oregflags |= posflags;
7113                             oregflags &= ~negflags;
7114                             set_regex_charset(&oregflags, cs);
7115                         }
7116                         nextchar(pRExC_state);
7117                         if (paren != ':') {
7118                             *flagp = TRYAGAIN;
7119                             return NULL;
7120                         } else {
7121                             ret = NULL;
7122                             goto parse_rest;
7123                         }
7124                         /*NOTREACHED*/
7125                     default:
7126                         RExC_parse++;
7127                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7128                         /*NOTREACHED*/
7129                     }
7130                     ++RExC_parse;
7131                 }
7132             }} /* one for the default block, one for the switch */
7133         }
7134         else {                  /* (...) */
7135           capturing_parens:
7136             parno = RExC_npar;
7137             RExC_npar++;
7138
7139             ret = reganode(pRExC_state, OPEN, parno);
7140             if (!SIZE_ONLY ){
7141                 if (!RExC_nestroot)
7142                     RExC_nestroot = parno;
7143                 if (RExC_seen & REG_SEEN_RECURSE
7144                     && !RExC_open_parens[parno-1])
7145                 {
7146                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7147                         "Setting open paren #%"IVdf" to %d\n",
7148                         (IV)parno, REG_NODE_NUM(ret)));
7149                     RExC_open_parens[parno-1]= ret;
7150                 }
7151             }
7152             Set_Node_Length(ret, 1); /* MJD */
7153             Set_Node_Offset(ret, RExC_parse); /* MJD */
7154             is_open = 1;
7155         }
7156     }
7157     else                        /* ! paren */
7158         ret = NULL;
7159
7160    parse_rest:
7161     /* Pick up the branches, linking them together. */
7162     parse_start = RExC_parse;   /* MJD */
7163     br = regbranch(pRExC_state, &flags, 1,depth+1);
7164
7165     /*     branch_len = (paren != 0); */
7166
7167     if (br == NULL)
7168         return(NULL);
7169     if (*RExC_parse == '|') {
7170         if (!SIZE_ONLY && RExC_extralen) {
7171             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7172         }
7173         else {                  /* MJD */
7174             reginsert(pRExC_state, BRANCH, br, depth+1);
7175             Set_Node_Length(br, paren != 0);
7176             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7177         }
7178         have_branch = 1;
7179         if (SIZE_ONLY)
7180             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7181     }
7182     else if (paren == ':') {
7183         *flagp |= flags&SIMPLE;
7184     }
7185     if (is_open) {                              /* Starts with OPEN. */
7186         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7187     }
7188     else if (paren != '?')              /* Not Conditional */
7189         ret = br;
7190     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7191     lastbr = br;
7192     while (*RExC_parse == '|') {
7193         if (!SIZE_ONLY && RExC_extralen) {
7194             ender = reganode(pRExC_state, LONGJMP,0);
7195             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7196         }
7197         if (SIZE_ONLY)
7198             RExC_extralen += 2;         /* Account for LONGJMP. */
7199         nextchar(pRExC_state);
7200         if (freeze_paren) {
7201             if (RExC_npar > after_freeze)
7202                 after_freeze = RExC_npar;
7203             RExC_npar = freeze_paren;
7204         }
7205         br = regbranch(pRExC_state, &flags, 0, depth+1);
7206
7207         if (br == NULL)
7208             return(NULL);
7209         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7210         lastbr = br;
7211         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7212     }
7213
7214     if (have_branch || paren != ':') {
7215         /* Make a closing node, and hook it on the end. */
7216         switch (paren) {
7217         case ':':
7218             ender = reg_node(pRExC_state, TAIL);
7219             break;
7220         case 1:
7221             ender = reganode(pRExC_state, CLOSE, parno);
7222             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7223                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7224                         "Setting close paren #%"IVdf" to %d\n",
7225                         (IV)parno, REG_NODE_NUM(ender)));
7226                 RExC_close_parens[parno-1]= ender;
7227                 if (RExC_nestroot == parno)
7228                     RExC_nestroot = 0;
7229             }
7230             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7231             Set_Node_Length(ender,1); /* MJD */
7232             break;
7233         case '<':
7234         case ',':
7235         case '=':
7236         case '!':
7237             *flagp &= ~HASWIDTH;
7238             /* FALL THROUGH */
7239         case '>':
7240             ender = reg_node(pRExC_state, SUCCEED);
7241             break;
7242         case 0:
7243             ender = reg_node(pRExC_state, END);
7244             if (!SIZE_ONLY) {
7245                 assert(!RExC_opend); /* there can only be one! */
7246                 RExC_opend = ender;
7247             }
7248             break;
7249         }
7250         REGTAIL(pRExC_state, lastbr, ender);
7251
7252         if (have_branch && !SIZE_ONLY) {
7253             if (depth==1)
7254                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7255
7256             /* Hook the tails of the branches to the closing node. */
7257             for (br = ret; br; br = regnext(br)) {
7258                 const U8 op = PL_regkind[OP(br)];
7259                 if (op == BRANCH) {
7260                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7261                 }
7262                 else if (op == BRANCHJ) {
7263                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7264                 }
7265             }
7266         }
7267     }
7268
7269     {
7270         const char *p;
7271         static const char parens[] = "=!<,>";
7272
7273         if (paren && (p = strchr(parens, paren))) {
7274             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7275             int flag = (p - parens) > 1;
7276
7277             if (paren == '>')
7278                 node = SUSPEND, flag = 0;
7279             reginsert(pRExC_state, node,ret, depth+1);
7280             Set_Node_Cur_Length(ret);
7281             Set_Node_Offset(ret, parse_start + 1);
7282             ret->flags = flag;
7283             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7284         }
7285     }
7286
7287     /* Check for proper termination. */
7288     if (paren) {
7289         RExC_flags = oregflags;
7290         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7291             RExC_parse = oregcomp_parse;
7292             vFAIL("Unmatched (");
7293         }
7294     }
7295     else if (!paren && RExC_parse < RExC_end) {
7296         if (*RExC_parse == ')') {
7297             RExC_parse++;
7298             vFAIL("Unmatched )");
7299         }
7300         else
7301             FAIL("Junk on end of regexp");      /* "Can't happen". */
7302         /* NOTREACHED */
7303     }
7304
7305     if (RExC_in_lookbehind) {
7306         RExC_in_lookbehind--;
7307     }
7308     if (after_freeze > RExC_npar)
7309         RExC_npar = after_freeze;
7310     return(ret);
7311 }
7312
7313 /*
7314  - regbranch - one alternative of an | operator
7315  *
7316  * Implements the concatenation operator.
7317  */
7318 STATIC regnode *
7319 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7320 {
7321     dVAR;
7322     register regnode *ret;
7323     register regnode *chain = NULL;
7324     register regnode *latest;
7325     I32 flags = 0, c = 0;
7326     GET_RE_DEBUG_FLAGS_DECL;
7327
7328     PERL_ARGS_ASSERT_REGBRANCH;
7329
7330     DEBUG_PARSE("brnc");
7331
7332     if (first)
7333         ret = NULL;
7334     else {
7335         if (!SIZE_ONLY && RExC_extralen)
7336             ret = reganode(pRExC_state, BRANCHJ,0);
7337         else {
7338             ret = reg_node(pRExC_state, BRANCH);
7339             Set_Node_Length(ret, 1);
7340         }
7341     }
7342
7343     if (!first && SIZE_ONLY)
7344         RExC_extralen += 1;                     /* BRANCHJ */
7345
7346     *flagp = WORST;                     /* Tentatively. */
7347
7348     RExC_parse--;
7349     nextchar(pRExC_state);
7350     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7351         flags &= ~TRYAGAIN;
7352         latest = regpiece(pRExC_state, &flags,depth+1);
7353         if (latest == NULL) {
7354             if (flags & TRYAGAIN)
7355                 continue;
7356             return(NULL);
7357         }
7358         else if (ret == NULL)
7359             ret = latest;
7360         *flagp |= flags&(HASWIDTH|POSTPONED);
7361         if (chain == NULL)      /* First piece. */
7362             *flagp |= flags&SPSTART;
7363         else {
7364             RExC_naughty++;
7365             REGTAIL(pRExC_state, chain, latest);
7366         }
7367         chain = latest;
7368         c++;
7369     }
7370     if (chain == NULL) {        /* Loop ran zero times. */
7371         chain = reg_node(pRExC_state, NOTHING);
7372         if (ret == NULL)
7373             ret = chain;
7374     }
7375     if (c == 1) {
7376         *flagp |= flags&SIMPLE;
7377     }
7378
7379     return ret;
7380 }
7381
7382 /*
7383  - regpiece - something followed by possible [*+?]
7384  *
7385  * Note that the branching code sequences used for ? and the general cases
7386  * of * and + are somewhat optimized:  they use the same NOTHING node as
7387  * both the endmarker for their branch list and the body of the last branch.
7388  * It might seem that this node could be dispensed with entirely, but the
7389  * endmarker role is not redundant.
7390  */
7391 STATIC regnode *
7392 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7393 {
7394     dVAR;
7395     register regnode *ret;
7396     register char op;
7397     register char *next;
7398     I32 flags;
7399     const char * const origparse = RExC_parse;
7400     I32 min;
7401     I32 max = REG_INFTY;
7402     char *parse_start;
7403     const char *maxpos = NULL;
7404     GET_RE_DEBUG_FLAGS_DECL;
7405
7406     PERL_ARGS_ASSERT_REGPIECE;
7407
7408     DEBUG_PARSE("piec");
7409
7410     ret = regatom(pRExC_state, &flags,depth+1);
7411     if (ret == NULL) {
7412         if (flags & TRYAGAIN)
7413             *flagp |= TRYAGAIN;
7414         return(NULL);
7415     }
7416
7417     op = *RExC_parse;
7418
7419     if (op == '{' && regcurly(RExC_parse)) {
7420         maxpos = NULL;
7421         parse_start = RExC_parse; /* MJD */
7422         next = RExC_parse + 1;
7423         while (isDIGIT(*next) || *next == ',') {
7424             if (*next == ',') {
7425                 if (maxpos)
7426                     break;
7427                 else
7428                     maxpos = next;
7429             }
7430             next++;
7431         }
7432         if (*next == '}') {             /* got one */
7433             if (!maxpos)
7434                 maxpos = next;
7435             RExC_parse++;
7436             min = atoi(RExC_parse);
7437             if (*maxpos == ',')
7438                 maxpos++;
7439             else
7440                 maxpos = RExC_parse;
7441             max = atoi(maxpos);
7442             if (!max && *maxpos != '0')
7443                 max = REG_INFTY;                /* meaning "infinity" */
7444             else if (max >= REG_INFTY)
7445                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7446             RExC_parse = next;
7447             nextchar(pRExC_state);
7448
7449         do_curly:
7450             if ((flags&SIMPLE)) {
7451                 RExC_naughty += 2 + RExC_naughty / 2;
7452                 reginsert(pRExC_state, CURLY, ret, depth+1);
7453                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7454                 Set_Node_Cur_Length(ret);
7455             }
7456             else {
7457                 regnode * const w = reg_node(pRExC_state, WHILEM);
7458
7459                 w->flags = 0;
7460                 REGTAIL(pRExC_state, ret, w);
7461                 if (!SIZE_ONLY && RExC_extralen) {
7462                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7463                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7464                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7465                 }
7466                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7467                                 /* MJD hk */
7468                 Set_Node_Offset(ret, parse_start+1);
7469                 Set_Node_Length(ret,
7470                                 op == '{' ? (RExC_parse - parse_start) : 1);
7471
7472                 if (!SIZE_ONLY && RExC_extralen)
7473                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7474                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7475                 if (SIZE_ONLY)
7476                     RExC_whilem_seen++, RExC_extralen += 3;
7477                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7478             }
7479             ret->flags = 0;
7480
7481             if (min > 0)
7482                 *flagp = WORST;
7483             if (max > 0)
7484                 *flagp |= HASWIDTH;
7485             if (max < min)
7486                 vFAIL("Can't do {n,m} with n > m");
7487             if (!SIZE_ONLY) {
7488                 ARG1_SET(ret, (U16)min);
7489                 ARG2_SET(ret, (U16)max);
7490             }
7491
7492             goto nest_check;
7493         }
7494     }
7495
7496     if (!ISMULT1(op)) {
7497         *flagp = flags;
7498         return(ret);
7499     }
7500
7501 #if 0                           /* Now runtime fix should be reliable. */
7502
7503     /* if this is reinstated, don't forget to put this back into perldiag:
7504
7505             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7506
7507            (F) The part of the regexp subject to either the * or + quantifier
7508            could match an empty string. The {#} shows in the regular
7509            expression about where the problem was discovered.
7510
7511     */
7512
7513     if (!(flags&HASWIDTH) && op != '?')
7514       vFAIL("Regexp *+ operand could be empty");
7515 #endif
7516
7517     parse_start = RExC_parse;
7518     nextchar(pRExC_state);
7519
7520     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7521
7522     if (op == '*' && (flags&SIMPLE)) {
7523         reginsert(pRExC_state, STAR, ret, depth+1);
7524         ret->flags = 0;
7525         RExC_naughty += 4;
7526     }
7527     else if (op == '*') {
7528         min = 0;
7529         goto do_curly;
7530     }
7531     else if (op == '+' && (flags&SIMPLE)) {
7532         reginsert(pRExC_state, PLUS, ret, depth+1);
7533         ret->flags = 0;
7534         RExC_naughty += 3;
7535     }
7536     else if (op == '+') {
7537         min = 1;
7538         goto do_curly;
7539     }
7540     else if (op == '?') {
7541         min = 0; max = 1;
7542         goto do_curly;
7543     }
7544   nest_check:
7545     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7546         ckWARN3reg(RExC_parse,
7547                    "%.*s matches null string many times",
7548                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7549                    origparse);
7550     }
7551
7552     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7553         nextchar(pRExC_state);
7554         reginsert(pRExC_state, MINMOD, ret, depth+1);
7555         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7556     }
7557 #ifndef REG_ALLOW_MINMOD_SUSPEND
7558     else
7559 #endif
7560     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7561         regnode *ender;
7562         nextchar(pRExC_state);
7563         ender = reg_node(pRExC_state, SUCCEED);
7564         REGTAIL(pRExC_state, ret, ender);
7565         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7566         ret->flags = 0;
7567         ender = reg_node(pRExC_state, TAIL);
7568         REGTAIL(pRExC_state, ret, ender);
7569         /*ret= ender;*/
7570     }
7571
7572     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7573         RExC_parse++;
7574         vFAIL("Nested quantifiers");
7575     }
7576
7577     return(ret);
7578 }
7579
7580
7581 /* reg_namedseq(pRExC_state,UVp)
7582
7583    This is expected to be called by a parser routine that has
7584    recognized '\N' and needs to handle the rest. RExC_parse is
7585    expected to point at the first char following the N at the time
7586    of the call.
7587
7588    The \N may be inside (indicated by valuep not being NULL) or outside a
7589    character class.
7590
7591    \N may begin either a named sequence, or if outside a character class, mean
7592    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7593    attempted to decide which, and in the case of a named sequence converted it
7594    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7595    where c1... are the characters in the sequence.  For single-quoted regexes,
7596    the tokenizer passes the \N sequence through unchanged; this code will not
7597    attempt to determine this nor expand those.  The net effect is that if the
7598    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7599    signals that this \N occurrence means to match a non-newline.
7600
7601    Only the \N{U+...} form should occur in a character class, for the same
7602    reason that '.' inside a character class means to just match a period: it
7603    just doesn't make sense.
7604
7605    If valuep is non-null then it is assumed that we are parsing inside
7606    of a charclass definition and the first codepoint in the resolved
7607    string is returned via *valuep and the routine will return NULL.
7608    In this mode if a multichar string is returned from the charnames
7609    handler, a warning will be issued, and only the first char in the
7610    sequence will be examined. If the string returned is zero length
7611    then the value of *valuep is undefined and NON-NULL will
7612    be returned to indicate failure. (This will NOT be a valid pointer
7613    to a regnode.)
7614
7615    If valuep is null then it is assumed that we are parsing normal text and a
7616    new EXACT node is inserted into the program containing the resolved string,
7617    and a pointer to the new node is returned.  But if the string is zero length
7618    a NOTHING node is emitted instead.
7619
7620    On success RExC_parse is set to the char following the endbrace.
7621    Parsing failures will generate a fatal error via vFAIL(...)
7622  */
7623 STATIC regnode *
7624 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7625 {
7626     char * endbrace;    /* '}' following the name */
7627     regnode *ret = NULL;
7628 #ifdef DEBUGGING
7629     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7630 #endif
7631     char* p;
7632
7633     GET_RE_DEBUG_FLAGS_DECL;
7634
7635     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7636
7637     GET_RE_DEBUG_FLAGS;
7638
7639     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7640      * modifier.  The other meaning does not */
7641     p = (RExC_flags & RXf_PMf_EXTENDED)
7642         ? regwhite( pRExC_state, RExC_parse )
7643         : RExC_parse;
7644
7645     /* Disambiguate between \N meaning a named character versus \N meaning
7646      * [^\n].  The former is assumed when it can't be the latter. */
7647     if (*p != '{' || regcurly(p)) {
7648         RExC_parse = p;
7649         if (valuep) {
7650             /* no bare \N in a charclass */
7651             vFAIL("\\N in a character class must be a named character: \\N{...}");
7652         }
7653         nextchar(pRExC_state);
7654         ret = reg_node(pRExC_state, REG_ANY);
7655         *flagp |= HASWIDTH|SIMPLE;
7656         RExC_naughty++;
7657         RExC_parse--;
7658         Set_Node_Length(ret, 1); /* MJD */
7659         return ret;
7660     }
7661
7662     /* Here, we have decided it should be a named sequence */
7663
7664     /* The test above made sure that the next real character is a '{', but
7665      * under the /x modifier, it could be separated by space (or a comment and
7666      * \n) and this is not allowed (for consistency with \x{...} and the
7667      * tokenizer handling of \N{NAME}). */
7668     if (*RExC_parse != '{') {
7669         vFAIL("Missing braces on \\N{}");
7670     }
7671
7672     RExC_parse++;       /* Skip past the '{' */
7673
7674     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7675         || ! (endbrace == RExC_parse            /* nothing between the {} */
7676               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7677                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7678     {
7679         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7680         vFAIL("\\N{NAME} must be resolved by the lexer");
7681     }
7682
7683     if (endbrace == RExC_parse) {   /* empty: \N{} */
7684         if (! valuep) {
7685             RExC_parse = endbrace + 1;
7686             return reg_node(pRExC_state,NOTHING);
7687         }
7688
7689         if (SIZE_ONLY) {
7690             ckWARNreg(RExC_parse,
7691                     "Ignoring zero length \\N{} in character class"
7692             );
7693             RExC_parse = endbrace + 1;
7694         }
7695         *valuep = 0;
7696         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7697     }
7698
7699     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7700     RExC_parse += 2;    /* Skip past the 'U+' */
7701
7702     if (valuep) {   /* In a bracketed char class */
7703         /* We only pay attention to the first char of
7704         multichar strings being returned. I kinda wonder
7705         if this makes sense as it does change the behaviour
7706         from earlier versions, OTOH that behaviour was broken
7707         as well. XXX Solution is to recharacterize as
7708         [rest-of-class]|multi1|multi2... */
7709
7710         STRLEN length_of_hex;
7711         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7712             | PERL_SCAN_DISALLOW_PREFIX
7713             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7714
7715         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7716         if (endchar < endbrace) {
7717             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7718         }
7719
7720         length_of_hex = (STRLEN)(endchar - RExC_parse);
7721         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7722
7723         /* The tokenizer should have guaranteed validity, but it's possible to
7724          * bypass it by using single quoting, so check */
7725         if (length_of_hex == 0
7726             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7727         {
7728             RExC_parse += length_of_hex;        /* Includes all the valid */
7729             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7730                             ? UTF8SKIP(RExC_parse)
7731                             : 1;
7732             /* Guard against malformed utf8 */
7733             if (RExC_parse >= endchar) RExC_parse = endchar;
7734             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7735         }
7736
7737         RExC_parse = endbrace + 1;
7738         if (endchar == endbrace) return NULL;
7739
7740         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7741     }
7742     else {      /* Not a char class */
7743         char *s;            /* String to put in generated EXACT node */
7744         STRLEN len = 0;     /* Its current byte length */
7745         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7746                                stream */
7747         ret = reg_node(pRExC_state,
7748                            (U8) ((! FOLD) ? EXACT
7749                                           : (LOC)
7750                                              ? EXACTFL
7751                                              : (MORE_ASCII_RESTRICTED)
7752                                                ? EXACTFA
7753                                                : (AT_LEAST_UNI_SEMANTICS)
7754                                                  ? EXACTFU
7755                                                  : EXACTF));
7756         s= STRING(ret);
7757
7758         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7759          * the input which is of the form now 'c1.c2.c3...}' until find the
7760          * ending brace or exceed length 255.  The characters that exceed this
7761          * limit are dropped.  The limit could be relaxed should it become
7762          * desirable by reparsing this as (?:\N{NAME}), so could generate
7763          * multiple EXACT nodes, as is done for just regular input.  But this
7764          * is primarily a named character, and not intended to be a huge long
7765          * string, so 255 bytes should be good enough */
7766         while (1) {
7767             STRLEN length_of_hex;
7768             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7769                             | PERL_SCAN_DISALLOW_PREFIX
7770                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7771             UV cp;  /* Ord of current character */
7772             bool use_this_char_fold = FOLD;
7773
7774             /* Code points are separated by dots.  If none, there is only one
7775              * code point, and is terminated by the brace */
7776             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7777
7778             /* The values are Unicode even on EBCDIC machines */
7779             length_of_hex = (STRLEN)(endchar - RExC_parse);
7780             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7781             if ( length_of_hex == 0
7782                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7783             {
7784                 RExC_parse += length_of_hex;        /* Includes all the valid */
7785                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7786                                 ? UTF8SKIP(RExC_parse)
7787                                 : 1;
7788                 /* Guard against malformed utf8 */
7789                 if (RExC_parse >= endchar) RExC_parse = endchar;
7790                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7791             }
7792
7793             /* XXX ? Change to ANYOF node
7794             if (FOLD
7795                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7796                 && is_TRICKYFOLD_cp(cp))
7797             {
7798             }
7799             */
7800
7801             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7802              * folding, and the source isn't ASCII, look through all the
7803              * characters it folds to.  If any one of them is ASCII, forbid
7804              * this fold.  (cp is uni, so the 127 below is correct even for
7805              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7806              * with above 255.  XXX It really doesn't make sense to have \N{}
7807              * which means a Unicode rules under locale.  I (khw) think this
7808              * should be warned about, but the counter argument is that people
7809              * who have programmed around Perl's earlier lack of specifying the
7810              * rules and used \N{} to force Unicode things in a local
7811              * environment shouldn't get suddenly a warning */
7812             if (use_this_char_fold) {
7813                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7814                     use_this_char_fold = FALSE;
7815                 }
7816                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7817                          || (cp > 255 && LOC))
7818                 {
7819                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7820                 U8* s = tmpbuf;
7821                 U8* e;
7822                 STRLEN foldlen;
7823
7824                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7825                 e = s + foldlen;
7826
7827                 while (s < e) {
7828                     if (isASCII(*s)
7829                         || (LOC && (UTF8_IS_INVARIANT(*s)
7830                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7831                     {
7832                         use_this_char_fold = FALSE;
7833                         break;
7834                     }
7835                     s += UTF8SKIP(s);
7836                 }
7837                 }
7838             }
7839
7840             if (! use_this_char_fold) { /* Not folding, just append to the
7841                                            string */
7842                 STRLEN unilen;
7843
7844                 /* Quit before adding this character if would exceed limit */
7845                 if (len + UNISKIP(cp) > U8_MAX) break;
7846
7847                 unilen = reguni(pRExC_state, cp, s);
7848                 if (unilen > 0) {
7849                     s   += unilen;
7850                     len += unilen;
7851                 }
7852             } else {    /* Folding, output the folded equivalent */
7853                 STRLEN foldlen,numlen;
7854                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7855                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7856
7857                 /* Quit before exceeding size limit */
7858                 if (len + foldlen > U8_MAX) break;
7859
7860                 for (foldbuf = tmpbuf;
7861                     foldlen;
7862                     foldlen -= numlen)
7863                 {
7864                     cp = utf8_to_uvchr(foldbuf, &numlen);
7865                     if (numlen > 0) {
7866                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7867                         s       += unilen;
7868                         len     += unilen;
7869                         /* In EBCDIC the numlen and unilen can differ. */
7870                         foldbuf += numlen;
7871                         if (numlen >= foldlen)
7872                             break;
7873                     }
7874                     else
7875                         break; /* "Can't happen." */
7876                 }
7877             }
7878
7879             /* Point to the beginning of the next character in the sequence. */
7880             RExC_parse = endchar + 1;
7881
7882             /* Quit if no more characters */
7883             if (RExC_parse >= endbrace) break;
7884         }
7885
7886
7887         if (SIZE_ONLY) {
7888             if (RExC_parse < endbrace) {
7889                 ckWARNreg(RExC_parse - 1,
7890                           "Using just the first characters returned by \\N{}");
7891             }
7892
7893             RExC_size += STR_SZ(len);
7894         } else {
7895             STR_LEN(ret) = len;
7896             RExC_emit += STR_SZ(len);
7897         }
7898
7899         RExC_parse = endbrace + 1;
7900
7901         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7902                                with malformed in t/re/pat_advanced.t */
7903         RExC_parse --;
7904         Set_Node_Cur_Length(ret); /* MJD */
7905         nextchar(pRExC_state);
7906     }
7907
7908     return ret;
7909 }
7910
7911
7912 /*
7913  * reg_recode
7914  *
7915  * It returns the code point in utf8 for the value in *encp.
7916  *    value: a code value in the source encoding
7917  *    encp:  a pointer to an Encode object
7918  *
7919  * If the result from Encode is not a single character,
7920  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7921  */
7922 STATIC UV
7923 S_reg_recode(pTHX_ const char value, SV **encp)
7924 {
7925     STRLEN numlen = 1;
7926     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7927     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7928     const STRLEN newlen = SvCUR(sv);
7929     UV uv = UNICODE_REPLACEMENT;
7930
7931     PERL_ARGS_ASSERT_REG_RECODE;
7932
7933     if (newlen)
7934         uv = SvUTF8(sv)
7935              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7936              : *(U8*)s;
7937
7938     if (!newlen || numlen != newlen) {
7939         uv = UNICODE_REPLACEMENT;
7940         *encp = NULL;
7941     }
7942     return uv;
7943 }
7944
7945
7946 /*
7947  - regatom - the lowest level
7948
7949    Try to identify anything special at the start of the pattern. If there
7950    is, then handle it as required. This may involve generating a single regop,
7951    such as for an assertion; or it may involve recursing, such as to
7952    handle a () structure.
7953
7954    If the string doesn't start with something special then we gobble up
7955    as much literal text as we can.
7956
7957    Once we have been able to handle whatever type of thing started the
7958    sequence, we return.
7959
7960    Note: we have to be careful with escapes, as they can be both literal
7961    and special, and in the case of \10 and friends can either, depending
7962    on context. Specifically there are two separate switches for handling
7963    escape sequences, with the one for handling literal escapes requiring
7964    a dummy entry for all of the special escapes that are actually handled
7965    by the other.
7966 */
7967
7968 STATIC regnode *
7969 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7970 {
7971     dVAR;
7972     register regnode *ret = NULL;
7973     I32 flags;
7974     char *parse_start = RExC_parse;
7975     U8 op;
7976     GET_RE_DEBUG_FLAGS_DECL;
7977     DEBUG_PARSE("atom");
7978     *flagp = WORST;             /* Tentatively. */
7979
7980     PERL_ARGS_ASSERT_REGATOM;
7981
7982 tryagain:
7983     switch ((U8)*RExC_parse) {
7984     case '^':
7985         RExC_seen_zerolen++;
7986         nextchar(pRExC_state);
7987         if (RExC_flags & RXf_PMf_MULTILINE)
7988             ret = reg_node(pRExC_state, MBOL);
7989         else if (RExC_flags & RXf_PMf_SINGLELINE)
7990             ret = reg_node(pRExC_state, SBOL);
7991         else
7992             ret = reg_node(pRExC_state, BOL);
7993         Set_Node_Length(ret, 1); /* MJD */
7994         break;
7995     case '$':
7996         nextchar(pRExC_state);
7997         if (*RExC_parse)
7998             RExC_seen_zerolen++;
7999         if (RExC_flags & RXf_PMf_MULTILINE)
8000             ret = reg_node(pRExC_state, MEOL);
8001         else if (RExC_flags & RXf_PMf_SINGLELINE)
8002             ret = reg_node(pRExC_state, SEOL);
8003         else
8004             ret = reg_node(pRExC_state, EOL);
8005         Set_Node_Length(ret, 1); /* MJD */
8006         break;
8007     case '.':
8008         nextchar(pRExC_state);
8009         if (RExC_flags & RXf_PMf_SINGLELINE)
8010             ret = reg_node(pRExC_state, SANY);
8011         else
8012             ret = reg_node(pRExC_state, REG_ANY);
8013         *flagp |= HASWIDTH|SIMPLE;
8014         RExC_naughty++;
8015         Set_Node_Length(ret, 1); /* MJD */
8016         break;
8017     case '[':
8018     {
8019         char * const oregcomp_parse = ++RExC_parse;
8020         ret = regclass(pRExC_state,depth+1);
8021         if (*RExC_parse != ']') {
8022             RExC_parse = oregcomp_parse;
8023             vFAIL("Unmatched [");
8024         }
8025         nextchar(pRExC_state);
8026         *flagp |= HASWIDTH|SIMPLE;
8027         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8028         break;
8029     }
8030     case '(':
8031         nextchar(pRExC_state);
8032         ret = reg(pRExC_state, 1, &flags,depth+1);
8033         if (ret == NULL) {
8034                 if (flags & TRYAGAIN) {
8035                     if (RExC_parse == RExC_end) {
8036                          /* Make parent create an empty node if needed. */
8037                         *flagp |= TRYAGAIN;
8038                         return(NULL);
8039                     }
8040                     goto tryagain;
8041                 }
8042                 return(NULL);
8043         }
8044         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8045         break;
8046     case '|':
8047     case ')':
8048         if (flags & TRYAGAIN) {
8049             *flagp |= TRYAGAIN;
8050             return NULL;
8051         }
8052         vFAIL("Internal urp");
8053                                 /* Supposed to be caught earlier. */
8054         break;
8055     case '{':
8056         if (!regcurly(RExC_parse)) {
8057             RExC_parse++;
8058             goto defchar;
8059         }
8060         /* FALL THROUGH */
8061     case '?':
8062     case '+':
8063     case '*':
8064         RExC_parse++;
8065         vFAIL("Quantifier follows nothing");
8066         break;
8067     case LATIN_SMALL_LETTER_SHARP_S:
8068     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8069     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8070 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8071 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8072     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8073 #endif
8074         do_foldchar:
8075         if (!LOC && FOLD) {
8076             U32 len,cp;
8077             len=0; /* silence a spurious compiler warning */
8078             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8079                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8080                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8081                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8082                 Set_Node_Length(ret, 1); /* MJD */
8083                 nextchar(pRExC_state); /* kill whitespace under /x */
8084                 return ret;
8085             }
8086         }
8087         goto outer_default;
8088     case '\\':
8089         /* Special Escapes
8090
8091            This switch handles escape sequences that resolve to some kind
8092            of special regop and not to literal text. Escape sequnces that
8093            resolve to literal text are handled below in the switch marked
8094            "Literal Escapes".
8095
8096            Every entry in this switch *must* have a corresponding entry
8097            in the literal escape switch. However, the opposite is not
8098            required, as the default for this switch is to jump to the
8099            literal text handling code.
8100         */
8101         switch ((U8)*++RExC_parse) {
8102         case LATIN_SMALL_LETTER_SHARP_S:
8103         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8104         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8105                    goto do_foldchar;
8106         /* Special Escapes */
8107         case 'A':
8108             RExC_seen_zerolen++;
8109             ret = reg_node(pRExC_state, SBOL);
8110             *flagp |= SIMPLE;
8111             goto finish_meta_pat;
8112         case 'G':
8113             ret = reg_node(pRExC_state, GPOS);
8114             RExC_seen |= REG_SEEN_GPOS;
8115             *flagp |= SIMPLE;
8116             goto finish_meta_pat;
8117         case 'K':
8118             RExC_seen_zerolen++;
8119             ret = reg_node(pRExC_state, KEEPS);
8120             *flagp |= SIMPLE;
8121             /* XXX:dmq : disabling in-place substitution seems to
8122              * be necessary here to avoid cases of memory corruption, as
8123              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8124              */
8125             RExC_seen |= REG_SEEN_LOOKBEHIND;
8126             goto finish_meta_pat;
8127         case 'Z':
8128             ret = reg_node(pRExC_state, SEOL);
8129             *flagp |= SIMPLE;
8130             RExC_seen_zerolen++;                /* Do not optimize RE away */
8131             goto finish_meta_pat;
8132         case 'z':
8133             ret = reg_node(pRExC_state, EOS);
8134             *flagp |= SIMPLE;
8135             RExC_seen_zerolen++;                /* Do not optimize RE away */
8136             goto finish_meta_pat;
8137         case 'C':
8138             ret = reg_node(pRExC_state, CANY);
8139             RExC_seen |= REG_SEEN_CANY;
8140             *flagp |= HASWIDTH|SIMPLE;
8141             goto finish_meta_pat;
8142         case 'X':
8143             ret = reg_node(pRExC_state, CLUMP);
8144             *flagp |= HASWIDTH;
8145             goto finish_meta_pat;
8146         case 'w':
8147             switch (get_regex_charset(RExC_flags)) {
8148                 case REGEX_LOCALE_CHARSET:
8149                     op = ALNUML;
8150                     break;
8151                 case REGEX_UNICODE_CHARSET:
8152                     op = ALNUMU;
8153                     break;
8154                 case REGEX_ASCII_RESTRICTED_CHARSET:
8155                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8156                     op = ALNUMA;
8157                     break;
8158                 case REGEX_DEPENDS_CHARSET:
8159                     op = ALNUM;
8160                     break;
8161                 default:
8162                     goto bad_charset;
8163             }
8164             ret = reg_node(pRExC_state, op);
8165             *flagp |= HASWIDTH|SIMPLE;
8166             goto finish_meta_pat;
8167         case 'W':
8168             switch (get_regex_charset(RExC_flags)) {
8169                 case REGEX_LOCALE_CHARSET:
8170                     op = NALNUML;
8171                     break;
8172                 case REGEX_UNICODE_CHARSET:
8173                     op = NALNUMU;
8174                     break;
8175                 case REGEX_ASCII_RESTRICTED_CHARSET:
8176                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8177                     op = NALNUMA;
8178                     break;
8179                 case REGEX_DEPENDS_CHARSET:
8180                     op = NALNUM;
8181                     break;
8182                 default:
8183                     goto bad_charset;
8184             }
8185             ret = reg_node(pRExC_state, op);
8186             *flagp |= HASWIDTH|SIMPLE;
8187             goto finish_meta_pat;
8188         case 'b':
8189             RExC_seen_zerolen++;
8190             RExC_seen |= REG_SEEN_LOOKBEHIND;
8191             switch (get_regex_charset(RExC_flags)) {
8192                 case REGEX_LOCALE_CHARSET:
8193                     op = BOUNDL;
8194                     break;
8195                 case REGEX_UNICODE_CHARSET:
8196                     op = BOUNDU;
8197                     break;
8198                 case REGEX_ASCII_RESTRICTED_CHARSET:
8199                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8200                     op = BOUNDA;
8201                     break;
8202                 case REGEX_DEPENDS_CHARSET:
8203                     op = BOUND;
8204                     break;
8205                 default:
8206                     goto bad_charset;
8207             }
8208             ret = reg_node(pRExC_state, op);
8209             FLAGS(ret) = get_regex_charset(RExC_flags);
8210             *flagp |= SIMPLE;
8211             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8212                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8213             }
8214             goto finish_meta_pat;
8215         case 'B':
8216             RExC_seen_zerolen++;
8217             RExC_seen |= REG_SEEN_LOOKBEHIND;
8218             switch (get_regex_charset(RExC_flags)) {
8219                 case REGEX_LOCALE_CHARSET:
8220                     op = NBOUNDL;
8221                     break;
8222                 case REGEX_UNICODE_CHARSET:
8223                     op = NBOUNDU;
8224                     break;
8225                 case REGEX_ASCII_RESTRICTED_CHARSET:
8226                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8227                     op = NBOUNDA;
8228                     break;
8229                 case REGEX_DEPENDS_CHARSET:
8230                     op = NBOUND;
8231                     break;
8232                 default:
8233                     goto bad_charset;
8234             }
8235             ret = reg_node(pRExC_state, op);
8236             FLAGS(ret) = get_regex_charset(RExC_flags);
8237             *flagp |= SIMPLE;
8238             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8239                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8240             }
8241             goto finish_meta_pat;
8242         case 's':
8243             switch (get_regex_charset(RExC_flags)) {
8244                 case REGEX_LOCALE_CHARSET:
8245                     op = SPACEL;
8246                     break;
8247                 case REGEX_UNICODE_CHARSET:
8248                     op = SPACEU;
8249                     break;
8250                 case REGEX_ASCII_RESTRICTED_CHARSET:
8251                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8252                     op = SPACEA;
8253                     break;
8254                 case REGEX_DEPENDS_CHARSET:
8255                     op = SPACE;
8256                     break;
8257                 default:
8258                     goto bad_charset;
8259             }
8260             ret = reg_node(pRExC_state, op);
8261             *flagp |= HASWIDTH|SIMPLE;
8262             goto finish_meta_pat;
8263         case 'S':
8264             switch (get_regex_charset(RExC_flags)) {
8265                 case REGEX_LOCALE_CHARSET:
8266                     op = NSPACEL;
8267                     break;
8268                 case REGEX_UNICODE_CHARSET:
8269                     op = NSPACEU;
8270                     break;
8271                 case REGEX_ASCII_RESTRICTED_CHARSET:
8272                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8273                     op = NSPACEA;
8274                     break;
8275                 case REGEX_DEPENDS_CHARSET:
8276                     op = NSPACE;
8277                     break;
8278                 default:
8279                     goto bad_charset;
8280             }
8281             ret = reg_node(pRExC_state, op);
8282             *flagp |= HASWIDTH|SIMPLE;
8283             goto finish_meta_pat;
8284         case 'd':
8285             switch (get_regex_charset(RExC_flags)) {
8286                 case REGEX_LOCALE_CHARSET:
8287                     op = DIGITL;
8288                     break;
8289                 case REGEX_ASCII_RESTRICTED_CHARSET:
8290                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8291                     op = DIGITA;
8292                     break;
8293                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8294                 case REGEX_UNICODE_CHARSET:
8295                     op = DIGIT;
8296                     break;
8297                 default:
8298                     goto bad_charset;
8299             }
8300             ret = reg_node(pRExC_state, op);
8301             *flagp |= HASWIDTH|SIMPLE;
8302             goto finish_meta_pat;
8303         case 'D':
8304             switch (get_regex_charset(RExC_flags)) {
8305                 case REGEX_LOCALE_CHARSET:
8306                     op = NDIGITL;
8307                     break;
8308                 case REGEX_ASCII_RESTRICTED_CHARSET:
8309                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8310                     op = NDIGITA;
8311                     break;
8312                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8313                 case REGEX_UNICODE_CHARSET:
8314                     op = NDIGIT;
8315                     break;
8316                 default:
8317                     goto bad_charset;
8318             }
8319             ret = reg_node(pRExC_state, op);
8320             *flagp |= HASWIDTH|SIMPLE;
8321             goto finish_meta_pat;
8322         case 'R':
8323             ret = reg_node(pRExC_state, LNBREAK);
8324             *flagp |= HASWIDTH|SIMPLE;
8325             goto finish_meta_pat;
8326         case 'h':
8327             ret = reg_node(pRExC_state, HORIZWS);
8328             *flagp |= HASWIDTH|SIMPLE;
8329             goto finish_meta_pat;
8330         case 'H':
8331             ret = reg_node(pRExC_state, NHORIZWS);
8332             *flagp |= HASWIDTH|SIMPLE;
8333             goto finish_meta_pat;
8334         case 'v':
8335             ret = reg_node(pRExC_state, VERTWS);
8336             *flagp |= HASWIDTH|SIMPLE;
8337             goto finish_meta_pat;
8338         case 'V':
8339             ret = reg_node(pRExC_state, NVERTWS);
8340             *flagp |= HASWIDTH|SIMPLE;
8341          finish_meta_pat:
8342             nextchar(pRExC_state);
8343             Set_Node_Length(ret, 2); /* MJD */
8344             break;
8345         case 'p':
8346         case 'P':
8347             {
8348                 char* const oldregxend = RExC_end;
8349 #ifdef DEBUGGING
8350                 char* parse_start = RExC_parse - 2;
8351 #endif
8352
8353                 if (RExC_parse[1] == '{') {
8354                   /* a lovely hack--pretend we saw [\pX] instead */
8355                     RExC_end = strchr(RExC_parse, '}');
8356                     if (!RExC_end) {
8357                         const U8 c = (U8)*RExC_parse;
8358                         RExC_parse += 2;
8359                         RExC_end = oldregxend;
8360                         vFAIL2("Missing right brace on \\%c{}", c);
8361                     }
8362                     RExC_end++;
8363                 }
8364                 else {
8365                     RExC_end = RExC_parse + 2;
8366                     if (RExC_end > oldregxend)
8367                         RExC_end = oldregxend;
8368                 }
8369                 RExC_parse--;
8370
8371                 ret = regclass(pRExC_state,depth+1);
8372
8373                 RExC_end = oldregxend;
8374                 RExC_parse--;
8375
8376                 Set_Node_Offset(ret, parse_start + 2);
8377                 Set_Node_Cur_Length(ret);
8378                 nextchar(pRExC_state);
8379                 *flagp |= HASWIDTH|SIMPLE;
8380             }
8381             break;
8382         case 'N':
8383             /* Handle \N and \N{NAME} here and not below because it can be
8384             multicharacter. join_exact() will join them up later on.
8385             Also this makes sure that things like /\N{BLAH}+/ and
8386             \N{BLAH} being multi char Just Happen. dmq*/
8387             ++RExC_parse;
8388             ret= reg_namedseq(pRExC_state, NULL, flagp);
8389             break;
8390         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8391         parse_named_seq:
8392         {
8393             char ch= RExC_parse[1];
8394             if (ch != '<' && ch != '\'' && ch != '{') {
8395                 RExC_parse++;
8396                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8397             } else {
8398                 /* this pretty much dupes the code for (?P=...) in reg(), if
8399                    you change this make sure you change that */
8400                 char* name_start = (RExC_parse += 2);
8401                 U32 num = 0;
8402                 SV *sv_dat = reg_scan_name(pRExC_state,
8403                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8404                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8405                 if (RExC_parse == name_start || *RExC_parse != ch)
8406                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8407
8408                 if (!SIZE_ONLY) {
8409                     num = add_data( pRExC_state, 1, "S" );
8410                     RExC_rxi->data->data[num]=(void*)sv_dat;
8411                     SvREFCNT_inc_simple_void(sv_dat);
8412                 }
8413
8414                 RExC_sawback = 1;
8415                 ret = reganode(pRExC_state,
8416                                ((! FOLD)
8417                                  ? NREF
8418                                  : (MORE_ASCII_RESTRICTED)
8419                                    ? NREFFA
8420                                    : (AT_LEAST_UNI_SEMANTICS)
8421                                      ? NREFFU
8422                                      : (LOC)
8423                                        ? NREFFL
8424                                        : NREFF),
8425                                 num);
8426                 *flagp |= HASWIDTH;
8427
8428                 /* override incorrect value set in reganode MJD */
8429                 Set_Node_Offset(ret, parse_start+1);
8430                 Set_Node_Cur_Length(ret); /* MJD */
8431                 nextchar(pRExC_state);
8432
8433             }
8434             break;
8435         }
8436         case 'g':
8437         case '1': case '2': case '3': case '4':
8438         case '5': case '6': case '7': case '8': case '9':
8439             {
8440                 I32 num;
8441                 bool isg = *RExC_parse == 'g';
8442                 bool isrel = 0;
8443                 bool hasbrace = 0;
8444                 if (isg) {
8445                     RExC_parse++;
8446                     if (*RExC_parse == '{') {
8447                         RExC_parse++;
8448                         hasbrace = 1;
8449                     }
8450                     if (*RExC_parse == '-') {
8451                         RExC_parse++;
8452                         isrel = 1;
8453                     }
8454                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8455                         if (isrel) RExC_parse--;
8456                         RExC_parse -= 2;
8457                         goto parse_named_seq;
8458                 }   }
8459                 num = atoi(RExC_parse);
8460                 if (isg && num == 0)
8461                     vFAIL("Reference to invalid group 0");
8462                 if (isrel) {
8463                     num = RExC_npar - num;
8464                     if (num < 1)
8465                         vFAIL("Reference to nonexistent or unclosed group");
8466                 }
8467                 if (!isg && num > 9 && num >= RExC_npar)
8468                     goto defchar;
8469                 else {
8470                     char * const parse_start = RExC_parse - 1; /* MJD */
8471                     while (isDIGIT(*RExC_parse))
8472                         RExC_parse++;
8473                     if (parse_start == RExC_parse - 1)
8474                         vFAIL("Unterminated \\g... pattern");
8475                     if (hasbrace) {
8476                         if (*RExC_parse != '}')
8477                             vFAIL("Unterminated \\g{...} pattern");
8478                         RExC_parse++;
8479                     }
8480                     if (!SIZE_ONLY) {
8481                         if (num > (I32)RExC_rx->nparens)
8482                             vFAIL("Reference to nonexistent group");
8483                     }
8484                     RExC_sawback = 1;
8485                     ret = reganode(pRExC_state,
8486                                    ((! FOLD)
8487                                      ? REF
8488                                      : (MORE_ASCII_RESTRICTED)
8489                                        ? REFFA
8490                                        : (AT_LEAST_UNI_SEMANTICS)
8491                                          ? REFFU
8492                                          : (LOC)
8493                                            ? REFFL
8494                                            : REFF),
8495                                     num);
8496                     *flagp |= HASWIDTH;
8497
8498                     /* override incorrect value set in reganode MJD */
8499                     Set_Node_Offset(ret, parse_start+1);
8500                     Set_Node_Cur_Length(ret); /* MJD */
8501                     RExC_parse--;
8502                     nextchar(pRExC_state);
8503                 }
8504             }
8505             break;
8506         case '\0':
8507             if (RExC_parse >= RExC_end)
8508                 FAIL("Trailing \\");
8509             /* FALL THROUGH */
8510         default:
8511             /* Do not generate "unrecognized" warnings here, we fall
8512                back into the quick-grab loop below */
8513             parse_start--;
8514             goto defchar;
8515         }
8516         break;
8517
8518     case '#':
8519         if (RExC_flags & RXf_PMf_EXTENDED) {
8520             if ( reg_skipcomment( pRExC_state ) )
8521                 goto tryagain;
8522         }
8523         /* FALL THROUGH */
8524
8525     default:
8526         outer_default:{
8527             register STRLEN len;
8528             register UV ender;
8529             register char *p;
8530             char *s;
8531             STRLEN foldlen;
8532             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8533             regnode * orig_emit;
8534
8535             parse_start = RExC_parse - 1;
8536
8537             RExC_parse++;
8538
8539         defchar:
8540             ender = 0;
8541             orig_emit = RExC_emit; /* Save the original output node position in
8542                                       case we need to output a different node
8543                                       type */
8544             ret = reg_node(pRExC_state,
8545                            (U8) ((! FOLD) ? EXACT
8546                                           : (LOC)
8547                                              ? EXACTFL
8548                                              : (MORE_ASCII_RESTRICTED)
8549                                                ? EXACTFA
8550                                                : (AT_LEAST_UNI_SEMANTICS)
8551                                                  ? EXACTFU
8552                                                  : EXACTF)
8553                     );
8554             s = STRING(ret);
8555             for (len = 0, p = RExC_parse - 1;
8556               len < 127 && p < RExC_end;
8557               len++)
8558             {
8559                 char * const oldp = p;
8560
8561                 if (RExC_flags & RXf_PMf_EXTENDED)
8562                     p = regwhite( pRExC_state, p );
8563                 switch ((U8)*p) {
8564                 case LATIN_SMALL_LETTER_SHARP_S:
8565                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8566                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8567                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8568                                 goto normal_default;
8569                 case '^':
8570                 case '$':
8571                 case '.':
8572                 case '[':
8573                 case '(':
8574                 case ')':
8575                 case '|':
8576                     goto loopdone;
8577                 case '\\':
8578                     /* Literal Escapes Switch
8579
8580                        This switch is meant to handle escape sequences that
8581                        resolve to a literal character.
8582
8583                        Every escape sequence that represents something
8584                        else, like an assertion or a char class, is handled
8585                        in the switch marked 'Special Escapes' above in this
8586                        routine, but also has an entry here as anything that
8587                        isn't explicitly mentioned here will be treated as
8588                        an unescaped equivalent literal.
8589                     */
8590
8591                     switch ((U8)*++p) {
8592                     /* These are all the special escapes. */
8593                     case LATIN_SMALL_LETTER_SHARP_S:
8594                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8595                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8596                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8597                                 goto normal_default;
8598                     case 'A':             /* Start assertion */
8599                     case 'b': case 'B':   /* Word-boundary assertion*/
8600                     case 'C':             /* Single char !DANGEROUS! */
8601                     case 'd': case 'D':   /* digit class */
8602                     case 'g': case 'G':   /* generic-backref, pos assertion */
8603                     case 'h': case 'H':   /* HORIZWS */
8604                     case 'k': case 'K':   /* named backref, keep marker */
8605                     case 'N':             /* named char sequence */
8606                     case 'p': case 'P':   /* Unicode property */
8607                               case 'R':   /* LNBREAK */
8608                     case 's': case 'S':   /* space class */
8609                     case 'v': case 'V':   /* VERTWS */
8610                     case 'w': case 'W':   /* word class */
8611                     case 'X':             /* eXtended Unicode "combining character sequence" */
8612                     case 'z': case 'Z':   /* End of line/string assertion */
8613                         --p;
8614                         goto loopdone;
8615
8616                     /* Anything after here is an escape that resolves to a
8617                        literal. (Except digits, which may or may not)
8618                      */
8619                     case 'n':
8620                         ender = '\n';
8621                         p++;
8622                         break;
8623                     case 'r':
8624                         ender = '\r';
8625                         p++;
8626                         break;
8627                     case 't':
8628                         ender = '\t';
8629                         p++;
8630                         break;
8631                     case 'f':
8632                         ender = '\f';
8633                         p++;
8634                         break;
8635                     case 'e':
8636                           ender = ASCII_TO_NATIVE('\033');
8637                         p++;
8638                         break;
8639                     case 'a':
8640                           ender = ASCII_TO_NATIVE('\007');
8641                         p++;
8642                         break;
8643                     case 'o':
8644                         {
8645                             STRLEN brace_len = len;
8646                             UV result;
8647                             const char* error_msg;
8648
8649                             bool valid = grok_bslash_o(p,
8650                                                        &result,
8651                                                        &brace_len,
8652                                                        &error_msg,
8653                                                        1);
8654                             p += brace_len;
8655                             if (! valid) {
8656                                 RExC_parse = p; /* going to die anyway; point
8657                                                    to exact spot of failure */
8658                                 vFAIL(error_msg);
8659                             }
8660                             else
8661                             {
8662                                 ender = result;
8663                             }
8664                             if (PL_encoding && ender < 0x100) {
8665                                 goto recode_encoding;
8666                             }
8667                             if (ender > 0xff) {
8668                                 REQUIRE_UTF8;
8669                             }
8670                             break;
8671                         }
8672                     case 'x':
8673                         if (*++p == '{') {
8674                             char* const e = strchr(p, '}');
8675
8676                             if (!e) {
8677                                 RExC_parse = p + 1;
8678                                 vFAIL("Missing right brace on \\x{}");
8679                             }
8680                             else {
8681                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8682                                     | PERL_SCAN_DISALLOW_PREFIX;
8683                                 STRLEN numlen = e - p - 1;
8684                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8685                                 if (ender > 0xff)
8686                                     REQUIRE_UTF8;
8687                                 p = e + 1;
8688                             }
8689                         }
8690                         else {
8691                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8692                             STRLEN numlen = 2;
8693                             ender = grok_hex(p, &numlen, &flags, NULL);
8694                             p += numlen;
8695                         }
8696                         if (PL_encoding && ender < 0x100)
8697                             goto recode_encoding;
8698                         break;
8699                     case 'c':
8700                         p++;
8701                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8702                         break;
8703                     case '0': case '1': case '2': case '3':case '4':
8704                     case '5': case '6': case '7': case '8':case '9':
8705                         if (*p == '0' ||
8706                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8707                         {
8708                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8709                             STRLEN numlen = 3;
8710                             ender = grok_oct(p, &numlen, &flags, NULL);
8711                             if (ender > 0xff) {
8712                                 REQUIRE_UTF8;
8713                             }
8714                             p += numlen;
8715                         }
8716                         else {
8717                             --p;
8718                             goto loopdone;
8719                         }
8720                         if (PL_encoding && ender < 0x100)
8721                             goto recode_encoding;
8722                         break;
8723                     recode_encoding:
8724                         {
8725                             SV* enc = PL_encoding;
8726                             ender = reg_recode((const char)(U8)ender, &enc);
8727                             if (!enc && SIZE_ONLY)
8728                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8729                             REQUIRE_UTF8;
8730                         }
8731                         break;
8732                     case '\0':
8733                         if (p >= RExC_end)
8734                             FAIL("Trailing \\");
8735                         /* FALL THROUGH */
8736                     default:
8737                         if (!SIZE_ONLY&& isALPHA(*p)) {
8738                             /* Include any { following the alpha to emphasize
8739                              * that it could be part of an escape at some point
8740                              * in the future */
8741                             int len = (*(p + 1) == '{') ? 2 : 1;
8742                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8743                         }
8744                         goto normal_default;
8745                     }
8746                     break;
8747                 default:
8748                   normal_default:
8749                     if (UTF8_IS_START(*p) && UTF) {
8750                         STRLEN numlen;
8751                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8752                                                &numlen, UTF8_ALLOW_DEFAULT);
8753                         p += numlen;
8754                     }
8755                     else
8756                         ender = (U8) *p++;
8757                     break;
8758                 } /* End of switch on the literal */
8759
8760                 /* Certain characters are problematic because their folded
8761                  * length is so different from their original length that it
8762                  * isn't handleable by the optimizer.  They are therefore not
8763                  * placed in an EXACTish node; and are here handled specially.
8764                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8765                  * putting it in a special node keeps regexec from having to
8766                  * deal with a non-utf8 multi-char fold */
8767                 if (FOLD
8768                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8769                     && is_TRICKYFOLD_cp(ender))
8770                 {
8771                     /* If is in middle of outputting characters into an
8772                      * EXACTish node, go output what we have so far, and
8773                      * position the parse so that this will be called again
8774                      * immediately */
8775                     if (len) {
8776                         p  = RExC_parse + len - 1;
8777                         goto loopdone;
8778                     }
8779                     else {
8780
8781                         /* Here we are ready to output our tricky fold
8782                          * character.  What's done is to pretend it's in a
8783                          * [bracketed] class, and let the code that deals with
8784                          * those handle it, as that code has all the
8785                          * intelligence necessary.  First save the current
8786                          * parse state, get rid of the already allocated EXACT
8787                          * node that the ANYOFV node will replace, and point
8788                          * the parse to a buffer which we fill with the
8789                          * character we want the regclass code to think is
8790                          * being parsed */
8791                         char* const oldregxend = RExC_end;
8792                         char tmpbuf[2];
8793                         RExC_emit = orig_emit;
8794                         RExC_parse = tmpbuf;
8795                         if (UTF) {
8796                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8797                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8798                             RExC_end = RExC_parse + 2;
8799                         }
8800                         else {
8801                             tmpbuf[0] = (char) ender;
8802                             RExC_end = RExC_parse + 1;
8803                         }
8804
8805                         ret = regclass(pRExC_state,depth+1);
8806
8807                         /* Here, have parsed the buffer.  Reset the parse to
8808                          * the actual input, and return */
8809                         RExC_end = oldregxend;
8810                         RExC_parse = p - 1;
8811
8812                         Set_Node_Offset(ret, RExC_parse);
8813                         Set_Node_Cur_Length(ret);
8814                         nextchar(pRExC_state);
8815                         *flagp |= HASWIDTH|SIMPLE;
8816                         return ret;
8817                     }
8818                 }
8819
8820                 if ( RExC_flags & RXf_PMf_EXTENDED)
8821                     p = regwhite( pRExC_state, p );
8822                 if (UTF && FOLD) {
8823                     /* Prime the casefolded buffer.  Locale rules, which apply
8824                      * only to code points < 256, aren't known until execution,
8825                      * so for them, just output the original character using
8826                      * utf8 */
8827                     if (LOC && ender < 256) {
8828                         if (UNI_IS_INVARIANT(ender)) {
8829                             *tmpbuf = (U8) ender;
8830                             foldlen = 1;
8831                         } else {
8832                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8833                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8834                             foldlen = 2;
8835                         }
8836                     }
8837                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8838                                                  */
8839                         ender = toLOWER(ender);
8840                         *tmpbuf = (U8) ender;
8841                         foldlen = 1;
8842                     }
8843                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8844
8845                         /* Locale and /aa require more selectivity about the
8846                          * fold, so are handled below.  Otherwise, here, just
8847                          * use the fold */
8848                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8849                     }
8850                     else {
8851                         /* Under locale rules or /aa we are not to mix,
8852                          * respectively, ords < 256 or ASCII with non-.  So
8853                          * reject folds that mix them, using only the
8854                          * non-folded code point.  So do the fold to a
8855                          * temporary, and inspect each character in it. */
8856                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8857                         U8* s = trialbuf;
8858                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8859                         U8* e = s + foldlen;
8860                         bool fold_ok = TRUE;
8861
8862                         while (s < e) {
8863                             if (isASCII(*s)
8864                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8865                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8866                             {
8867                                 fold_ok = FALSE;
8868                                 break;
8869                             }
8870                             s += UTF8SKIP(s);
8871                         }
8872                         if (fold_ok) {
8873                             Copy(trialbuf, tmpbuf, foldlen, U8);
8874                             ender = tmpender;
8875                         }
8876                         else {
8877                             uvuni_to_utf8(tmpbuf, ender);
8878                             foldlen = UNISKIP(ender);
8879                         }
8880                     }
8881                 }
8882                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8883                     if (len)
8884                         p = oldp;
8885                     else if (UTF) {
8886                          if (FOLD) {
8887                               /* Emit all the Unicode characters. */
8888                               STRLEN numlen;
8889                               for (foldbuf = tmpbuf;
8890                                    foldlen;
8891                                    foldlen -= numlen) {
8892                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8893                                    if (numlen > 0) {
8894                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8895                                         s       += unilen;
8896                                         len     += unilen;
8897                                         /* In EBCDIC the numlen
8898                                          * and unilen can differ. */
8899                                         foldbuf += numlen;
8900                                         if (numlen >= foldlen)
8901                                              break;
8902                                    }
8903                                    else
8904                                         break; /* "Can't happen." */
8905                               }
8906                          }
8907                          else {
8908                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8909                               if (unilen > 0) {
8910                                    s   += unilen;
8911                                    len += unilen;
8912                               }
8913                          }
8914                     }
8915                     else {
8916                         len++;
8917                         REGC((char)ender, s++);
8918                     }
8919                     break;
8920                 }
8921                 if (UTF) {
8922                      if (FOLD) {
8923                           /* Emit all the Unicode characters. */
8924                           STRLEN numlen;
8925                           for (foldbuf = tmpbuf;
8926                                foldlen;
8927                                foldlen -= numlen) {
8928                                ender = utf8_to_uvchr(foldbuf, &numlen);
8929                                if (numlen > 0) {
8930                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8931                                     len     += unilen;
8932                                     s       += unilen;
8933                                     /* In EBCDIC the numlen
8934                                      * and unilen can differ. */
8935                                     foldbuf += numlen;
8936                                     if (numlen >= foldlen)
8937                                          break;
8938                                }
8939                                else
8940                                     break;
8941                           }
8942                      }
8943                      else {
8944                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8945                           if (unilen > 0) {
8946                                s   += unilen;
8947                                len += unilen;
8948                           }
8949                      }
8950                      len--;
8951                 }
8952                 else
8953                     REGC((char)ender, s++);
8954             }
8955         loopdone:   /* Jumped to when encounters something that shouldn't be in
8956                        the node */
8957             RExC_parse = p - 1;
8958             Set_Node_Cur_Length(ret); /* MJD */
8959             nextchar(pRExC_state);
8960             {
8961                 /* len is STRLEN which is unsigned, need to copy to signed */
8962                 IV iv = len;
8963                 if (iv < 0)
8964                     vFAIL("Internal disaster");
8965             }
8966             if (len > 0)
8967                 *flagp |= HASWIDTH;
8968             if (len == 1 && UNI_IS_INVARIANT(ender))
8969                 *flagp |= SIMPLE;
8970
8971             if (SIZE_ONLY)
8972                 RExC_size += STR_SZ(len);
8973             else {
8974                 STR_LEN(ret) = len;
8975                 RExC_emit += STR_SZ(len);
8976             }
8977         }
8978         break;
8979     }
8980
8981     return(ret);
8982
8983 /* Jumped to when an unrecognized character set is encountered */
8984 bad_charset:
8985     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8986     return(NULL);
8987 }
8988
8989 STATIC char *
8990 S_regwhite( RExC_state_t *pRExC_state, char *p )
8991 {
8992     const char *e = RExC_end;
8993
8994     PERL_ARGS_ASSERT_REGWHITE;
8995
8996     while (p < e) {
8997         if (isSPACE(*p))
8998             ++p;
8999         else if (*p == '#') {
9000             bool ended = 0;
9001             do {
9002                 if (*p++ == '\n') {
9003                     ended = 1;
9004                     break;
9005                 }
9006             } while (p < e);
9007             if (!ended)
9008                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9009         }
9010         else
9011             break;
9012     }
9013     return p;
9014 }
9015
9016 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9017    Character classes ([:foo:]) can also be negated ([:^foo:]).
9018    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9019    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9020    but trigger failures because they are currently unimplemented. */
9021
9022 #define POSIXCC_DONE(c)   ((c) == ':')
9023 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9024 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9025
9026 STATIC I32
9027 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9028 {
9029     dVAR;
9030     I32 namedclass = OOB_NAMEDCLASS;
9031
9032     PERL_ARGS_ASSERT_REGPPOSIXCC;
9033
9034     if (value == '[' && RExC_parse + 1 < RExC_end &&
9035         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9036         POSIXCC(UCHARAT(RExC_parse))) {
9037         const char c = UCHARAT(RExC_parse);
9038         char* const s = RExC_parse++;
9039
9040         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9041             RExC_parse++;
9042         if (RExC_parse == RExC_end)
9043             /* Grandfather lone [:, [=, [. */
9044             RExC_parse = s;
9045         else {
9046             const char* const t = RExC_parse++; /* skip over the c */
9047             assert(*t == c);
9048
9049             if (UCHARAT(RExC_parse) == ']') {
9050                 const char *posixcc = s + 1;
9051                 RExC_parse++; /* skip over the ending ] */
9052
9053                 if (*s == ':') {
9054                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9055                     const I32 skip = t - posixcc;
9056
9057                     /* Initially switch on the length of the name.  */
9058                     switch (skip) {
9059                     case 4:
9060                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9061                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9062                         break;
9063                     case 5:
9064                         /* Names all of length 5.  */
9065                         /* alnum alpha ascii blank cntrl digit graph lower
9066                            print punct space upper  */
9067                         /* Offset 4 gives the best switch position.  */
9068                         switch (posixcc[4]) {
9069                         case 'a':
9070                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9071                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9072                             break;
9073                         case 'e':
9074                             if (memEQ(posixcc, "spac", 4)) /* space */
9075                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9076                             break;
9077                         case 'h':
9078                             if (memEQ(posixcc, "grap", 4)) /* graph */
9079                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9080                             break;
9081                         case 'i':
9082                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9083                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9084                             break;
9085                         case 'k':
9086                             if (memEQ(posixcc, "blan", 4)) /* blank */
9087                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9088                             break;
9089                         case 'l':
9090                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9091                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9092                             break;
9093                         case 'm':
9094                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9095                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9096                             break;
9097                         case 'r':
9098                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9099                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9100                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9101                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9102                             break;
9103                         case 't':
9104                             if (memEQ(posixcc, "digi", 4)) /* digit */
9105                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9106                             else if (memEQ(posixcc, "prin", 4)) /* print */
9107                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9108                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9109                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9110                             break;
9111                         }
9112                         break;
9113                     case 6:
9114                         if (memEQ(posixcc, "xdigit", 6))
9115                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9116                         break;
9117                     }
9118
9119                     if (namedclass == OOB_NAMEDCLASS)
9120                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9121                                       t - s - 1, s + 1);
9122                     assert (posixcc[skip] == ':');
9123                     assert (posixcc[skip+1] == ']');
9124                 } else if (!SIZE_ONLY) {
9125                     /* [[=foo=]] and [[.foo.]] are still future. */
9126
9127                     /* adjust RExC_parse so the warning shows after
9128                        the class closes */
9129                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9130                         RExC_parse++;
9131                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9132                 }
9133             } else {
9134                 /* Maternal grandfather:
9135                  * "[:" ending in ":" but not in ":]" */
9136                 RExC_parse = s;
9137             }
9138         }
9139     }
9140
9141     return namedclass;
9142 }
9143
9144 STATIC void
9145 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9146 {
9147     dVAR;
9148
9149     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9150
9151     if (POSIXCC(UCHARAT(RExC_parse))) {
9152         const char *s = RExC_parse;
9153         const char  c = *s++;
9154
9155         while (isALNUM(*s))
9156             s++;
9157         if (*s && c == *s && s[1] == ']') {
9158             ckWARN3reg(s+2,
9159                        "POSIX syntax [%c %c] belongs inside character classes",
9160                        c, c);
9161
9162             /* [[=foo=]] and [[.foo.]] are still future. */
9163             if (POSIXCC_NOTYET(c)) {
9164                 /* adjust RExC_parse so the error shows after
9165                    the class closes */
9166                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9167                     NOOP;
9168                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9169             }
9170         }
9171     }
9172 }
9173
9174 /* No locale test, and always Unicode semantics */
9175 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9176 ANYOF_##NAME:                                                                  \
9177         for (value = 0; value < 256; value++)                                  \
9178             if (TEST)                                                          \
9179             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9180     yesno = '+';                                                               \
9181     what = WORD;                                                               \
9182     break;                                                                     \
9183 case ANYOF_N##NAME:                                                            \
9184         for (value = 0; value < 256; value++)                                  \
9185             if (!TEST)                                                         \
9186             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9187     yesno = '!';                                                               \
9188     what = WORD;                                                               \
9189     break
9190
9191 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9192  * there are two tests passed in, to use depending on that. There aren't any
9193  * cases where the label is different from the name, so no need for that
9194  * parameter */
9195 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9196 ANYOF_##NAME:                                                                  \
9197     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9198     else if (UNI_SEMANTICS) {                                                  \
9199         for (value = 0; value < 256; value++) {                                \
9200             if (TEST_8(value)) stored +=                                       \
9201                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9202         }                                                                      \
9203     }                                                                          \
9204     else {                                                                     \
9205         for (value = 0; value < 128; value++) {                                \
9206             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9207                 set_regclass_bit(pRExC_state, ret,                     \
9208                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9209         }                                                                      \
9210     }                                                                          \
9211     yesno = '+';                                                               \
9212     what = WORD;                                                               \
9213     break;                                                                     \
9214 case ANYOF_N##NAME:                                                            \
9215     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9216     else if (UNI_SEMANTICS) {                                                  \
9217         for (value = 0; value < 256; value++) {                                \
9218             if (! TEST_8(value)) stored +=                                     \
9219                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9220         }                                                                      \
9221     }                                                                          \
9222     else {                                                                     \
9223         for (value = 0; value < 128; value++) {                                \
9224             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9225                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9226         }                                                                      \
9227         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9228             for (value = 128; value < 256; value++) {                          \
9229              stored += set_regclass_bit(                                     \
9230                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9231             }                                                                  \
9232             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9233         }                                                                      \
9234         else {                                                                 \
9235             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9236              * ASCII Latin1 code points match the complement of any of the     \
9237              * classes.  But in utf8, they have their Unicode semantics, so    \
9238              * can't just set them in the bitmap, or else regexec.c will think \
9239              * they matched when they shouldn't. */                            \
9240             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9241         }                                                                      \
9242     }                                                                          \
9243     yesno = '!';                                                               \
9244     what = WORD;                                                               \
9245     break
9246
9247 STATIC U8
9248 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9249 {
9250
9251     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9252      * Locale folding is done at run-time, so this function should not be
9253      * called for nodes that are for locales.
9254      *
9255      * This function sets the bit corresponding to the fold of the input
9256      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9257      * 'F' is 'f'.
9258      *
9259      * It also knows about the characters that are in the bitmap that have
9260      * folds that are matchable only outside it, and sets the appropriate lists
9261      * and flags.
9262      *
9263      * It returns the number of bits that actually changed from 0 to 1 */
9264
9265     U8 stored = 0;
9266     U8 fold;
9267
9268     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9269
9270     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9271                                     : PL_fold[value];
9272
9273     /* It assumes the bit for 'value' has already been set */
9274     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9275         ANYOF_BITMAP_SET(node, fold);
9276         stored++;
9277     }
9278     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9279         /* Certain Latin1 characters have matches outside the bitmap.  To get
9280          * here, 'value' is one of those characters.   None of these matches is
9281          * valid for ASCII characters under /aa, which have been excluded by
9282          * the 'if' above.  The matches fall into three categories:
9283          * 1) They are singly folded-to or -from an above 255 character, as
9284          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9285          *    WITH DIAERESIS;
9286          * 2) They are part of a multi-char fold with another character in the
9287          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9288          * 3) They are part of a multi-char fold with a character not in the
9289          *    bitmap, such as various ligatures.
9290          * We aren't dealing fully with multi-char folds, except we do deal
9291          * with the pattern containing a character that has a multi-char fold
9292          * (not so much the inverse).
9293          * For types 1) and 3), the matches only happen when the target string
9294          * is utf8; that's not true for 2), and we set a flag for it.
9295          *
9296          * The code below adds to the passed in inversion list the single fold
9297          * closures for 'value'.  The values are hard-coded here so that an
9298          * innocent-looking character class, like /[ks]/i won't have to go out
9299          * to disk to find the possible matches.  XXX It would be better to
9300          * generate these via regen, in case a new version of the Unicode
9301          * standard adds new mappings, though that is not really likely. */
9302         switch (value) {
9303             case 'k':
9304             case 'K':
9305                 /* KELVIN SIGN */
9306                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9307                 break;
9308             case 's':
9309             case 'S':
9310                 /* LATIN SMALL LETTER LONG S */
9311                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9312                 break;
9313             case MICRO_SIGN:
9314                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9315                                                  GREEK_SMALL_LETTER_MU);
9316                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9317                                                  GREEK_CAPITAL_LETTER_MU);
9318                 break;
9319             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9320             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9321                 /* ANGSTROM SIGN */
9322                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9323                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9324                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9325                                                      PL_fold_latin1[value]);
9326                 }
9327                 break;
9328             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9329                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9330                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9331                 break;
9332             case LATIN_SMALL_LETTER_SHARP_S:
9333
9334                 /* Under /d and /u, this can match the two chars "ss" */
9335                 if (! MORE_ASCII_RESTRICTED) {
9336                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
9337
9338                     /* And under /u, it can match even if the target is not
9339                      * utf8 */
9340                     if (UNI_SEMANTICS) {
9341                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9342                     }
9343                 }
9344                 break;
9345             case 'F': case 'f':
9346             case 'I': case 'i':
9347             case 'L': case 'l':
9348             case 'T': case 't':
9349                 /* These all are targets of multi-character folds, which can
9350                  * occur with only non-Latin1 characters in the fold, so they
9351                  * can match if the target string isn't UTF-8 */
9352                 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9353                 break;
9354             case 'A': case 'a':
9355             case 'H': case 'h':
9356             case 'J': case 'j':
9357             case 'N': case 'n':
9358             case 'W': case 'w':
9359             case 'Y': case 'y':
9360                 /* These all are targets of multi-character folds, which occur
9361                  * only with a non-Latin1 character as part of the fold, so
9362                  * they can't match unless the target string is in UTF-8, so no
9363                  * action here is necessary */
9364                 break;
9365             default:
9366                 /* Use deprecated warning to increase the chances of this
9367                  * being output */
9368                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9369                 break;
9370         }
9371     }
9372     else if (DEPENDS_SEMANTICS
9373             && ! isASCII(value)
9374             && PL_fold_latin1[value] != value)
9375     {
9376            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9377             * folds only when the target string is in UTF-8.  We add the fold
9378             * here to the list of things to match outside the bitmap, which
9379             * won't be looked at unless it is UTF8 (or else if something else
9380             * says to look even if not utf8, but those things better not happen
9381             * under DEPENDS semantics. */
9382         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9383     }
9384
9385     return stored;
9386 }
9387
9388
9389 PERL_STATIC_INLINE U8
9390 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9391 {
9392     /* This inline function sets a bit in the bitmap if not already set, and if
9393      * appropriate, its fold, returning the number of bits that actually
9394      * changed from 0 to 1 */
9395
9396     U8 stored;
9397
9398     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9399
9400     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9401         return 0;
9402     }
9403
9404     ANYOF_BITMAP_SET(node, value);
9405     stored = 1;
9406
9407     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9408         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9409     }
9410
9411     return stored;
9412 }
9413
9414 STATIC void
9415 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9416 {
9417     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9418      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9419      * the multi-character folds of characters in the node */
9420     SV *sv;
9421
9422     PERL_ARGS_ASSERT_ADD_ALTERNATE;
9423
9424     if (! *alternate_ptr) {
9425         *alternate_ptr = newAV();
9426     }
9427     sv = newSVpvn_utf8((char*)string, len, TRUE);
9428     av_push(*alternate_ptr, sv);
9429     return;
9430 }
9431
9432 /*
9433    parse a class specification and produce either an ANYOF node that
9434    matches the pattern or perhaps will be optimized into an EXACTish node
9435    instead. The node contains a bit map for the first 256 characters, with the
9436    corresponding bit set if that character is in the list.  For characters
9437    above 255, a range list is used */
9438
9439 STATIC regnode *
9440 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9441 {
9442     dVAR;
9443     register UV nextvalue;
9444     register IV prevvalue = OOB_UNICODE;
9445     register IV range = 0;
9446     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9447     register regnode *ret;
9448     STRLEN numlen;
9449     IV namedclass;
9450     char *rangebegin = NULL;
9451     bool need_class = 0;
9452     SV *listsv = NULL;
9453     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9454                                       than just initialized.  */
9455     UV n;
9456
9457     /* code points this node matches that can't be stored in the bitmap */
9458     HV* nonbitmap = NULL;
9459
9460     /* The items that are to match that aren't stored in the bitmap, but are a
9461      * result of things that are stored there.  This is the fold closure of
9462      * such a character, either because it has DEPENDS semantics and shouldn't
9463      * be matched unless the target string is utf8, or is a code point that is
9464      * too large for the bit map, as for example, the fold of the MICRO SIGN is
9465      * above 255.  This all is solely for performance reasons.  By having this
9466      * code know the outside-the-bitmap folds that the bitmapped characters are
9467      * involved with, we don't have to go out to disk to find the list of
9468      * matches, unless the character class includes code points that aren't
9469      * storable in the bit map.  That means that a character class with an 's'
9470      * in it, for example, doesn't need to go out to disk to find everything
9471      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9472      * empty unless there is something whose fold we don't know about, and will
9473      * have to go out to the disk to find. */
9474     HV* l1_fold_invlist = NULL;
9475
9476     /* List of multi-character folds that are matched by this node */
9477     AV* unicode_alternate  = NULL;
9478 #ifdef EBCDIC
9479     UV literal_endpoint = 0;
9480 #endif
9481     UV stored = 0;  /* how many chars stored in the bitmap */
9482
9483     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9484         case we need to change the emitted regop to an EXACT. */
9485     const char * orig_parse = RExC_parse;
9486     GET_RE_DEBUG_FLAGS_DECL;
9487
9488     PERL_ARGS_ASSERT_REGCLASS;
9489 #ifndef DEBUGGING
9490     PERL_UNUSED_ARG(depth);
9491 #endif
9492
9493     DEBUG_PARSE("clas");
9494
9495     /* Assume we are going to generate an ANYOF node. */
9496     ret = reganode(pRExC_state, ANYOF, 0);
9497
9498
9499     if (!SIZE_ONLY) {
9500         ANYOF_FLAGS(ret) = 0;
9501     }
9502
9503     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9504         RExC_naughty++;
9505         RExC_parse++;
9506         if (!SIZE_ONLY)
9507             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9508     }
9509
9510     if (SIZE_ONLY) {
9511         RExC_size += ANYOF_SKIP;
9512 #ifdef ANYOF_ADD_LOC_SKIP
9513         if (LOC) {
9514             RExC_size += ANYOF_ADD_LOC_SKIP;
9515         }
9516 #endif
9517         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9518     }
9519     else {
9520         RExC_emit += ANYOF_SKIP;
9521         if (LOC) {
9522             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9523 #ifdef ANYOF_ADD_LOC_SKIP
9524             RExC_emit += ANYOF_ADD_LOC_SKIP;
9525 #endif
9526         }
9527         ANYOF_BITMAP_ZERO(ret);
9528         listsv = newSVpvs("# comment\n");
9529         initial_listsv_len = SvCUR(listsv);
9530     }
9531
9532     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9533
9534     if (!SIZE_ONLY && POSIXCC(nextvalue))
9535         checkposixcc(pRExC_state);
9536
9537     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9538     if (UCHARAT(RExC_parse) == ']')
9539         goto charclassloop;
9540
9541 parseit:
9542     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9543
9544     charclassloop:
9545
9546         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9547
9548         if (!range)
9549             rangebegin = RExC_parse;
9550         if (UTF) {
9551             value = utf8n_to_uvchr((U8*)RExC_parse,
9552                                    RExC_end - RExC_parse,
9553                                    &numlen, UTF8_ALLOW_DEFAULT);
9554             RExC_parse += numlen;
9555         }
9556         else
9557             value = UCHARAT(RExC_parse++);
9558
9559         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9560         if (value == '[' && POSIXCC(nextvalue))
9561             namedclass = regpposixcc(pRExC_state, value);
9562         else if (value == '\\') {
9563             if (UTF) {
9564                 value = utf8n_to_uvchr((U8*)RExC_parse,
9565                                    RExC_end - RExC_parse,
9566                                    &numlen, UTF8_ALLOW_DEFAULT);
9567                 RExC_parse += numlen;
9568             }
9569             else
9570                 value = UCHARAT(RExC_parse++);
9571             /* Some compilers cannot handle switching on 64-bit integer
9572              * values, therefore value cannot be an UV.  Yes, this will
9573              * be a problem later if we want switch on Unicode.
9574              * A similar issue a little bit later when switching on
9575              * namedclass. --jhi */
9576             switch ((I32)value) {
9577             case 'w':   namedclass = ANYOF_ALNUM;       break;
9578             case 'W':   namedclass = ANYOF_NALNUM;      break;
9579             case 's':   namedclass = ANYOF_SPACE;       break;
9580             case 'S':   namedclass = ANYOF_NSPACE;      break;
9581             case 'd':   namedclass = ANYOF_DIGIT;       break;
9582             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9583             case 'v':   namedclass = ANYOF_VERTWS;      break;
9584             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9585             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9586             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9587             case 'N':  /* Handle \N{NAME} in class */
9588                 {
9589                     /* We only pay attention to the first char of
9590                     multichar strings being returned. I kinda wonder
9591                     if this makes sense as it does change the behaviour
9592                     from earlier versions, OTOH that behaviour was broken
9593                     as well. */
9594                     UV v; /* value is register so we cant & it /grrr */
9595                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9596                         goto parseit;
9597                     }
9598                     value= v;
9599                 }
9600                 break;
9601             case 'p':
9602             case 'P':
9603                 {
9604                 char *e;
9605                 if (RExC_parse >= RExC_end)
9606                     vFAIL2("Empty \\%c{}", (U8)value);
9607                 if (*RExC_parse == '{') {
9608                     const U8 c = (U8)value;
9609                     e = strchr(RExC_parse++, '}');
9610                     if (!e)
9611                         vFAIL2("Missing right brace on \\%c{}", c);
9612                     while (isSPACE(UCHARAT(RExC_parse)))
9613                         RExC_parse++;
9614                     if (e == RExC_parse)
9615                         vFAIL2("Empty \\%c{}", c);
9616                     n = e - RExC_parse;
9617                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9618                         n--;
9619                 }
9620                 else {
9621                     e = RExC_parse;
9622                     n = 1;
9623                 }
9624                 if (!SIZE_ONLY) {
9625                     if (UCHARAT(RExC_parse) == '^') {
9626                          RExC_parse++;
9627                          n--;
9628                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9629                          while (isSPACE(UCHARAT(RExC_parse))) {
9630                               RExC_parse++;
9631                               n--;
9632                          }
9633                     }
9634
9635                     /* Add the property name to the list.  If /i matching, give
9636                      * a different name which consists of the normal name
9637                      * sandwiched between two underscores and '_i'.  The design
9638                      * is discussed in the commit message for this. */
9639                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9640                                         (value=='p' ? '+' : '!'),
9641                                         (FOLD) ? "__" : "",
9642                                         (int)n,
9643                                         RExC_parse,
9644                                         (FOLD) ? "_i" : ""
9645                                     );
9646                 }
9647                 RExC_parse = e + 1;
9648
9649                 /* The \p could match something in the Latin1 range, hence
9650                  * something that isn't utf8 */
9651                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9652                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9653
9654                 /* \p means they want Unicode semantics */
9655                 RExC_uni_semantics = 1;
9656                 }
9657                 break;
9658             case 'n':   value = '\n';                   break;
9659             case 'r':   value = '\r';                   break;
9660             case 't':   value = '\t';                   break;
9661             case 'f':   value = '\f';                   break;
9662             case 'b':   value = '\b';                   break;
9663             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9664             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9665             case 'o':
9666                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9667                 {
9668                     const char* error_msg;
9669                     bool valid = grok_bslash_o(RExC_parse,
9670                                                &value,
9671                                                &numlen,
9672                                                &error_msg,
9673                                                SIZE_ONLY);
9674                     RExC_parse += numlen;
9675                     if (! valid) {
9676                         vFAIL(error_msg);
9677                     }
9678                 }
9679                 if (PL_encoding && value < 0x100) {
9680                     goto recode_encoding;
9681                 }
9682                 break;
9683             case 'x':
9684                 if (*RExC_parse == '{') {
9685                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9686                         | PERL_SCAN_DISALLOW_PREFIX;
9687                     char * const e = strchr(RExC_parse++, '}');
9688                     if (!e)
9689                         vFAIL("Missing right brace on \\x{}");
9690
9691                     numlen = e - RExC_parse;
9692                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9693                     RExC_parse = e + 1;
9694                 }
9695                 else {
9696                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9697                     numlen = 2;
9698                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9699                     RExC_parse += numlen;
9700                 }
9701                 if (PL_encoding && value < 0x100)
9702                     goto recode_encoding;
9703                 break;
9704             case 'c':
9705                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9706                 break;
9707             case '0': case '1': case '2': case '3': case '4':
9708             case '5': case '6': case '7':
9709                 {
9710                     /* Take 1-3 octal digits */
9711                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9712                     numlen = 3;
9713                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9714                     RExC_parse += numlen;
9715                     if (PL_encoding && value < 0x100)
9716                         goto recode_encoding;
9717                     break;
9718                 }
9719             recode_encoding:
9720                 {
9721                     SV* enc = PL_encoding;
9722                     value = reg_recode((const char)(U8)value, &enc);
9723                     if (!enc && SIZE_ONLY)
9724                         ckWARNreg(RExC_parse,
9725                                   "Invalid escape in the specified encoding");
9726                     break;
9727                 }
9728             default:
9729                 /* Allow \_ to not give an error */
9730                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9731                     ckWARN2reg(RExC_parse,
9732                                "Unrecognized escape \\%c in character class passed through",
9733                                (int)value);
9734                 }
9735                 break;
9736             }
9737         } /* end of \blah */
9738 #ifdef EBCDIC
9739         else
9740             literal_endpoint++;
9741 #endif
9742
9743         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9744
9745             /* What matches in a locale is not known until runtime, so need to
9746              * (one time per class) allocate extra space to pass to regexec.
9747              * The space will contain a bit for each named class that is to be
9748              * matched against.  This isn't needed for \p{} and pseudo-classes,
9749              * as they are not affected by locale, and hence are dealt with
9750              * separately */
9751             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9752                 need_class = 1;
9753                 if (SIZE_ONLY) {
9754 #ifdef ANYOF_CLASS_ADD_SKIP
9755                     RExC_size += ANYOF_CLASS_ADD_SKIP;
9756 #endif
9757                 }
9758                 else {
9759 #ifdef ANYOF_CLASS_ADD_SKIP
9760                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
9761 #endif
9762                     ANYOF_CLASS_ZERO(ret);
9763                 }
9764                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9765             }
9766
9767             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9768              * literal, as is the character that began the false range, i.e.
9769              * the 'a' in the examples */
9770             if (range) {
9771                 if (!SIZE_ONLY) {
9772                     const int w =
9773                         RExC_parse >= rangebegin ?
9774                         RExC_parse - rangebegin : 0;
9775                     ckWARN4reg(RExC_parse,
9776                                "False [] range \"%*.*s\"",
9777                                w, w, rangebegin);
9778
9779                     stored +=
9780                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9781                     if (prevvalue < 256) {
9782                         stored +=
9783                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9784                     }
9785                     else {
9786                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9787                     }
9788                 }
9789
9790                 range = 0; /* this was not a true range */
9791             }
9792
9793
9794
9795             if (!SIZE_ONLY) {
9796                 const char *what = NULL;
9797                 char yesno = 0;
9798
9799                 /* Possible truncation here but in some 64-bit environments
9800                  * the compiler gets heartburn about switch on 64-bit values.
9801                  * A similar issue a little earlier when switching on value.
9802                  * --jhi */
9803                 switch ((I32)namedclass) {
9804
9805                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9806                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9807                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9808                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9809                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9810                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9811                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9812                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9813                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9814                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9815                 /* \s, \w match all unicode if utf8. */
9816                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9817                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9818                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9819                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9820                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9821                 case ANYOF_ASCII:
9822                     if (LOC)
9823                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9824                     else {
9825                         for (value = 0; value < 128; value++)
9826                             stored +=
9827                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9828                     }
9829                     yesno = '+';
9830                     what = NULL;        /* Doesn't match outside ascii, so
9831                                            don't want to add +utf8:: */
9832                     break;
9833                 case ANYOF_NASCII:
9834                     if (LOC)
9835                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9836                     else {
9837                         for (value = 128; value < 256; value++)
9838                             stored +=
9839                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9840                     }
9841                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9842                     yesno = '!';
9843                     what = "ASCII";
9844                     break;
9845                 case ANYOF_DIGIT:
9846                     if (LOC)
9847                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9848                     else {
9849                         /* consecutive digits assumed */
9850                         for (value = '0'; value <= '9'; value++)
9851                             stored +=
9852                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9853                     }
9854                     yesno = '+';
9855                     what = "Digit";
9856                     break;
9857                 case ANYOF_NDIGIT:
9858                     if (LOC)
9859                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9860                     else {
9861                         /* consecutive digits assumed */
9862                         for (value = 0; value < '0'; value++)
9863                             stored +=
9864                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9865                         for (value = '9' + 1; value < 256; value++)
9866                             stored +=
9867                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9868                     }
9869                     yesno = '!';
9870                     what = "Digit";
9871                     if (AT_LEAST_ASCII_RESTRICTED ) {
9872                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9873                     }
9874                     break;
9875                 case ANYOF_MAX:
9876                     /* this is to handle \p and \P */
9877                     break;
9878                 default:
9879                     vFAIL("Invalid [::] class");
9880                     break;
9881                 }
9882                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9883                     /* Strings such as "+utf8::isWord\n" */
9884                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9885                 }
9886
9887                 continue;
9888             }
9889         } /* end of namedclass \blah */
9890
9891         if (range) {
9892             if (prevvalue > (IV)value) /* b-a */ {
9893                 const int w = RExC_parse - rangebegin;
9894                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9895                 range = 0; /* not a valid range */
9896             }
9897         }
9898         else {
9899             prevvalue = value; /* save the beginning of the range */
9900             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9901                 RExC_parse[1] != ']') {
9902                 RExC_parse++;
9903
9904                 /* a bad range like \w-, [:word:]- ? */
9905                 if (namedclass > OOB_NAMEDCLASS) {
9906                     if (ckWARN(WARN_REGEXP)) {
9907                         const int w =
9908                             RExC_parse >= rangebegin ?
9909                             RExC_parse - rangebegin : 0;
9910                         vWARN4(RExC_parse,
9911                                "False [] range \"%*.*s\"",
9912                                w, w, rangebegin);
9913                     }
9914                     if (!SIZE_ONLY)
9915                         stored +=
9916                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9917                 } else
9918                     range = 1;  /* yeah, it's a range! */
9919                 continue;       /* but do it the next time */
9920             }
9921         }
9922
9923         /* non-Latin1 code point implies unicode semantics.  Must be set in
9924          * pass1 so is there for the whole of pass 2 */
9925         if (value > 255) {
9926             RExC_uni_semantics = 1;
9927         }
9928
9929         /* now is the next time */
9930         if (!SIZE_ONLY) {
9931             if (prevvalue < 256) {
9932                 const IV ceilvalue = value < 256 ? value : 255;
9933                 IV i;
9934 #ifdef EBCDIC
9935                 /* In EBCDIC [\x89-\x91] should include
9936                  * the \x8e but [i-j] should not. */
9937                 if (literal_endpoint == 2 &&
9938                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9939                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9940                 {
9941                     if (isLOWER(prevvalue)) {
9942                         for (i = prevvalue; i <= ceilvalue; i++)
9943                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9944                                 stored +=
9945                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9946                             }
9947                     } else {
9948                         for (i = prevvalue; i <= ceilvalue; i++)
9949                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9950                                 stored +=
9951                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9952                             }
9953                     }
9954                 }
9955                 else
9956 #endif
9957                       for (i = prevvalue; i <= ceilvalue; i++) {
9958                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9959                       }
9960           }
9961           if (value > 255) {
9962             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
9963             const UV natvalue      = NATIVE_TO_UNI(value);
9964             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9965         }
9966 #ifdef EBCDIC
9967             literal_endpoint = 0;
9968 #endif
9969         }
9970
9971         range = 0; /* this range (if it was one) is done now */
9972     }
9973
9974
9975
9976     if (SIZE_ONLY)
9977         return ret;
9978     /****** !SIZE_ONLY AFTER HERE *********/
9979
9980     /* If folding and there are code points above 255, we calculate all
9981      * characters that could fold to or from the ones already on the list */
9982     if (FOLD && nonbitmap) {
9983         UV i;
9984
9985         HV* fold_intersection;
9986         UV* fold_list;
9987
9988         /* This is a list of all the characters that participate in folds
9989             * (except marks, etc in multi-char folds */
9990         if (! PL_utf8_foldable) {
9991             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9992             PL_utf8_foldable = _swash_to_invlist(swash);
9993         }
9994
9995         /* This is a hash that for a particular fold gives all characters
9996             * that are involved in it */
9997         if (! PL_utf8_foldclosures) {
9998
9999             /* If we were unable to find any folds, then we likely won't be
10000              * able to find the closures.  So just create an empty list.
10001              * Folding will effectively be restricted to the non-Unicode rules
10002              * hard-coded into Perl.  (This case happens legitimately during
10003              * compilation of Perl itself before the Unicode tables are
10004              * generated) */
10005             if (invlist_len(PL_utf8_foldable) == 0) {
10006                 PL_utf8_foldclosures = _new_invlist(0);
10007             } else {
10008                 /* If the folds haven't been read in, call a fold function
10009                     * to force that */
10010                 if (! PL_utf8_tofold) {
10011                     U8 dummy[UTF8_MAXBYTES+1];
10012                     STRLEN dummy_len;
10013                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
10014                 }
10015                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10016             }
10017         }
10018
10019         /* Only the characters in this class that participate in folds need
10020             * be checked.  Get the intersection of this class and all the
10021             * possible characters that are foldable.  This can quickly narrow
10022             * down a large class */
10023         fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10024
10025         /* Now look at the foldable characters in this class individually */
10026         fold_list = invlist_array(fold_intersection);
10027         for (i = 0; i < invlist_len(fold_intersection); i++) {
10028             UV j;
10029
10030             /* The next entry is the beginning of the range that is in the
10031              * class */
10032             UV start = fold_list[i++];
10033
10034
10035             /* The next entry is the beginning of the next range, which
10036                 * isn't in the class, so the end of the current range is one
10037                 * less than that */
10038             UV end = fold_list[i] - 1;
10039
10040             /* Look at every character in the range */
10041             for (j = start; j <= end; j++) {
10042
10043                 /* Get its fold */
10044                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10045                 STRLEN foldlen;
10046                 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10047
10048                 if (foldlen > (STRLEN)UNISKIP(f)) {
10049
10050                     /* Any multicharacter foldings (disallowed in
10051                         * lookbehind patterns) require the following
10052                         * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10053                         * E folds into "pq" and F folds into "rst", all other
10054                         * characters fold to single characters.  We save away
10055                         * these multicharacter foldings, to be later saved as
10056                         * part of the additional "s" data. */
10057                     if (! RExC_in_lookbehind) {
10058                         U8* loc = foldbuf;
10059                         U8* e = foldbuf + foldlen;
10060
10061                         /* If any of the folded characters of this are in
10062                             * the Latin1 range, tell the regex engine that
10063                             * this can match a non-utf8 target string.  The
10064                             * only multi-byte fold whose source is in the
10065                             * Latin1 range (U+00DF) applies only when the
10066                             * target string is utf8, or under unicode rules */
10067                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10068                             while (loc < e) {
10069
10070                                 /* Can't mix ascii with non- under /aa */
10071                                 if (MORE_ASCII_RESTRICTED
10072                                     && (isASCII(*loc) != isASCII(j)))
10073                                 {
10074                                     goto end_multi_fold;
10075                                 }
10076                                 if (UTF8_IS_INVARIANT(*loc)
10077                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10078                                 {
10079                                     /* Can't mix above and below 256 under
10080                                         * LOC */
10081                                     if (LOC) {
10082                                         goto end_multi_fold;
10083                                     }
10084                                     ANYOF_FLAGS(ret)
10085                                             |= ANYOF_NONBITMAP_NON_UTF8;
10086                                     break;
10087                                 }
10088                                 loc += UTF8SKIP(loc);
10089                             }
10090                         }
10091
10092                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10093                     end_multi_fold: ;
10094                     }
10095                 }
10096                 else {
10097                     /* Single character fold.  Add everything in its fold
10098                         * closure to the list that this node should match */
10099                     SV** listp;
10100
10101                     /* The fold closures data structure is a hash with the
10102                         * keys being every character that is folded to, like
10103                         * 'k', and the values each an array of everything that
10104                         * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10105                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10106                                     (char *) foldbuf, foldlen, FALSE)))
10107                     {
10108                         AV* list = (AV*) *listp;
10109                         IV k;
10110                         for (k = 0; k <= av_len(list); k++) {
10111                             SV** c_p = av_fetch(list, k, FALSE);
10112                             UV c;
10113                             if (c_p == NULL) {
10114                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10115                             }
10116                             c = SvUV(*c_p);
10117
10118                             /* /aa doesn't allow folds between ASCII and
10119                                 * non-; /l doesn't allow them between above
10120                                 * and below 256 */
10121                             if ((MORE_ASCII_RESTRICTED
10122                                  && (isASCII(c) != isASCII(j)))
10123                                     || (LOC && ((c < 256) != (j < 256))))
10124                             {
10125                                 continue;
10126                             }
10127
10128                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10129                                 stored += set_regclass_bit(pRExC_state,
10130                                         ret,
10131                                         (U8) c,
10132                                         &l1_fold_invlist, &unicode_alternate);
10133                             }
10134                                 /* It may be that the code point is already
10135                                     * in this range or already in the bitmap,
10136                                     * in which case we need do nothing */
10137                             else if ((c < start || c > end)
10138                                         && (c > 255
10139                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10140                             {
10141                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10142                             }
10143                         }
10144                     }
10145                 }
10146             }
10147         }
10148         invlist_destroy(fold_intersection);
10149     }
10150
10151     /* Combine the two lists into one. */
10152     if (l1_fold_invlist) {
10153         if (nonbitmap) {
10154             nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10155         }
10156         else {
10157             nonbitmap = l1_fold_invlist;
10158         }
10159     }
10160
10161     /* Here, we have calculated what code points should be in the character
10162      * class.   Now we can see about various optimizations.  Fold calculation
10163      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10164      * include K, which under /i would match k. */
10165
10166     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10167      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10168      * optimize locale.  Doing so perhaps could be done as long as there is
10169      * nothing like \w in it; some thought also would have to be given to the
10170      * interaction with above 0x100 chars */
10171     if (! LOC
10172         && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10173         && ! unicode_alternate
10174         && ! nonbitmap
10175         && SvCUR(listsv) == initial_listsv_len)
10176     {
10177         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10178             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10179         stored = 256 - stored;
10180
10181         /* The inversion means that everything above 255 is matched; and at the
10182          * same time we clear the invert flag */
10183         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10184     }
10185
10186     /* Folding in the bitmap is taken care of above, but not for locale (for
10187      * which we have to wait to see what folding is in effect at runtime), and
10188      * for things not in the bitmap.  Set run-time fold flag for these */
10189     if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10190         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10191     }
10192
10193     /* A single character class can be "optimized" into an EXACTish node.
10194      * Note that since we don't currently count how many characters there are
10195      * outside the bitmap, we are XXX missing optimization possibilities for
10196      * them.  This optimization can't happen unless this is a truly single
10197      * character class, which means that it can't be an inversion into a
10198      * many-character class, and there must be no possibility of there being
10199      * things outside the bitmap.  'stored' (only) for locales doesn't include
10200      * \w, etc, so have to make a special test that they aren't present
10201      *
10202      * Similarly A 2-character class of the very special form like [bB] can be
10203      * optimized into an EXACTFish node, but only for non-locales, and for
10204      * characters which only have the two folds; so things like 'fF' and 'Ii'
10205      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10206      * FI'. */
10207     if (! nonbitmap
10208         && ! unicode_alternate
10209         && SvCUR(listsv) == initial_listsv_len
10210         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10211         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10212                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10213             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10214                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10215                                  /* If the latest code point has a fold whose
10216                                   * bit is set, it must be the only other one */
10217                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10218                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10219     {
10220         /* Note that the information needed to decide to do this optimization
10221          * is not currently available until the 2nd pass, and that the actually
10222          * used EXACTish node takes less space than the calculated ANYOF node,
10223          * and hence the amount of space calculated in the first pass is larger
10224          * than actually used, so this optimization doesn't gain us any space.
10225          * But an EXACT node is faster than an ANYOF node, and can be combined
10226          * with any adjacent EXACT nodes later by the optimizer for further
10227          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10228          * node, so the optimization advantage comes from the ability to join
10229          * it to adjacent EXACT nodes */
10230
10231         const char * cur_parse= RExC_parse;
10232         U8 op;
10233         RExC_emit = (regnode *)orig_emit;
10234         RExC_parse = (char *)orig_parse;
10235
10236         if (stored == 1) {
10237
10238             /* A locale node with one point can be folded; all the other cases
10239              * with folding will have two points, since we calculate them above
10240              */
10241             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10242                  op = EXACTFL;
10243             }
10244             else {
10245                 op = EXACT;
10246             }
10247         }   /* else 2 chars in the bit map: the folds of each other */
10248         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10249
10250             /* To join adjacent nodes, they must be the exact EXACTish type.
10251              * Try to use the most likely type, by using EXACTFU if the regex
10252              * calls for them, or is required because the character is
10253              * non-ASCII */
10254             op = EXACTFU;
10255         }
10256         else {    /* Otherwise, more likely to be EXACTF type */
10257             op = EXACTF;
10258         }
10259
10260         ret = reg_node(pRExC_state, op);
10261         RExC_parse = (char *)cur_parse;
10262         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10263             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10264             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10265             STR_LEN(ret)= 2;
10266             RExC_emit += STR_SZ(2);
10267         }
10268         else {
10269             *STRING(ret)= (char)value;
10270             STR_LEN(ret)= 1;
10271             RExC_emit += STR_SZ(1);
10272         }
10273         SvREFCNT_dec(listsv);
10274         return ret;
10275     }
10276
10277     if (nonbitmap) {
10278         UV* nonbitmap_array = invlist_array(nonbitmap);
10279         UV nonbitmap_len = invlist_len(nonbitmap);
10280         UV i;
10281
10282         /*  Here have the full list of items to match that aren't in the
10283          *  bitmap.  Convert to the structure that the rest of the code is
10284          *  expecting.   XXX That rest of the code should convert to this
10285          *  structure */
10286         for (i = 0; i < nonbitmap_len; i++) {
10287
10288             /* The next entry is the beginning of the range that is in the
10289              * class */
10290             UV start = nonbitmap_array[i++];
10291             UV end;
10292
10293             /* The next entry is the beginning of the next range, which isn't
10294              * in the class, so the end of the current range is one less than
10295              * that.  But if there is no next range, it means that the range
10296              * begun by 'start' extends to infinity, which for this platform
10297              * ends at UV_MAX */
10298             if (i == nonbitmap_len) {
10299                 end = UV_MAX;
10300             }
10301             else {
10302                 end = nonbitmap_array[i] - 1;
10303             }
10304
10305             if (start == end) {
10306                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10307             }
10308             else {
10309                 /* The \t sets the whole range */
10310                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10311                         /* XXX EBCDIC */
10312                                    start, end);
10313             }
10314         }
10315         invlist_destroy(nonbitmap);
10316     }
10317
10318     if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10319         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10320         SvREFCNT_dec(listsv);
10321         SvREFCNT_dec(unicode_alternate);
10322     }
10323     else {
10324
10325         AV * const av = newAV();
10326         SV *rv;
10327         /* The 0th element stores the character class description
10328          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10329          * to initialize the appropriate swash (which gets stored in
10330          * the 1st element), and also useful for dumping the regnode.
10331          * The 2nd element stores the multicharacter foldings,
10332          * used later (regexec.c:S_reginclass()). */
10333         av_store(av, 0, listsv);
10334         av_store(av, 1, NULL);
10335         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10336         if (unicode_alternate) { /* This node is variable length */
10337             OP(ret) = ANYOFV;
10338         }
10339         rv = newRV_noinc(MUTABLE_SV(av));
10340         n = add_data(pRExC_state, 1, "s");
10341         RExC_rxi->data->data[n] = (void*)rv;
10342         ARG_SET(ret, n);
10343     }
10344     return ret;
10345 }
10346 #undef _C_C_T_
10347
10348
10349 /* reg_skipcomment()
10350
10351    Absorbs an /x style # comments from the input stream.
10352    Returns true if there is more text remaining in the stream.
10353    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10354    terminates the pattern without including a newline.
10355
10356    Note its the callers responsibility to ensure that we are
10357    actually in /x mode
10358
10359 */
10360
10361 STATIC bool
10362 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10363 {
10364     bool ended = 0;
10365
10366     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10367
10368     while (RExC_parse < RExC_end)
10369         if (*RExC_parse++ == '\n') {
10370             ended = 1;
10371             break;
10372         }
10373     if (!ended) {
10374         /* we ran off the end of the pattern without ending
10375            the comment, so we have to add an \n when wrapping */
10376         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10377         return 0;
10378     } else
10379         return 1;
10380 }
10381
10382 /* nextchar()
10383
10384    Advances the parse position, and optionally absorbs
10385    "whitespace" from the inputstream.
10386
10387    Without /x "whitespace" means (?#...) style comments only,
10388    with /x this means (?#...) and # comments and whitespace proper.
10389
10390    Returns the RExC_parse point from BEFORE the scan occurs.
10391
10392    This is the /x friendly way of saying RExC_parse++.
10393 */
10394
10395 STATIC char*
10396 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10397 {
10398     char* const retval = RExC_parse++;
10399
10400     PERL_ARGS_ASSERT_NEXTCHAR;
10401
10402     for (;;) {
10403         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10404                 RExC_parse[2] == '#') {
10405             while (*RExC_parse != ')') {
10406                 if (RExC_parse == RExC_end)
10407                     FAIL("Sequence (?#... not terminated");
10408                 RExC_parse++;
10409             }
10410             RExC_parse++;
10411             continue;
10412         }
10413         if (RExC_flags & RXf_PMf_EXTENDED) {
10414             if (isSPACE(*RExC_parse)) {
10415                 RExC_parse++;
10416                 continue;
10417             }
10418             else if (*RExC_parse == '#') {
10419                 if ( reg_skipcomment( pRExC_state ) )
10420                     continue;
10421             }
10422         }
10423         return retval;
10424     }
10425 }
10426
10427 /*
10428 - reg_node - emit a node
10429 */
10430 STATIC regnode *                        /* Location. */
10431 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10432 {
10433     dVAR;
10434     register regnode *ptr;
10435     regnode * const ret = RExC_emit;
10436     GET_RE_DEBUG_FLAGS_DECL;
10437
10438     PERL_ARGS_ASSERT_REG_NODE;
10439
10440     if (SIZE_ONLY) {
10441         SIZE_ALIGN(RExC_size);
10442         RExC_size += 1;
10443         return(ret);
10444     }
10445     if (RExC_emit >= RExC_emit_bound)
10446         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10447
10448     NODE_ALIGN_FILL(ret);
10449     ptr = ret;
10450     FILL_ADVANCE_NODE(ptr, op);
10451 #ifdef RE_TRACK_PATTERN_OFFSETS
10452     if (RExC_offsets) {         /* MJD */
10453         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10454               "reg_node", __LINE__,
10455               PL_reg_name[op],
10456               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10457                 ? "Overwriting end of array!\n" : "OK",
10458               (UV)(RExC_emit - RExC_emit_start),
10459               (UV)(RExC_parse - RExC_start),
10460               (UV)RExC_offsets[0]));
10461         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10462     }
10463 #endif
10464     RExC_emit = ptr;
10465     return(ret);
10466 }
10467
10468 /*
10469 - reganode - emit a node with an argument
10470 */
10471 STATIC regnode *                        /* Location. */
10472 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10473 {
10474     dVAR;
10475     register regnode *ptr;
10476     regnode * const ret = RExC_emit;
10477     GET_RE_DEBUG_FLAGS_DECL;
10478
10479     PERL_ARGS_ASSERT_REGANODE;
10480
10481     if (SIZE_ONLY) {
10482         SIZE_ALIGN(RExC_size);
10483         RExC_size += 2;
10484         /*
10485            We can't do this:
10486
10487            assert(2==regarglen[op]+1);
10488
10489            Anything larger than this has to allocate the extra amount.
10490            If we changed this to be:
10491
10492            RExC_size += (1 + regarglen[op]);
10493
10494            then it wouldn't matter. Its not clear what side effect
10495            might come from that so its not done so far.
10496            -- dmq
10497         */
10498         return(ret);
10499     }
10500     if (RExC_emit >= RExC_emit_bound)
10501         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10502
10503     NODE_ALIGN_FILL(ret);
10504     ptr = ret;
10505     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10506 #ifdef RE_TRACK_PATTERN_OFFSETS
10507     if (RExC_offsets) {         /* MJD */
10508         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10509               "reganode",
10510               __LINE__,
10511               PL_reg_name[op],
10512               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10513               "Overwriting end of array!\n" : "OK",
10514               (UV)(RExC_emit - RExC_emit_start),
10515               (UV)(RExC_parse - RExC_start),
10516               (UV)RExC_offsets[0]));
10517         Set_Cur_Node_Offset;
10518     }
10519 #endif
10520     RExC_emit = ptr;
10521     return(ret);
10522 }
10523
10524 /*
10525 - reguni - emit (if appropriate) a Unicode character
10526 */
10527 STATIC STRLEN
10528 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10529 {
10530     dVAR;
10531
10532     PERL_ARGS_ASSERT_REGUNI;
10533
10534     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10535 }
10536
10537 /*
10538 - reginsert - insert an operator in front of already-emitted operand
10539 *
10540 * Means relocating the operand.
10541 */
10542 STATIC void
10543 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10544 {
10545     dVAR;
10546     register regnode *src;
10547     register regnode *dst;
10548     register regnode *place;
10549     const int offset = regarglen[(U8)op];
10550     const int size = NODE_STEP_REGNODE + offset;
10551     GET_RE_DEBUG_FLAGS_DECL;
10552
10553     PERL_ARGS_ASSERT_REGINSERT;
10554     PERL_UNUSED_ARG(depth);
10555 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10556     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10557     if (SIZE_ONLY) {
10558         RExC_size += size;
10559         return;
10560     }
10561
10562     src = RExC_emit;
10563     RExC_emit += size;
10564     dst = RExC_emit;
10565     if (RExC_open_parens) {
10566         int paren;
10567         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10568         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10569             if ( RExC_open_parens[paren] >= opnd ) {
10570                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10571                 RExC_open_parens[paren] += size;
10572             } else {
10573                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10574             }
10575             if ( RExC_close_parens[paren] >= opnd ) {
10576                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10577                 RExC_close_parens[paren] += size;
10578             } else {
10579                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10580             }
10581         }
10582     }
10583
10584     while (src > opnd) {
10585         StructCopy(--src, --dst, regnode);
10586 #ifdef RE_TRACK_PATTERN_OFFSETS
10587         if (RExC_offsets) {     /* MJD 20010112 */
10588             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10589                   "reg_insert",
10590                   __LINE__,
10591                   PL_reg_name[op],
10592                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10593                     ? "Overwriting end of array!\n" : "OK",
10594                   (UV)(src - RExC_emit_start),
10595                   (UV)(dst - RExC_emit_start),
10596                   (UV)RExC_offsets[0]));
10597             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10598             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10599         }
10600 #endif
10601     }
10602
10603
10604     place = opnd;               /* Op node, where operand used to be. */
10605 #ifdef RE_TRACK_PATTERN_OFFSETS
10606     if (RExC_offsets) {         /* MJD */
10607         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10608               "reginsert",
10609               __LINE__,
10610               PL_reg_name[op],
10611               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10612               ? "Overwriting end of array!\n" : "OK",
10613               (UV)(place - RExC_emit_start),
10614               (UV)(RExC_parse - RExC_start),
10615               (UV)RExC_offsets[0]));
10616         Set_Node_Offset(place, RExC_parse);
10617         Set_Node_Length(place, 1);
10618     }
10619 #endif
10620     src = NEXTOPER(place);
10621     FILL_ADVANCE_NODE(place, op);
10622     Zero(src, offset, regnode);
10623 }
10624
10625 /*
10626 - regtail - set the next-pointer at the end of a node chain of p to val.
10627 - SEE ALSO: regtail_study
10628 */
10629 /* TODO: All three parms should be const */
10630 STATIC void
10631 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10632 {
10633     dVAR;
10634     register regnode *scan;
10635     GET_RE_DEBUG_FLAGS_DECL;
10636
10637     PERL_ARGS_ASSERT_REGTAIL;
10638 #ifndef DEBUGGING
10639     PERL_UNUSED_ARG(depth);
10640 #endif
10641
10642     if (SIZE_ONLY)
10643         return;
10644
10645     /* Find last node. */
10646     scan = p;
10647     for (;;) {
10648         regnode * const temp = regnext(scan);
10649         DEBUG_PARSE_r({
10650             SV * const mysv=sv_newmortal();
10651             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10652             regprop(RExC_rx, mysv, scan);
10653             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10654                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10655                     (temp == NULL ? "->" : ""),
10656                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10657             );
10658         });
10659         if (temp == NULL)
10660             break;
10661         scan = temp;
10662     }
10663
10664     if (reg_off_by_arg[OP(scan)]) {
10665         ARG_SET(scan, val - scan);
10666     }
10667     else {
10668         NEXT_OFF(scan) = val - scan;
10669     }
10670 }
10671
10672 #ifdef DEBUGGING
10673 /*
10674 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10675 - Look for optimizable sequences at the same time.
10676 - currently only looks for EXACT chains.
10677
10678 This is experimental code. The idea is to use this routine to perform
10679 in place optimizations on branches and groups as they are constructed,
10680 with the long term intention of removing optimization from study_chunk so
10681 that it is purely analytical.
10682
10683 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10684 to control which is which.
10685
10686 */
10687 /* TODO: All four parms should be const */
10688
10689 STATIC U8
10690 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10691 {
10692     dVAR;
10693     register regnode *scan;
10694     U8 exact = PSEUDO;
10695 #ifdef EXPERIMENTAL_INPLACESCAN
10696     I32 min = 0;
10697 #endif
10698     GET_RE_DEBUG_FLAGS_DECL;
10699
10700     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10701
10702
10703     if (SIZE_ONLY)
10704         return exact;
10705
10706     /* Find last node. */
10707
10708     scan = p;
10709     for (;;) {
10710         regnode * const temp = regnext(scan);
10711 #ifdef EXPERIMENTAL_INPLACESCAN
10712         if (PL_regkind[OP(scan)] == EXACT)
10713             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10714                 return EXACT;
10715 #endif
10716         if ( exact ) {
10717             switch (OP(scan)) {
10718                 case EXACT:
10719                 case EXACTF:
10720                 case EXACTFA:
10721                 case EXACTFU:
10722                 case EXACTFL:
10723                         if( exact == PSEUDO )
10724                             exact= OP(scan);
10725                         else if ( exact != OP(scan) )
10726                             exact= 0;
10727                 case NOTHING:
10728                     break;
10729                 default:
10730                     exact= 0;
10731             }
10732         }
10733         DEBUG_PARSE_r({
10734             SV * const mysv=sv_newmortal();
10735             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10736             regprop(RExC_rx, mysv, scan);
10737             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10738                 SvPV_nolen_const(mysv),
10739                 REG_NODE_NUM(scan),
10740                 PL_reg_name[exact]);
10741         });
10742         if (temp == NULL)
10743             break;
10744         scan = temp;
10745     }
10746     DEBUG_PARSE_r({
10747         SV * const mysv_val=sv_newmortal();
10748         DEBUG_PARSE_MSG("");
10749         regprop(RExC_rx, mysv_val, val);
10750         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10751                       SvPV_nolen_const(mysv_val),
10752                       (IV)REG_NODE_NUM(val),
10753                       (IV)(val - scan)
10754         );
10755     });
10756     if (reg_off_by_arg[OP(scan)]) {
10757         ARG_SET(scan, val - scan);
10758     }
10759     else {
10760         NEXT_OFF(scan) = val - scan;
10761     }
10762
10763     return exact;
10764 }
10765 #endif
10766
10767 /*
10768  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10769  */
10770 #ifdef DEBUGGING
10771 static void
10772 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10773 {
10774     int bit;
10775     int set=0;
10776     regex_charset cs;
10777
10778     for (bit=0; bit<32; bit++) {
10779         if (flags & (1<<bit)) {
10780             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10781                 continue;
10782             }
10783             if (!set++ && lead)
10784                 PerlIO_printf(Perl_debug_log, "%s",lead);
10785             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10786         }
10787     }
10788     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10789             if (!set++ && lead) {
10790                 PerlIO_printf(Perl_debug_log, "%s",lead);
10791             }
10792             switch (cs) {
10793                 case REGEX_UNICODE_CHARSET:
10794                     PerlIO_printf(Perl_debug_log, "UNICODE");
10795                     break;
10796                 case REGEX_LOCALE_CHARSET:
10797                     PerlIO_printf(Perl_debug_log, "LOCALE");
10798                     break;
10799                 case REGEX_ASCII_RESTRICTED_CHARSET:
10800                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10801                     break;
10802                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10803                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10804                     break;
10805                 default:
10806                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10807                     break;
10808             }
10809     }
10810     if (lead)  {
10811         if (set)
10812             PerlIO_printf(Perl_debug_log, "\n");
10813         else
10814             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10815     }
10816 }
10817 #endif
10818
10819 void
10820 Perl_regdump(pTHX_ const regexp *r)
10821 {
10822 #ifdef DEBUGGING
10823     dVAR;
10824     SV * const sv = sv_newmortal();
10825     SV *dsv= sv_newmortal();
10826     RXi_GET_DECL(r,ri);
10827     GET_RE_DEBUG_FLAGS_DECL;
10828
10829     PERL_ARGS_ASSERT_REGDUMP;
10830
10831     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10832
10833     /* Header fields of interest. */
10834     if (r->anchored_substr) {
10835         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10836             RE_SV_DUMPLEN(r->anchored_substr), 30);
10837         PerlIO_printf(Perl_debug_log,
10838                       "anchored %s%s at %"IVdf" ",
10839                       s, RE_SV_TAIL(r->anchored_substr),
10840                       (IV)r->anchored_offset);
10841     } else if (r->anchored_utf8) {
10842         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10843             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10844         PerlIO_printf(Perl_debug_log,
10845                       "anchored utf8 %s%s at %"IVdf" ",
10846                       s, RE_SV_TAIL(r->anchored_utf8),
10847                       (IV)r->anchored_offset);
10848     }
10849     if (r->float_substr) {
10850         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10851             RE_SV_DUMPLEN(r->float_substr), 30);
10852         PerlIO_printf(Perl_debug_log,
10853                       "floating %s%s at %"IVdf"..%"UVuf" ",
10854                       s, RE_SV_TAIL(r->float_substr),
10855                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10856     } else if (r->float_utf8) {
10857         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10858             RE_SV_DUMPLEN(r->float_utf8), 30);
10859         PerlIO_printf(Perl_debug_log,
10860                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10861                       s, RE_SV_TAIL(r->float_utf8),
10862                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10863     }
10864     if (r->check_substr || r->check_utf8)
10865         PerlIO_printf(Perl_debug_log,
10866                       (const char *)
10867                       (r->check_substr == r->float_substr
10868                        && r->check_utf8 == r->float_utf8
10869                        ? "(checking floating" : "(checking anchored"));
10870     if (r->extflags & RXf_NOSCAN)
10871         PerlIO_printf(Perl_debug_log, " noscan");
10872     if (r->extflags & RXf_CHECK_ALL)
10873         PerlIO_printf(Perl_debug_log, " isall");
10874     if (r->check_substr || r->check_utf8)
10875         PerlIO_printf(Perl_debug_log, ") ");
10876
10877     if (ri->regstclass) {
10878         regprop(r, sv, ri->regstclass);
10879         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10880     }
10881     if (r->extflags & RXf_ANCH) {
10882         PerlIO_printf(Perl_debug_log, "anchored");
10883         if (r->extflags & RXf_ANCH_BOL)
10884             PerlIO_printf(Perl_debug_log, "(BOL)");
10885         if (r->extflags & RXf_ANCH_MBOL)
10886             PerlIO_printf(Perl_debug_log, "(MBOL)");
10887         if (r->extflags & RXf_ANCH_SBOL)
10888             PerlIO_printf(Perl_debug_log, "(SBOL)");
10889         if (r->extflags & RXf_ANCH_GPOS)
10890             PerlIO_printf(Perl_debug_log, "(GPOS)");
10891         PerlIO_putc(Perl_debug_log, ' ');
10892     }
10893     if (r->extflags & RXf_GPOS_SEEN)
10894         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10895     if (r->intflags & PREGf_SKIP)
10896         PerlIO_printf(Perl_debug_log, "plus ");
10897     if (r->intflags & PREGf_IMPLICIT)
10898         PerlIO_printf(Perl_debug_log, "implicit ");
10899     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10900     if (r->extflags & RXf_EVAL_SEEN)
10901         PerlIO_printf(Perl_debug_log, "with eval ");
10902     PerlIO_printf(Perl_debug_log, "\n");
10903     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10904 #else
10905     PERL_ARGS_ASSERT_REGDUMP;
10906     PERL_UNUSED_CONTEXT;
10907     PERL_UNUSED_ARG(r);
10908 #endif  /* DEBUGGING */
10909 }
10910
10911 /*
10912 - regprop - printable representation of opcode
10913 */
10914 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10915 STMT_START { \
10916         if (do_sep) {                           \
10917             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10918             if (flags & ANYOF_INVERT)           \
10919                 /*make sure the invert info is in each */ \
10920                 sv_catpvs(sv, "^");             \
10921             do_sep = 0;                         \
10922         }                                       \
10923 } STMT_END
10924
10925 void
10926 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10927 {
10928 #ifdef DEBUGGING
10929     dVAR;
10930     register int k;
10931     RXi_GET_DECL(prog,progi);
10932     GET_RE_DEBUG_FLAGS_DECL;
10933
10934     PERL_ARGS_ASSERT_REGPROP;
10935
10936     sv_setpvs(sv, "");
10937
10938     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10939         /* It would be nice to FAIL() here, but this may be called from
10940            regexec.c, and it would be hard to supply pRExC_state. */
10941         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10942     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10943
10944     k = PL_regkind[OP(o)];
10945
10946     if (k == EXACT) {
10947         sv_catpvs(sv, " ");
10948         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10949          * is a crude hack but it may be the best for now since
10950          * we have no flag "this EXACTish node was UTF-8"
10951          * --jhi */
10952         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10953                   PERL_PV_ESCAPE_UNI_DETECT |
10954                   PERL_PV_ESCAPE_NONASCII   |
10955                   PERL_PV_PRETTY_ELLIPSES   |
10956                   PERL_PV_PRETTY_LTGT       |
10957                   PERL_PV_PRETTY_NOCLEAR
10958                   );
10959     } else if (k == TRIE) {
10960         /* print the details of the trie in dumpuntil instead, as
10961          * progi->data isn't available here */
10962         const char op = OP(o);
10963         const U32 n = ARG(o);
10964         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
10965                (reg_ac_data *)progi->data->data[n] :
10966                NULL;
10967         const reg_trie_data * const trie
10968             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
10969
10970         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
10971         DEBUG_TRIE_COMPILE_r(
10972             Perl_sv_catpvf(aTHX_ sv,
10973                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10974                 (UV)trie->startstate,
10975                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
10976                 (UV)trie->wordcount,
10977                 (UV)trie->minlen,
10978                 (UV)trie->maxlen,
10979                 (UV)TRIE_CHARCOUNT(trie),
10980                 (UV)trie->uniquecharcount
10981             )
10982         );
10983         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10984             int i;
10985             int rangestart = -1;
10986             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
10987             sv_catpvs(sv, "[");
10988             for (i = 0; i <= 256; i++) {
10989                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10990                     if (rangestart == -1)
10991                         rangestart = i;
10992                 } else if (rangestart != -1) {
10993                     if (i <= rangestart + 3)
10994                         for (; rangestart < i; rangestart++)
10995                             put_byte(sv, rangestart);
10996                     else {
10997                         put_byte(sv, rangestart);
10998                         sv_catpvs(sv, "-");
10999                         put_byte(sv, i - 1);
11000                     }
11001                     rangestart = -1;
11002                 }
11003             }
11004             sv_catpvs(sv, "]");
11005         }
11006
11007     } else if (k == CURLY) {
11008         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11009             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11010         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11011     }
11012     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
11013         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11014     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11015         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
11016         if ( RXp_PAREN_NAMES(prog) ) {
11017             if ( k != REF || (OP(o) < NREF)) {
11018                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11019                 SV **name= av_fetch(list, ARG(o), 0 );
11020                 if (name)
11021                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11022             }
11023             else {
11024                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11025                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11026                 I32 *nums=(I32*)SvPVX(sv_dat);
11027                 SV **name= av_fetch(list, nums[0], 0 );
11028                 I32 n;
11029                 if (name) {
11030                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
11031                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11032                                     (n ? "," : ""), (IV)nums[n]);
11033                     }
11034                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11035                 }
11036             }
11037         }
11038     } else if (k == GOSUB)
11039         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11040     else if (k == VERB) {
11041         if (!o->flags)
11042             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11043                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11044     } else if (k == LOGICAL)
11045         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
11046     else if (k == FOLDCHAR)
11047         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11048     else if (k == ANYOF) {
11049         int i, rangestart = -1;
11050         const U8 flags = ANYOF_FLAGS(o);
11051         int do_sep = 0;
11052
11053         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11054         static const char * const anyofs[] = {
11055             "\\w",
11056             "\\W",
11057             "\\s",
11058             "\\S",
11059             "\\d",
11060             "\\D",
11061             "[:alnum:]",
11062             "[:^alnum:]",
11063             "[:alpha:]",
11064             "[:^alpha:]",
11065             "[:ascii:]",
11066             "[:^ascii:]",
11067             "[:cntrl:]",
11068             "[:^cntrl:]",
11069             "[:graph:]",
11070             "[:^graph:]",
11071             "[:lower:]",
11072             "[:^lower:]",
11073             "[:print:]",
11074             "[:^print:]",
11075             "[:punct:]",
11076             "[:^punct:]",
11077             "[:upper:]",
11078             "[:^upper:]",
11079             "[:xdigit:]",
11080             "[:^xdigit:]",
11081             "[:space:]",
11082             "[:^space:]",
11083             "[:blank:]",
11084             "[:^blank:]"
11085         };
11086
11087         if (flags & ANYOF_LOCALE)
11088             sv_catpvs(sv, "{loc}");
11089         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11090             sv_catpvs(sv, "{i}");
11091         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11092         if (flags & ANYOF_INVERT)
11093             sv_catpvs(sv, "^");
11094
11095         /* output what the standard cp 0-255 bitmap matches */
11096         for (i = 0; i <= 256; i++) {
11097             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11098                 if (rangestart == -1)
11099                     rangestart = i;
11100             } else if (rangestart != -1) {
11101                 if (i <= rangestart + 3)
11102                     for (; rangestart < i; rangestart++)
11103                         put_byte(sv, rangestart);
11104                 else {
11105                     put_byte(sv, rangestart);
11106                     sv_catpvs(sv, "-");
11107                     put_byte(sv, i - 1);
11108                 }
11109                 do_sep = 1;
11110                 rangestart = -1;
11111             }
11112         }
11113
11114         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11115         /* output any special charclass tests (used entirely under use locale) */
11116         if (ANYOF_CLASS_TEST_ANY_SET(o))
11117             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11118                 if (ANYOF_CLASS_TEST(o,i)) {
11119                     sv_catpv(sv, anyofs[i]);
11120                     do_sep = 1;
11121                 }
11122
11123         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11124
11125         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11126             sv_catpvs(sv, "{non-utf8-latin1-all}");
11127         }
11128
11129         /* output information about the unicode matching */
11130         if (flags & ANYOF_UNICODE_ALL)
11131             sv_catpvs(sv, "{unicode_all}");
11132         else if (ANYOF_NONBITMAP(o))
11133             sv_catpvs(sv, "{unicode}");
11134         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11135             sv_catpvs(sv, "{outside bitmap}");
11136
11137         {
11138             SV *lv;
11139             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11140
11141             if (lv) {
11142                 if (sw) {
11143                     U8 s[UTF8_MAXBYTES_CASE+1];
11144
11145                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11146                         uvchr_to_utf8(s, i);
11147
11148                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11149                             if (rangestart == -1)
11150                                 rangestart = i;
11151                         } else if (rangestart != -1) {
11152                             if (i <= rangestart + 3)
11153                                 for (; rangestart < i; rangestart++) {
11154                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11155                                     U8 *p;
11156                                     for(p = s; p < e; p++)
11157                                         put_byte(sv, *p);
11158                                 }
11159                             else {
11160                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11161                                 U8 *p;
11162                                 for (p = s; p < e; p++)
11163                                     put_byte(sv, *p);
11164                                 sv_catpvs(sv, "-");
11165                                 e = uvchr_to_utf8(s, i-1);
11166                                 for (p = s; p < e; p++)
11167                                     put_byte(sv, *p);
11168                                 }
11169                                 rangestart = -1;
11170                             }
11171                         }
11172
11173                     sv_catpvs(sv, "..."); /* et cetera */
11174                 }
11175
11176                 {
11177                     char *s = savesvpv(lv);
11178                     char * const origs = s;
11179
11180                     while (*s && *s != '\n')
11181                         s++;
11182
11183                     if (*s == '\n') {
11184                         const char * const t = ++s;
11185
11186                         while (*s) {
11187                             if (*s == '\n')
11188                                 *s = ' ';
11189                             s++;
11190                         }
11191                         if (s[-1] == ' ')
11192                             s[-1] = 0;
11193
11194                         sv_catpv(sv, t);
11195                     }
11196
11197                     Safefree(origs);
11198                 }
11199             }
11200         }
11201
11202         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11203     }
11204     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11205         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11206 #else
11207     PERL_UNUSED_CONTEXT;
11208     PERL_UNUSED_ARG(sv);
11209     PERL_UNUSED_ARG(o);
11210     PERL_UNUSED_ARG(prog);
11211 #endif  /* DEBUGGING */
11212 }
11213
11214 SV *
11215 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11216 {                               /* Assume that RE_INTUIT is set */
11217     dVAR;
11218     struct regexp *const prog = (struct regexp *)SvANY(r);
11219     GET_RE_DEBUG_FLAGS_DECL;
11220
11221     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11222     PERL_UNUSED_CONTEXT;
11223
11224     DEBUG_COMPILE_r(
11225         {
11226             const char * const s = SvPV_nolen_const(prog->check_substr
11227                       ? prog->check_substr : prog->check_utf8);
11228
11229             if (!PL_colorset) reginitcolors();
11230             PerlIO_printf(Perl_debug_log,
11231                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11232                       PL_colors[4],
11233                       prog->check_substr ? "" : "utf8 ",
11234                       PL_colors[5],PL_colors[0],
11235                       s,
11236                       PL_colors[1],
11237                       (strlen(s) > 60 ? "..." : ""));
11238         } );
11239
11240     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11241 }
11242
11243 /*
11244    pregfree()
11245
11246    handles refcounting and freeing the perl core regexp structure. When
11247    it is necessary to actually free the structure the first thing it
11248    does is call the 'free' method of the regexp_engine associated to
11249    the regexp, allowing the handling of the void *pprivate; member
11250    first. (This routine is not overridable by extensions, which is why
11251    the extensions free is called first.)
11252
11253    See regdupe and regdupe_internal if you change anything here.
11254 */
11255 #ifndef PERL_IN_XSUB_RE
11256 void
11257 Perl_pregfree(pTHX_ REGEXP *r)
11258 {
11259     SvREFCNT_dec(r);
11260 }
11261
11262 void
11263 Perl_pregfree2(pTHX_ REGEXP *rx)
11264 {
11265     dVAR;
11266     struct regexp *const r = (struct regexp *)SvANY(rx);
11267     GET_RE_DEBUG_FLAGS_DECL;
11268
11269     PERL_ARGS_ASSERT_PREGFREE2;
11270
11271     if (r->mother_re) {
11272         ReREFCNT_dec(r->mother_re);
11273     } else {
11274         CALLREGFREE_PVT(rx); /* free the private data */
11275         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11276     }
11277     if (r->substrs) {
11278         SvREFCNT_dec(r->anchored_substr);
11279         SvREFCNT_dec(r->anchored_utf8);
11280         SvREFCNT_dec(r->float_substr);
11281         SvREFCNT_dec(r->float_utf8);
11282         Safefree(r->substrs);
11283     }
11284     RX_MATCH_COPY_FREE(rx);
11285 #ifdef PERL_OLD_COPY_ON_WRITE
11286     SvREFCNT_dec(r->saved_copy);
11287 #endif
11288     Safefree(r->offs);
11289 }
11290
11291 /*  reg_temp_copy()
11292
11293     This is a hacky workaround to the structural issue of match results
11294     being stored in the regexp structure which is in turn stored in
11295     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11296     could be PL_curpm in multiple contexts, and could require multiple
11297     result sets being associated with the pattern simultaneously, such
11298     as when doing a recursive match with (??{$qr})
11299
11300     The solution is to make a lightweight copy of the regexp structure
11301     when a qr// is returned from the code executed by (??{$qr}) this
11302     lightweight copy doesn't actually own any of its data except for
11303     the starp/end and the actual regexp structure itself.
11304
11305 */
11306
11307
11308 REGEXP *
11309 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11310 {
11311     struct regexp *ret;
11312     struct regexp *const r = (struct regexp *)SvANY(rx);
11313     register const I32 npar = r->nparens+1;
11314
11315     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11316
11317     if (!ret_x)
11318         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11319     ret = (struct regexp *)SvANY(ret_x);
11320
11321     (void)ReREFCNT_inc(rx);
11322     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11323        by pointing directly at the buffer, but flagging that the allocated
11324        space in the copy is zero. As we've just done a struct copy, it's now
11325        a case of zero-ing that, rather than copying the current length.  */
11326     SvPV_set(ret_x, RX_WRAPPED(rx));
11327     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11328     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11329            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11330     SvLEN_set(ret_x, 0);
11331     SvSTASH_set(ret_x, NULL);
11332     SvMAGIC_set(ret_x, NULL);
11333     Newx(ret->offs, npar, regexp_paren_pair);
11334     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11335     if (r->substrs) {
11336         Newx(ret->substrs, 1, struct reg_substr_data);
11337         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11338
11339         SvREFCNT_inc_void(ret->anchored_substr);
11340         SvREFCNT_inc_void(ret->anchored_utf8);
11341         SvREFCNT_inc_void(ret->float_substr);
11342         SvREFCNT_inc_void(ret->float_utf8);
11343
11344         /* check_substr and check_utf8, if non-NULL, point to either their
11345            anchored or float namesakes, and don't hold a second reference.  */
11346     }
11347     RX_MATCH_COPIED_off(ret_x);
11348 #ifdef PERL_OLD_COPY_ON_WRITE
11349     ret->saved_copy = NULL;
11350 #endif
11351     ret->mother_re = rx;
11352
11353     return ret_x;
11354 }
11355 #endif
11356
11357 /* regfree_internal()
11358
11359    Free the private data in a regexp. This is overloadable by
11360    extensions. Perl takes care of the regexp structure in pregfree(),
11361    this covers the *pprivate pointer which technically perl doesn't
11362    know about, however of course we have to handle the
11363    regexp_internal structure when no extension is in use.
11364
11365    Note this is called before freeing anything in the regexp
11366    structure.
11367  */
11368
11369 void
11370 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11371 {
11372     dVAR;
11373     struct regexp *const r = (struct regexp *)SvANY(rx);
11374     RXi_GET_DECL(r,ri);
11375     GET_RE_DEBUG_FLAGS_DECL;
11376
11377     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11378
11379     DEBUG_COMPILE_r({
11380         if (!PL_colorset)
11381             reginitcolors();
11382         {
11383             SV *dsv= sv_newmortal();
11384             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11385                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11386             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11387                 PL_colors[4],PL_colors[5],s);
11388         }
11389     });
11390 #ifdef RE_TRACK_PATTERN_OFFSETS
11391     if (ri->u.offsets)
11392         Safefree(ri->u.offsets);             /* 20010421 MJD */
11393 #endif
11394     if (ri->data) {
11395         int n = ri->data->count;
11396         PAD* new_comppad = NULL;
11397         PAD* old_comppad;
11398         PADOFFSET refcnt;
11399
11400         while (--n >= 0) {
11401           /* If you add a ->what type here, update the comment in regcomp.h */
11402             switch (ri->data->what[n]) {
11403             case 'a':
11404             case 's':
11405             case 'S':
11406             case 'u':
11407                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11408                 break;
11409             case 'f':
11410                 Safefree(ri->data->data[n]);
11411                 break;
11412             case 'p':
11413                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11414                 break;
11415             case 'o':
11416                 if (new_comppad == NULL)
11417                     Perl_croak(aTHX_ "panic: pregfree comppad");
11418                 PAD_SAVE_LOCAL(old_comppad,
11419                     /* Watch out for global destruction's random ordering. */
11420                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11421                 );
11422                 OP_REFCNT_LOCK;
11423                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11424                 OP_REFCNT_UNLOCK;
11425                 if (!refcnt)
11426                     op_free((OP_4tree*)ri->data->data[n]);
11427
11428                 PAD_RESTORE_LOCAL(old_comppad);
11429                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11430                 new_comppad = NULL;
11431                 break;
11432             case 'n':
11433                 break;
11434             case 'T':
11435                 { /* Aho Corasick add-on structure for a trie node.
11436                      Used in stclass optimization only */
11437                     U32 refcount;
11438                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11439                     OP_REFCNT_LOCK;
11440                     refcount = --aho->refcount;
11441                     OP_REFCNT_UNLOCK;
11442                     if ( !refcount ) {
11443                         PerlMemShared_free(aho->states);
11444                         PerlMemShared_free(aho->fail);
11445                          /* do this last!!!! */
11446                         PerlMemShared_free(ri->data->data[n]);
11447                         PerlMemShared_free(ri->regstclass);
11448                     }
11449                 }
11450                 break;
11451             case 't':
11452                 {
11453                     /* trie structure. */
11454                     U32 refcount;
11455                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11456                     OP_REFCNT_LOCK;
11457                     refcount = --trie->refcount;
11458                     OP_REFCNT_UNLOCK;
11459                     if ( !refcount ) {
11460                         PerlMemShared_free(trie->charmap);
11461                         PerlMemShared_free(trie->states);
11462                         PerlMemShared_free(trie->trans);
11463                         if (trie->bitmap)
11464                             PerlMemShared_free(trie->bitmap);
11465                         if (trie->jump)
11466                             PerlMemShared_free(trie->jump);
11467                         PerlMemShared_free(trie->wordinfo);
11468                         /* do this last!!!! */
11469                         PerlMemShared_free(ri->data->data[n]);
11470                     }
11471                 }
11472                 break;
11473             default:
11474                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11475             }
11476         }
11477         Safefree(ri->data->what);
11478         Safefree(ri->data);
11479     }
11480
11481     Safefree(ri);
11482 }
11483
11484 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11485 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11486 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11487
11488 /*
11489    re_dup - duplicate a regexp.
11490
11491    This routine is expected to clone a given regexp structure. It is only
11492    compiled under USE_ITHREADS.
11493
11494    After all of the core data stored in struct regexp is duplicated
11495    the regexp_engine.dupe method is used to copy any private data
11496    stored in the *pprivate pointer. This allows extensions to handle
11497    any duplication it needs to do.
11498
11499    See pregfree() and regfree_internal() if you change anything here.
11500 */
11501 #if defined(USE_ITHREADS)
11502 #ifndef PERL_IN_XSUB_RE
11503 void
11504 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11505 {
11506     dVAR;
11507     I32 npar;
11508     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11509     struct regexp *ret = (struct regexp *)SvANY(dstr);
11510
11511     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11512
11513     npar = r->nparens+1;
11514     Newx(ret->offs, npar, regexp_paren_pair);
11515     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11516     if(ret->swap) {
11517         /* no need to copy these */
11518         Newx(ret->swap, npar, regexp_paren_pair);
11519     }
11520
11521     if (ret->substrs) {
11522         /* Do it this way to avoid reading from *r after the StructCopy().
11523            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11524            cache, it doesn't matter.  */
11525         const bool anchored = r->check_substr
11526             ? r->check_substr == r->anchored_substr
11527             : r->check_utf8 == r->anchored_utf8;
11528         Newx(ret->substrs, 1, struct reg_substr_data);
11529         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11530
11531         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11532         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11533         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11534         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11535
11536         /* check_substr and check_utf8, if non-NULL, point to either their
11537            anchored or float namesakes, and don't hold a second reference.  */
11538
11539         if (ret->check_substr) {
11540             if (anchored) {
11541                 assert(r->check_utf8 == r->anchored_utf8);
11542                 ret->check_substr = ret->anchored_substr;
11543                 ret->check_utf8 = ret->anchored_utf8;
11544             } else {
11545                 assert(r->check_substr == r->float_substr);
11546                 assert(r->check_utf8 == r->float_utf8);
11547                 ret->check_substr = ret->float_substr;
11548                 ret->check_utf8 = ret->float_utf8;
11549             }
11550         } else if (ret->check_utf8) {
11551             if (anchored) {
11552                 ret->check_utf8 = ret->anchored_utf8;
11553             } else {
11554                 ret->check_utf8 = ret->float_utf8;
11555             }
11556         }
11557     }
11558
11559     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11560
11561     if (ret->pprivate)
11562         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11563
11564     if (RX_MATCH_COPIED(dstr))
11565         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11566     else
11567         ret->subbeg = NULL;
11568 #ifdef PERL_OLD_COPY_ON_WRITE
11569     ret->saved_copy = NULL;
11570 #endif
11571
11572     if (ret->mother_re) {
11573         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11574             /* Our storage points directly to our mother regexp, but that's
11575                1: a buffer in a different thread
11576                2: something we no longer hold a reference on
11577                so we need to copy it locally.  */
11578             /* Note we need to sue SvCUR() on our mother_re, because it, in
11579                turn, may well be pointing to its own mother_re.  */
11580             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11581                                    SvCUR(ret->mother_re)+1));
11582             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11583         }
11584         ret->mother_re      = NULL;
11585     }
11586     ret->gofs = 0;
11587 }
11588 #endif /* PERL_IN_XSUB_RE */
11589
11590 /*
11591    regdupe_internal()
11592
11593    This is the internal complement to regdupe() which is used to copy
11594    the structure pointed to by the *pprivate pointer in the regexp.
11595    This is the core version of the extension overridable cloning hook.
11596    The regexp structure being duplicated will be copied by perl prior
11597    to this and will be provided as the regexp *r argument, however
11598    with the /old/ structures pprivate pointer value. Thus this routine
11599    may override any copying normally done by perl.
11600
11601    It returns a pointer to the new regexp_internal structure.
11602 */
11603
11604 void *
11605 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11606 {
11607     dVAR;
11608     struct regexp *const r = (struct regexp *)SvANY(rx);
11609     regexp_internal *reti;
11610     int len, npar;
11611     RXi_GET_DECL(r,ri);
11612
11613     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11614
11615     npar = r->nparens+1;
11616     len = ProgLen(ri);
11617
11618     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11619     Copy(ri->program, reti->program, len+1, regnode);
11620
11621
11622     reti->regstclass = NULL;
11623
11624     if (ri->data) {
11625         struct reg_data *d;
11626         const int count = ri->data->count;
11627         int i;
11628
11629         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11630                 char, struct reg_data);
11631         Newx(d->what, count, U8);
11632
11633         d->count = count;
11634         for (i = 0; i < count; i++) {
11635             d->what[i] = ri->data->what[i];
11636             switch (d->what[i]) {
11637                 /* legal options are one of: sSfpontTua
11638                    see also regcomp.h and pregfree() */
11639             case 'a': /* actually an AV, but the dup function is identical.  */
11640             case 's':
11641             case 'S':
11642             case 'p': /* actually an AV, but the dup function is identical.  */
11643             case 'u': /* actually an HV, but the dup function is identical.  */
11644                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11645                 break;
11646             case 'f':
11647                 /* This is cheating. */
11648                 Newx(d->data[i], 1, struct regnode_charclass_class);
11649                 StructCopy(ri->data->data[i], d->data[i],
11650                             struct regnode_charclass_class);
11651                 reti->regstclass = (regnode*)d->data[i];
11652                 break;
11653             case 'o':
11654                 /* Compiled op trees are readonly and in shared memory,
11655                    and can thus be shared without duplication. */
11656                 OP_REFCNT_LOCK;
11657                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11658                 OP_REFCNT_UNLOCK;
11659                 break;
11660             case 'T':
11661                 /* Trie stclasses are readonly and can thus be shared
11662                  * without duplication. We free the stclass in pregfree
11663                  * when the corresponding reg_ac_data struct is freed.
11664                  */
11665                 reti->regstclass= ri->regstclass;
11666                 /* Fall through */
11667             case 't':
11668                 OP_REFCNT_LOCK;
11669                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11670                 OP_REFCNT_UNLOCK;
11671                 /* Fall through */
11672             case 'n':
11673                 d->data[i] = ri->data->data[i];
11674                 break;
11675             default:
11676                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11677             }
11678         }
11679
11680         reti->data = d;
11681     }
11682     else
11683         reti->data = NULL;
11684
11685     reti->name_list_idx = ri->name_list_idx;
11686
11687 #ifdef RE_TRACK_PATTERN_OFFSETS
11688     if (ri->u.offsets) {
11689         Newx(reti->u.offsets, 2*len+1, U32);
11690         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11691     }
11692 #else
11693     SetProgLen(reti,len);
11694 #endif
11695
11696     return (void*)reti;
11697 }
11698
11699 #endif    /* USE_ITHREADS */
11700
11701 #ifndef PERL_IN_XSUB_RE
11702
11703 /*
11704  - regnext - dig the "next" pointer out of a node
11705  */
11706 regnode *
11707 Perl_regnext(pTHX_ register regnode *p)
11708 {
11709     dVAR;
11710     register I32 offset;
11711
11712     if (!p)
11713         return(NULL);
11714
11715     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11716         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11717     }
11718
11719     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11720     if (offset == 0)
11721         return(NULL);
11722
11723     return(p+offset);
11724 }
11725 #endif
11726
11727 STATIC void
11728 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11729 {
11730     va_list args;
11731     STRLEN l1 = strlen(pat1);
11732     STRLEN l2 = strlen(pat2);
11733     char buf[512];
11734     SV *msv;
11735     const char *message;
11736
11737     PERL_ARGS_ASSERT_RE_CROAK2;
11738
11739     if (l1 > 510)
11740         l1 = 510;
11741     if (l1 + l2 > 510)
11742         l2 = 510 - l1;
11743     Copy(pat1, buf, l1 , char);
11744     Copy(pat2, buf + l1, l2 , char);
11745     buf[l1 + l2] = '\n';
11746     buf[l1 + l2 + 1] = '\0';
11747 #ifdef I_STDARG
11748     /* ANSI variant takes additional second argument */
11749     va_start(args, pat2);
11750 #else
11751     va_start(args);
11752 #endif
11753     msv = vmess(buf, &args);
11754     va_end(args);
11755     message = SvPV_const(msv,l1);
11756     if (l1 > 512)
11757         l1 = 512;
11758     Copy(message, buf, l1 , char);
11759     buf[l1-1] = '\0';                   /* Overwrite \n */
11760     Perl_croak(aTHX_ "%s", buf);
11761 }
11762
11763 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11764
11765 #ifndef PERL_IN_XSUB_RE
11766 void
11767 Perl_save_re_context(pTHX)
11768 {
11769     dVAR;
11770
11771     struct re_save_state *state;
11772
11773     SAVEVPTR(PL_curcop);
11774     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11775
11776     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11777     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11778     SSPUSHUV(SAVEt_RE_STATE);
11779
11780     Copy(&PL_reg_state, state, 1, struct re_save_state);
11781
11782     PL_reg_start_tmp = 0;
11783     PL_reg_start_tmpl = 0;
11784     PL_reg_oldsaved = NULL;
11785     PL_reg_oldsavedlen = 0;
11786     PL_reg_maxiter = 0;
11787     PL_reg_leftiter = 0;
11788     PL_reg_poscache = NULL;
11789     PL_reg_poscache_size = 0;
11790 #ifdef PERL_OLD_COPY_ON_WRITE
11791     PL_nrs = NULL;
11792 #endif
11793
11794     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11795     if (PL_curpm) {
11796         const REGEXP * const rx = PM_GETRE(PL_curpm);
11797         if (rx) {
11798             U32 i;
11799             for (i = 1; i <= RX_NPARENS(rx); i++) {
11800                 char digits[TYPE_CHARS(long)];
11801                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11802                 GV *const *const gvp
11803                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11804
11805                 if (gvp) {
11806                     GV * const gv = *gvp;
11807                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11808                         save_scalar(gv);
11809                 }
11810             }
11811         }
11812     }
11813 }
11814 #endif
11815
11816 static void
11817 clear_re(pTHX_ void *r)
11818 {
11819     dVAR;
11820     ReREFCNT_dec((REGEXP *)r);
11821 }
11822
11823 #ifdef DEBUGGING
11824
11825 STATIC void
11826 S_put_byte(pTHX_ SV *sv, int c)
11827 {
11828     PERL_ARGS_ASSERT_PUT_BYTE;
11829
11830     /* Our definition of isPRINT() ignores locales, so only bytes that are
11831        not part of UTF-8 are considered printable. I assume that the same
11832        holds for UTF-EBCDIC.
11833        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11834        which Wikipedia says:
11835
11836        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11837        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11838        identical, to the ASCII delete (DEL) or rubout control character.
11839        ) So the old condition can be simplified to !isPRINT(c)  */
11840     if (!isPRINT(c)) {
11841         if (c < 256) {
11842             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11843         }
11844         else {
11845             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11846         }
11847     }
11848     else {
11849         const char string = c;
11850         if (c == '-' || c == ']' || c == '\\' || c == '^')
11851             sv_catpvs(sv, "\\");
11852         sv_catpvn(sv, &string, 1);
11853     }
11854 }
11855
11856
11857 #define CLEAR_OPTSTART \
11858     if (optstart) STMT_START { \
11859             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11860             optstart=NULL; \
11861     } STMT_END
11862
11863 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11864
11865 STATIC const regnode *
11866 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11867             const regnode *last, const regnode *plast,
11868             SV* sv, I32 indent, U32 depth)
11869 {
11870     dVAR;
11871     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11872     register const regnode *next;
11873     const regnode *optstart= NULL;
11874
11875     RXi_GET_DECL(r,ri);
11876     GET_RE_DEBUG_FLAGS_DECL;
11877
11878     PERL_ARGS_ASSERT_DUMPUNTIL;
11879
11880 #ifdef DEBUG_DUMPUNTIL
11881     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11882         last ? last-start : 0,plast ? plast-start : 0);
11883 #endif
11884
11885     if (plast && plast < last)
11886         last= plast;
11887
11888     while (PL_regkind[op] != END && (!last || node < last)) {
11889         /* While that wasn't END last time... */
11890         NODE_ALIGN(node);
11891         op = OP(node);
11892         if (op == CLOSE || op == WHILEM)
11893             indent--;
11894         next = regnext((regnode *)node);
11895
11896         /* Where, what. */
11897         if (OP(node) == OPTIMIZED) {
11898             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11899                 optstart = node;
11900             else
11901                 goto after_print;
11902         } else
11903             CLEAR_OPTSTART;
11904
11905         regprop(r, sv, node);
11906         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11907                       (int)(2*indent + 1), "", SvPVX_const(sv));
11908
11909         if (OP(node) != OPTIMIZED) {
11910             if (next == NULL)           /* Next ptr. */
11911                 PerlIO_printf(Perl_debug_log, " (0)");
11912             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11913                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11914             else
11915                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11916             (void)PerlIO_putc(Perl_debug_log, '\n');
11917         }
11918
11919       after_print:
11920         if (PL_regkind[(U8)op] == BRANCHJ) {
11921             assert(next);
11922             {
11923                 register const regnode *nnode = (OP(next) == LONGJMP
11924                                              ? regnext((regnode *)next)
11925                                              : next);
11926                 if (last && nnode > last)
11927                     nnode = last;
11928                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11929             }
11930         }
11931         else if (PL_regkind[(U8)op] == BRANCH) {
11932             assert(next);
11933             DUMPUNTIL(NEXTOPER(node), next);
11934         }
11935         else if ( PL_regkind[(U8)op]  == TRIE ) {
11936             const regnode *this_trie = node;
11937             const char op = OP(node);
11938             const U32 n = ARG(node);
11939             const reg_ac_data * const ac = op>=AHOCORASICK ?
11940                (reg_ac_data *)ri->data->data[n] :
11941                NULL;
11942             const reg_trie_data * const trie =
11943                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11944 #ifdef DEBUGGING
11945             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11946 #endif
11947             const regnode *nextbranch= NULL;
11948             I32 word_idx;
11949             sv_setpvs(sv, "");
11950             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11951                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11952
11953                 PerlIO_printf(Perl_debug_log, "%*s%s ",
11954                    (int)(2*(indent+3)), "",
11955                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
11956                             PL_colors[0], PL_colors[1],
11957                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
11958                             PERL_PV_PRETTY_ELLIPSES    |
11959                             PERL_PV_PRETTY_LTGT
11960                             )
11961                             : "???"
11962                 );
11963                 if (trie->jump) {
11964                     U16 dist= trie->jump[word_idx+1];
11965                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11966                                   (UV)((dist ? this_trie + dist : next) - start));
11967                     if (dist) {
11968                         if (!nextbranch)
11969                             nextbranch= this_trie + trie->jump[0];
11970                         DUMPUNTIL(this_trie + dist, nextbranch);
11971                     }
11972                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11973                         nextbranch= regnext((regnode *)nextbranch);
11974                 } else {
11975                     PerlIO_printf(Perl_debug_log, "\n");
11976                 }
11977             }
11978             if (last && next > last)
11979                 node= last;
11980             else
11981                 node= next;
11982         }
11983         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
11984             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11985                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
11986         }
11987         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
11988             assert(next);
11989             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
11990         }
11991         else if ( op == PLUS || op == STAR) {
11992             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
11993         }
11994         else if (PL_regkind[(U8)op] == ANYOF) {
11995             /* arglen 1 + class block */
11996             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
11997                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11998             node = NEXTOPER(node);
11999         }
12000         else if (PL_regkind[(U8)op] == EXACT) {
12001             /* Literal string, where present. */
12002             node += NODE_SZ_STR(node) - 1;
12003             node = NEXTOPER(node);
12004         }
12005         else {
12006             node = NEXTOPER(node);
12007             node += regarglen[(U8)op];
12008         }
12009         if (op == CURLYX || op == OPEN)
12010             indent++;
12011     }
12012     CLEAR_OPTSTART;
12013 #ifdef DEBUG_DUMPUNTIL
12014     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12015 #endif
12016     return node;
12017 }
12018
12019 #endif  /* DEBUGGING */
12020
12021 /*
12022  * Local variables:
12023  * c-indentation-style: bsd
12024  * c-basic-offset: 4
12025  * indent-tabs-mode: t
12026  * End:
12027  *
12028  * ex: set ts=8 sts=4 sw=4 noet:
12029  */