regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was committed with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 373 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 374
 375 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 376
 377 #define OOB_UNICODE             12345678
 378 #define OOB_NAMEDCLASS          -1
 379
 380 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 381 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 382
 383
 384 /* length of regex to show in messages that don't mark a position within */
 385 #define RegexLengthToShowInErrorMessages 127
 386
 387 /*
 388  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 389  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 390  * op/pragma/warn/regcomp.
 391  */
 392 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 393 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 394
 395 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 396
 397 /*
 398  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 399  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 400  * "...".
 401  */
 402 #define _FAIL(code) STMT_START {                                        \
 403     const char *ellipses = "";                                          \
 404     IV len = RExC_end - RExC_precomp;                                   \
 405                                                                         \
 406     if (!SIZE_ONLY)                                                     \
 407         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 408     if (len > RegexLengthToShowInErrorMessages) {                       \
 409         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 410         len = RegexLengthToShowInErrorMessages - 10;                    \
 411         ellipses = "...";                                               \
 412     }                                                                   \
 413     code;                                                               \
 414 } STMT_END
 415
 416 #define FAIL(msg) _FAIL(                            \
 417     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 418             msg, (int)len, RExC_precomp, ellipses))
 419
 420 #define FAIL2(msg,arg) _FAIL(                       \
 421     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 422             arg, (int)len, RExC_precomp, ellipses))
 423
 424 /*
 425  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 426  */
 427 #define Simple_vFAIL(m) STMT_START {                                    \
 428     const IV offset = RExC_parse - RExC_precomp;                        \
 429     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 430             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 431 } STMT_END
 432
 433 /*
 434  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 435  */
 436 #define vFAIL(m) STMT_START {                           \
 437     if (!SIZE_ONLY)                                     \
 438         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 439     Simple_vFAIL(m);                                    \
 440 } STMT_END
 441
 442 /*
 443  * Like Simple_vFAIL(), but accepts two arguments.
 444  */
 445 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 446     const IV offset = RExC_parse - RExC_precomp;                        \
 447     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 448             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 449 } STMT_END
 450
 451 /*
 452  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 453  */
 454 #define vFAIL2(m,a1) STMT_START {                       \
 455     if (!SIZE_ONLY)                                     \
 456         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 457     Simple_vFAIL2(m, a1);                               \
 458 } STMT_END
 459
 460
 461 /*
 462  * Like Simple_vFAIL(), but accepts three arguments.
 463  */
 464 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 465     const IV offset = RExC_parse - RExC_precomp;                \
 466     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 467             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 468 } STMT_END
 469
 470 /*
 471  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 472  */
 473 #define vFAIL3(m,a1,a2) STMT_START {                    \
 474     if (!SIZE_ONLY)                                     \
 475         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 476     Simple_vFAIL3(m, a1, a2);                           \
 477 } STMT_END
 478
 479 /*
 480  * Like Simple_vFAIL(), but accepts four arguments.
 481  */
 482 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 483     const IV offset = RExC_parse - RExC_precomp;                \
 484     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 485             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 486 } STMT_END
 487
 488 #define ckWARNreg(loc,m) STMT_START {                                   \
 489     const IV offset = loc - RExC_precomp;                               \
 490     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 491             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 492 } STMT_END
 493
 494 #define ckWARNregdep(loc,m) STMT_START {                                \
 495     const IV offset = loc - RExC_precomp;                               \
 496     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 497             m REPORT_LOCATION,                                          \
 498             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 499 } STMT_END
 500
 501 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 504             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 505 } STMT_END
 506
 507 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 510             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 511 } STMT_END
 512
 513 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 514     const IV offset = loc - RExC_precomp;                               \
 515     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 516             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 517 } STMT_END
 518
 519 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 522             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 523 } STMT_END
 524
 525 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 526     const IV offset = loc - RExC_precomp;                               \
 527     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 528             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 529 } STMT_END
 530
 531 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 534             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 535 } STMT_END
 536
 537
 538 /* Allow for side effects in s */
 539 #define REGC(c,s) STMT_START {                  \
 540     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 541 } STMT_END
 542
 543 /* Macros for recording node offsets.   20001227 mjd@plover.com
 544  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 545  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 546  * Element 0 holds the number n.
 547  * Position is 1 indexed.
 548  */
 549 #ifndef RE_TRACK_PATTERN_OFFSETS
 550 #define Set_Node_Offset_To_R(node,byte)
 551 #define Set_Node_Offset(node,byte)
 552 #define Set_Cur_Node_Offset
 553 #define Set_Node_Length_To_R(node,len)
 554 #define Set_Node_Length(node,len)
 555 #define Set_Node_Cur_Length(node)
 556 #define Node_Offset(n)
 557 #define Node_Length(n)
 558 #define Set_Node_Offset_Length(node,offset,len)
 559 #define ProgLen(ri) ri->u.proglen
 560 #define SetProgLen(ri,x) ri->u.proglen = x
 561 #else
 562 #define ProgLen(ri) ri->u.offsets[0]
 563 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 564 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 565     if (! SIZE_ONLY) {                                                  \
 566         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 567                     __LINE__, (int)(node), (int)(byte)));               \
 568         if((node) < 0) {                                                \
 569             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 570         } else {                                                        \
 571             RExC_offsets[2*(node)-1] = (byte);                          \
 572         }                                                               \
 573     }                                                                   \
 574 } STMT_END
 575
 576 #define Set_Node_Offset(node,byte) \
 577     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 578 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 579
 580 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 581     if (! SIZE_ONLY) {                                                  \
 582         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 583                 __LINE__, (int)(node), (int)(len)));                    \
 584         if((node) < 0) {                                                \
 585             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 586         } else {                                                        \
 587             RExC_offsets[2*(node)] = (len);                             \
 588         }                                                               \
 589     }                                                                   \
 590 } STMT_END
 591
 592 #define Set_Node_Length(node,len) \
 593     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 594 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 595 #define Set_Node_Cur_Length(node) \
 596     Set_Node_Length(node, RExC_parse - parse_start)
 597
 598 /* Get offsets and lengths */
 599 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 600 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 601
 602 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 603     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 604     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 605 } STMT_END
 606 #endif
 607
 608 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 609 #define EXPERIMENTAL_INPLACESCAN
 610 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 611
 612 #define DEBUG_STUDYDATA(str,data,depth)                              \
 613 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 614     PerlIO_printf(Perl_debug_log,                                    \
 615         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 616         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 617         (int)(depth)*2, "",                                          \
 618         (IV)((data)->pos_min),                                       \
 619         (IV)((data)->pos_delta),                                     \
 620         (UV)((data)->flags),                                         \
 621         (IV)((data)->whilem_c),                                      \
 622         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 623         is_inf ? "INF " : ""                                         \
 624     );                                                               \
 625     if ((data)->last_found)                                          \
 626         PerlIO_printf(Perl_debug_log,                                \
 627             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 628             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 629             SvPVX_const((data)->last_found),                         \
 630             (IV)((data)->last_end),                                  \
 631             (IV)((data)->last_start_min),                            \
 632             (IV)((data)->last_start_max),                            \
 633             ((data)->longest &&                                      \
 634              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 635             SvPVX_const((data)->longest_fixed),                      \
 636             (IV)((data)->offset_fixed),                              \
 637             ((data)->longest &&                                      \
 638              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 639             SvPVX_const((data)->longest_float),                      \
 640             (IV)((data)->offset_float_min),                          \
 641             (IV)((data)->offset_float_max)                           \
 642         );                                                           \
 643     PerlIO_printf(Perl_debug_log,"\n");                              \
 644 });
 645
 646 static void clear_re(pTHX_ void *r);
 647
 648 /* Mark that we cannot extend a found fixed substring at this point.
 649    Update the longest found anchored substring and the longest found
 650    floating substrings if needed. */
 651
 652 STATIC void
 653 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 654 {
 655     const STRLEN l = CHR_SVLEN(data->last_found);
 656     const STRLEN old_l = CHR_SVLEN(*data->longest);
 657     GET_RE_DEBUG_FLAGS_DECL;
 658
 659     PERL_ARGS_ASSERT_SCAN_COMMIT;
 660
 661     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 662         SvSetMagicSV(*data->longest, data->last_found);
 663         if (*data->longest == data->longest_fixed) {
 664             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 665             if (data->flags & SF_BEFORE_EOL)
 666                 data->flags
 667                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 668             else
 669                 data->flags &= ~SF_FIX_BEFORE_EOL;
 670             data->minlen_fixed=minlenp;
 671             data->lookbehind_fixed=0;
 672         }
 673         else { /* *data->longest == data->longest_float */
 674             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 675             data->offset_float_max = (l
 676                                       ? data->last_start_max
 677                                       : data->pos_min + data->pos_delta);
 678             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 679                 data->offset_float_max = I32_MAX;
 680             if (data->flags & SF_BEFORE_EOL)
 681                 data->flags
 682                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 683             else
 684                 data->flags &= ~SF_FL_BEFORE_EOL;
 685             data->minlen_float=minlenp;
 686             data->lookbehind_float=0;
 687         }
 688     }
 689     SvCUR_set(data->last_found, 0);
 690     {
 691         SV * const sv = data->last_found;
 692         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 693             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 694             if (mg)
 695                 mg->mg_len = 0;
 696         }
 697     }
 698     data->last_end = -1;
 699     data->flags &= ~SF_BEFORE_EOL;
 700     DEBUG_STUDYDATA("commit: ",data,0);
 701 }
 702
 703 /* Can match anything (initialization) */
 704 STATIC void
 705 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 706 {
 707     PERL_ARGS_ASSERT_CL_ANYTHING;
 708
 709     ANYOF_CLASS_ZERO(cl);
 710     ANYOF_BITMAP_SETALL(cl);
 711     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 712     if (LOC)
 713         cl->flags |= ANYOF_LOCALE;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766
 767     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 768         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 769         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 770         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 771         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 772         int i;
 773
 774         if (and_with->flags & ANYOF_INVERT)
 775             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 776                 cl->bitmap[i] &= ~and_with->bitmap[i];
 777         else
 778             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 779                 cl->bitmap[i] &= and_with->bitmap[i];
 780     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 781     if (!(and_with->flags & ANYOF_EOS))
 782         cl->flags &= ~ANYOF_EOS;
 783
 784     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 785         cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
 786     if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 787         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 788
 789     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 790         !(and_with->flags & ANYOF_INVERT)) {
 791         cl->flags &= ~ANYOF_UNICODE_ALL;
 792         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 793                                                            only the one(s)
 794                                                            actually set */
 795         ARG_SET(cl, ARG(and_with));
 796     }
 797     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 798         !(and_with->flags & ANYOF_INVERT))
 799         cl->flags &= ~ANYOF_UNICODE_ALL;
 800     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 801         !(and_with->flags & ANYOF_INVERT))
 802         cl->flags &= ~ANYOF_NONBITMAP;
 803 }
 804
 805 /* 'OR' a given class with another one.  Can create false positives */
 806 /* We assume that cl is not inverted */
 807 STATIC void
 808 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 809 {
 810     PERL_ARGS_ASSERT_CL_OR;
 811
 812     if (or_with->flags & ANYOF_INVERT) {
 813         /* We do not use
 814          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 815          *   <= (B1 | !B2) | (CL1 | !CL2)
 816          * which is wasteful if CL2 is small, but we ignore CL2:
 817          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 818          * XXXX Can we handle case-fold?  Unclear:
 819          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 820          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 821          */
 822         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 823              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 824              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 825             int i;
 826
 827             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 828                 cl->bitmap[i] |= ~or_with->bitmap[i];
 829         } /* XXXX: logic is complicated otherwise */
 830         else {
 831             cl_anything(pRExC_state, cl);
 832         }
 833     } else {
 834         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 835         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 836              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 837                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 838             int i;
 839
 840             /* OR char bitmap and class bitmap separately */
 841             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 842                 cl->bitmap[i] |= or_with->bitmap[i];
 843             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 844                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 845                     cl->classflags[i] |= or_with->classflags[i];
 846                 cl->flags |= ANYOF_CLASS;
 847             }
 848         }
 849         else { /* XXXX: logic is complicated, leave it along for a moment. */
 850             cl_anything(pRExC_state, cl);
 851         }
 852     }
 853     if (or_with->flags & ANYOF_EOS)
 854         cl->flags |= ANYOF_EOS;
 855     if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 856         cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 857
 858     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 859         cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 860
 861     /* If both nodes match something outside the bitmap, but what they match
 862      * outside is not the same pointer, and hence not easily compared, give up
 863      * and allow the start class to match everything outside the bitmap */
 864     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 865         ARG(cl) != ARG(or_with)) {
 866         cl->flags |= ANYOF_UNICODE_ALL;
 867     }
 868
 869     if (or_with->flags & ANYOF_UNICODE_ALL) {
 870         cl->flags |= ANYOF_UNICODE_ALL;
 871     }
 872 }
 873
 874 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 875 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 876 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 877 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 878
 879
 880 #ifdef DEBUGGING
 881 /*
 882    dump_trie(trie,widecharmap,revcharmap)
 883    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 884    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 885
 886    These routines dump out a trie in a somewhat readable format.
 887    The _interim_ variants are used for debugging the interim
 888    tables that are used to generate the final compressed
 889    representation which is what dump_trie expects.
 890
 891    Part of the reason for their existence is to provide a form
 892    of documentation as to how the different representations function.
 893
 894 */
 895
 896 /*
 897   Dumps the final compressed table form of the trie to Perl_debug_log.
 898   Used for debugging make_trie().
 899 */
 900
 901 STATIC void
 902 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 903             AV *revcharmap, U32 depth)
 904 {
 905     U32 state;
 906     SV *sv=sv_newmortal();
 907     int colwidth= widecharmap ? 6 : 4;
 908     U16 word;
 909     GET_RE_DEBUG_FLAGS_DECL;
 910
 911     PERL_ARGS_ASSERT_DUMP_TRIE;
 912
 913     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 914         (int)depth * 2 + 2,"",
 915         "Match","Base","Ofs" );
 916
 917     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 918         SV ** const tmp = av_fetch( revcharmap, state, 0);
 919         if ( tmp ) {
 920             PerlIO_printf( Perl_debug_log, "%*s",
 921                 colwidth,
 922                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 923                             PL_colors[0], PL_colors[1],
 924                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 925                             PERL_PV_ESCAPE_FIRSTCHAR
 926                 )
 927             );
 928         }
 929     }
 930     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 931         (int)depth * 2 + 2,"");
 932
 933     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 934         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 935     PerlIO_printf( Perl_debug_log, "\n");
 936
 937     for( state = 1 ; state < trie->statecount ; state++ ) {
 938         const U32 base = trie->states[ state ].trans.base;
 939
 940         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 941
 942         if ( trie->states[ state ].wordnum ) {
 943             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 944         } else {
 945             PerlIO_printf( Perl_debug_log, "%6s", "" );
 946         }
 947
 948         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 949
 950         if ( base ) {
 951             U32 ofs = 0;
 952
 953             while( ( base + ofs  < trie->uniquecharcount ) ||
 954                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 955                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 956                     ofs++;
 957
 958             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 959
 960             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 961                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 962                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 963                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 964                 {
 965                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 966                     colwidth,
 967                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 968                 } else {
 969                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 970                 }
 971             }
 972
 973             PerlIO_printf( Perl_debug_log, "]");
 974
 975         }
 976         PerlIO_printf( Perl_debug_log, "\n" );
 977     }
 978     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 979     for (word=1; word <= trie->wordcount; word++) {
 980         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 981             (int)word, (int)(trie->wordinfo[word].prev),
 982             (int)(trie->wordinfo[word].len));
 983     }
 984     PerlIO_printf(Perl_debug_log, "\n" );
 985 }
 986 /*
 987   Dumps a fully constructed but uncompressed trie in list form.
 988   List tries normally only are used for construction when the number of
 989   possible chars (trie->uniquecharcount) is very high.
 990   Used for debugging make_trie().
 991 */
 992 STATIC void
 993 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 994                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 995                          U32 depth)
 996 {
 997     U32 state;
 998     SV *sv=sv_newmortal();
 999     int colwidth= widecharmap ? 6 : 4;
1000     GET_RE_DEBUG_FLAGS_DECL;
1001
1002     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1003
1004     /* print out the table precompression.  */
1005     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1006         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1007         "------:-----+-----------------\n" );
1008
1009     for( state=1 ; state < next_alloc ; state ++ ) {
1010         U16 charid;
1011
1012         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1013             (int)depth * 2 + 2,"", (UV)state  );
1014         if ( ! trie->states[ state ].wordnum ) {
1015             PerlIO_printf( Perl_debug_log, "%5s| ","");
1016         } else {
1017             PerlIO_printf( Perl_debug_log, "W%4x| ",
1018                 trie->states[ state ].wordnum
1019             );
1020         }
1021         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1022             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1023             if ( tmp ) {
1024                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1025                     colwidth,
1026                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1027                             PL_colors[0], PL_colors[1],
1028                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1029                             PERL_PV_ESCAPE_FIRSTCHAR
1030                     ) ,
1031                     TRIE_LIST_ITEM(state,charid).forid,
1032                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1033                 );
1034                 if (!(charid % 10))
1035                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1036                         (int)((depth * 2) + 14), "");
1037             }
1038         }
1039         PerlIO_printf( Perl_debug_log, "\n");
1040     }
1041 }
1042
1043 /*
1044   Dumps a fully constructed but uncompressed trie in table form.
1045   This is the normal DFA style state transition table, with a few
1046   twists to facilitate compression later.
1047   Used for debugging make_trie().
1048 */
1049 STATIC void
1050 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1051                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1052                           U32 depth)
1053 {
1054     U32 state;
1055     U16 charid;
1056     SV *sv=sv_newmortal();
1057     int colwidth= widecharmap ? 6 : 4;
1058     GET_RE_DEBUG_FLAGS_DECL;
1059
1060     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1061
1062     /*
1063        print out the table precompression so that we can do a visual check
1064        that they are identical.
1065      */
1066
1067     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1068
1069     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1070         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1071         if ( tmp ) {
1072             PerlIO_printf( Perl_debug_log, "%*s",
1073                 colwidth,
1074                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1075                             PL_colors[0], PL_colors[1],
1076                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1077                             PERL_PV_ESCAPE_FIRSTCHAR
1078                 )
1079             );
1080         }
1081     }
1082
1083     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1084
1085     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1086         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1087     }
1088
1089     PerlIO_printf( Perl_debug_log, "\n" );
1090
1091     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1092
1093         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1094             (int)depth * 2 + 2,"",
1095             (UV)TRIE_NODENUM( state ) );
1096
1097         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1098             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1099             if (v)
1100                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1101             else
1102                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1103         }
1104         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1105             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1106         } else {
1107             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1108             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1109         }
1110     }
1111 }
1112
1113 #endif
1114
1115
1116 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1117   startbranch: the first branch in the whole branch sequence
1118   first      : start branch of sequence of branch-exact nodes.
1119                May be the same as startbranch
1120   last       : Thing following the last branch.
1121                May be the same as tail.
1122   tail       : item following the branch sequence
1123   count      : words in the sequence
1124   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1125   depth      : indent depth
1126
1127 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1128
1129 A trie is an N'ary tree where the branches are determined by digital
1130 decomposition of the key. IE, at the root node you look up the 1st character and
1131 follow that branch repeat until you find the end of the branches. Nodes can be
1132 marked as "accepting" meaning they represent a complete word. Eg:
1133
1134   /he|she|his|hers/
1135
1136 would convert into the following structure. Numbers represent states, letters
1137 following numbers represent valid transitions on the letter from that state, if
1138 the number is in square brackets it represents an accepting state, otherwise it
1139 will be in parenthesis.
1140
1141       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1142       |    |
1143       |   (2)
1144       |    |
1145      (1)   +-i->(6)-+-s->[7]
1146       |
1147       +-s->(3)-+-h->(4)-+-e->[5]
1148
1149       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1150
1151 This shows that when matching against the string 'hers' we will begin at state 1
1152 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1153 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1154 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1155 single traverse. We store a mapping from accepting to state to which word was
1156 matched, and then when we have multiple possibilities we try to complete the
1157 rest of the regex in the order in which they occured in the alternation.
1158
1159 The only prior NFA like behaviour that would be changed by the TRIE support is
1160 the silent ignoring of duplicate alternations which are of the form:
1161
1162  / (DUPE|DUPE) X? (?{ ... }) Y /x
1163
1164 Thus EVAL blocks following a trie may be called a different number of times with
1165 and without the optimisation. With the optimisations dupes will be silently
1166 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1167 the following demonstrates:
1168
1169  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1170
1171 which prints out 'word' three times, but
1172
1173  'words'=~/(word|word|word)(?{ print $1 })S/
1174
1175 which doesnt print it out at all. This is due to other optimisations kicking in.
1176
1177 Example of what happens on a structural level:
1178
1179 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1180
1181    1: CURLYM[1] {1,32767}(18)
1182    5:   BRANCH(8)
1183    6:     EXACT <ac>(16)
1184    8:   BRANCH(11)
1185    9:     EXACT <ad>(16)
1186   11:   BRANCH(14)
1187   12:     EXACT <ab>(16)
1188   16:   SUCCEED(0)
1189   17:   NOTHING(18)
1190   18: END(0)
1191
1192 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1193 and should turn into:
1194
1195    1: CURLYM[1] {1,32767}(18)
1196    5:   TRIE(16)
1197         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1198           <ac>
1199           <ad>
1200           <ab>
1201   16:   SUCCEED(0)
1202   17:   NOTHING(18)
1203   18: END(0)
1204
1205 Cases where tail != last would be like /(?foo|bar)baz/:
1206
1207    1: BRANCH(4)
1208    2:   EXACT <foo>(8)
1209    4: BRANCH(7)
1210    5:   EXACT <bar>(8)
1211    7: TAIL(8)
1212    8: EXACT <baz>(10)
1213   10: END(0)
1214
1215 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1216 and would end up looking like:
1217
1218     1: TRIE(8)
1219       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1220         <foo>
1221         <bar>
1222    7: TAIL(8)
1223    8: EXACT <baz>(10)
1224   10: END(0)
1225
1226     d = uvuni_to_utf8_flags(d, uv, 0);
1227
1228 is the recommended Unicode-aware way of saying
1229
1230     *(d++) = uv;
1231 */
1232
1233 #define TRIE_STORE_REVCHAR                                                 \
1234     STMT_START {                                                           \
1235         if (UTF) {                                                         \
1236             SV *zlopp = newSV(2);                                          \
1237             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1238             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1239             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1240             SvPOK_on(zlopp);                                               \
1241             SvUTF8_on(zlopp);                                              \
1242             av_push(revcharmap, zlopp);                                    \
1243         } else {                                                           \
1244             char ooooff = (char)uvc;                                               \
1245             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1246         }                                                                  \
1247         } STMT_END
1248
1249 #define TRIE_READ_CHAR STMT_START {                                           \
1250     wordlen++;                                                                \
1251     if ( UTF ) {                                                              \
1252         if ( folder ) {                                                       \
1253             if ( foldlen > 0 ) {                                              \
1254                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1255                foldlen -= len;                                                \
1256                scan += len;                                                   \
1257                len = 0;                                                       \
1258             } else {                                                          \
1259                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1260                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1261                 foldlen -= UNISKIP( uvc );                                    \
1262                 scan = foldbuf + UNISKIP( uvc );                              \
1263             }                                                                 \
1264         } else {                                                              \
1265             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1266         }                                                                     \
1267     } else {                                                                  \
1268         uvc = (U32)*uc;                                                       \
1269         len = 1;                                                              \
1270     }                                                                         \
1271 } STMT_END
1272
1273
1274
1275 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1276     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1277         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1278         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1279     }                                                           \
1280     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1281     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1282     TRIE_LIST_CUR( state )++;                                   \
1283 } STMT_END
1284
1285 #define TRIE_LIST_NEW(state) STMT_START {                       \
1286     Newxz( trie->states[ state ].trans.list,               \
1287         4, reg_trie_trans_le );                                 \
1288      TRIE_LIST_CUR( state ) = 1;                                \
1289      TRIE_LIST_LEN( state ) = 4;                                \
1290 } STMT_END
1291
1292 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1293     U16 dupe= trie->states[ state ].wordnum;                    \
1294     regnode * const noper_next = regnext( noper );              \
1295                                                                 \
1296     DEBUG_r({                                                   \
1297         /* store the word for dumping */                        \
1298         SV* tmp;                                                \
1299         if (OP(noper) != NOTHING)                               \
1300             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1301         else                                                    \
1302             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1303         av_push( trie_words, tmp );                             \
1304     });                                                         \
1305                                                                 \
1306     curword++;                                                  \
1307     trie->wordinfo[curword].prev   = 0;                         \
1308     trie->wordinfo[curword].len    = wordlen;                   \
1309     trie->wordinfo[curword].accept = state;                     \
1310                                                                 \
1311     if ( noper_next < tail ) {                                  \
1312         if (!trie->jump)                                        \
1313             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1314         trie->jump[curword] = (U16)(noper_next - convert);      \
1315         if (!jumper)                                            \
1316             jumper = noper_next;                                \
1317         if (!nextbranch)                                        \
1318             nextbranch= regnext(cur);                           \
1319     }                                                           \
1320                                                                 \
1321     if ( dupe ) {                                               \
1322         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1323         /* chain, so that when the bits of chain are later    */\
1324         /* linked together, the dups appear in the chain      */\
1325         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1326         trie->wordinfo[dupe].prev = curword;                    \
1327     } else {                                                    \
1328         /* we haven't inserted this word yet.                */ \
1329         trie->states[ state ].wordnum = curword;                \
1330     }                                                           \
1331 } STMT_END
1332
1333
1334 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1335      ( ( base + charid >=  ucharcount                                   \
1336          && base + charid < ubound                                      \
1337          && state == trie->trans[ base - ucharcount + charid ].check    \
1338          && trie->trans[ base - ucharcount + charid ].next )            \
1339            ? trie->trans[ base - ucharcount + charid ].next             \
1340            : ( state==1 ? special : 0 )                                 \
1341       )
1342
1343 #define MADE_TRIE       1
1344 #define MADE_JUMP_TRIE  2
1345 #define MADE_EXACT_TRIE 4
1346
1347 STATIC I32
1348 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1349 {
1350     dVAR;
1351     /* first pass, loop through and scan words */
1352     reg_trie_data *trie;
1353     HV *widecharmap = NULL;
1354     AV *revcharmap = newAV();
1355     regnode *cur;
1356     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1357     STRLEN len = 0;
1358     UV uvc = 0;
1359     U16 curword = 0;
1360     U32 next_alloc = 0;
1361     regnode *jumper = NULL;
1362     regnode *nextbranch = NULL;
1363     regnode *convert = NULL;
1364     U32 *prev_states; /* temp array mapping each state to previous one */
1365     /* we just use folder as a flag in utf8 */
1366     const U8 * folder = NULL;
1367
1368 #ifdef DEBUGGING
1369     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1370     AV *trie_words = NULL;
1371     /* along with revcharmap, this only used during construction but both are
1372      * useful during debugging so we store them in the struct when debugging.
1373      */
1374 #else
1375     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1376     STRLEN trie_charcount=0;
1377 #endif
1378     SV *re_trie_maxbuff;
1379     GET_RE_DEBUG_FLAGS_DECL;
1380
1381     PERL_ARGS_ASSERT_MAKE_TRIE;
1382 #ifndef DEBUGGING
1383     PERL_UNUSED_ARG(depth);
1384 #endif
1385
1386     switch (flags) {
1387         case EXACTFU: folder = PL_fold_latin1; break;
1388         case EXACTF:  folder = PL_fold; break;
1389         case EXACTFL: folder = PL_fold_locale; break;
1390     }
1391
1392     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1393     trie->refcount = 1;
1394     trie->startstate = 1;
1395     trie->wordcount = word_count;
1396     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1397     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1398     if (!(UTF && folder))
1399         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1400     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1401                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1402
1403     DEBUG_r({
1404         trie_words = newAV();
1405     });
1406
1407     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1408     if (!SvIOK(re_trie_maxbuff)) {
1409         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1410     }
1411     DEBUG_OPTIMISE_r({
1412                 PerlIO_printf( Perl_debug_log,
1413                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1414                   (int)depth * 2 + 2, "",
1415                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1416                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1417                   (int)depth);
1418     });
1419
1420    /* Find the node we are going to overwrite */
1421     if ( first == startbranch && OP( last ) != BRANCH ) {
1422         /* whole branch chain */
1423         convert = first;
1424     } else {
1425         /* branch sub-chain */
1426         convert = NEXTOPER( first );
1427     }
1428
1429     /*  -- First loop and Setup --
1430
1431        We first traverse the branches and scan each word to determine if it
1432        contains widechars, and how many unique chars there are, this is
1433        important as we have to build a table with at least as many columns as we
1434        have unique chars.
1435
1436        We use an array of integers to represent the character codes 0..255
1437        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1438        native representation of the character value as the key and IV's for the
1439        coded index.
1440
1441        *TODO* If we keep track of how many times each character is used we can
1442        remap the columns so that the table compression later on is more
1443        efficient in terms of memory by ensuring the most common value is in the
1444        middle and the least common are on the outside.  IMO this would be better
1445        than a most to least common mapping as theres a decent chance the most
1446        common letter will share a node with the least common, meaning the node
1447        will not be compressible. With a middle is most common approach the worst
1448        case is when we have the least common nodes twice.
1449
1450      */
1451
1452     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453         regnode * const noper = NEXTOPER( cur );
1454         const U8 *uc = (U8*)STRING( noper );
1455         const U8 * const e  = uc + STR_LEN( noper );
1456         STRLEN foldlen = 0;
1457         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1458         const U8 *scan = (U8*)NULL;
1459         U32 wordlen      = 0;         /* required init */
1460         STRLEN chars = 0;
1461         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1462
1463         if (OP(noper) == NOTHING) {
1464             trie->minlen= 0;
1465             continue;
1466         }
1467         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1468             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1469                                           regardless of encoding */
1470
1471         for ( ; uc < e ; uc += len ) {
1472             TRIE_CHARCOUNT(trie)++;
1473             TRIE_READ_CHAR;
1474             chars++;
1475             if ( uvc < 256 ) {
1476                 if ( !trie->charmap[ uvc ] ) {
1477                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1478                     if ( folder )
1479                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1480                     TRIE_STORE_REVCHAR;
1481                 }
1482                 if ( set_bit ) {
1483                     /* store the codepoint in the bitmap, and its folded
1484                      * equivalent. */
1485                     TRIE_BITMAP_SET(trie,uvc);
1486
1487                     /* store the folded codepoint */
1488                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1489
1490                     if ( !UTF ) {
1491                         /* store first byte of utf8 representation of
1492                            variant codepoints */
1493                         if (! UNI_IS_INVARIANT(uvc)) {
1494                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1495                         }
1496                     }
1497                     set_bit = 0; /* We've done our bit :-) */
1498                 }
1499             } else {
1500                 SV** svpp;
1501                 if ( !widecharmap )
1502                     widecharmap = newHV();
1503
1504                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1505
1506                 if ( !svpp )
1507                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1508
1509                 if ( !SvTRUE( *svpp ) ) {
1510                     sv_setiv( *svpp, ++trie->uniquecharcount );
1511                     TRIE_STORE_REVCHAR;
1512                 }
1513             }
1514         }
1515         if( cur == first ) {
1516             trie->minlen=chars;
1517             trie->maxlen=chars;
1518         } else if (chars < trie->minlen) {
1519             trie->minlen=chars;
1520         } else if (chars > trie->maxlen) {
1521             trie->maxlen=chars;
1522         }
1523
1524     } /* end first pass */
1525     DEBUG_TRIE_COMPILE_r(
1526         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1527                 (int)depth * 2 + 2,"",
1528                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1529                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1530                 (int)trie->minlen, (int)trie->maxlen )
1531     );
1532
1533     /*
1534         We now know what we are dealing with in terms of unique chars and
1535         string sizes so we can calculate how much memory a naive
1536         representation using a flat table  will take. If it's over a reasonable
1537         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1538         conservative but potentially much slower representation using an array
1539         of lists.
1540
1541         At the end we convert both representations into the same compressed
1542         form that will be used in regexec.c for matching with. The latter
1543         is a form that cannot be used to construct with but has memory
1544         properties similar to the list form and access properties similar
1545         to the table form making it both suitable for fast searches and
1546         small enough that its feasable to store for the duration of a program.
1547
1548         See the comment in the code where the compressed table is produced
1549         inplace from the flat tabe representation for an explanation of how
1550         the compression works.
1551
1552     */
1553
1554
1555     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1556     prev_states[1] = 0;
1557
1558     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1559         /*
1560             Second Pass -- Array Of Lists Representation
1561
1562             Each state will be represented by a list of charid:state records
1563             (reg_trie_trans_le) the first such element holds the CUR and LEN
1564             points of the allocated array. (See defines above).
1565
1566             We build the initial structure using the lists, and then convert
1567             it into the compressed table form which allows faster lookups
1568             (but cant be modified once converted).
1569         */
1570
1571         STRLEN transcount = 1;
1572
1573         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1574             "%*sCompiling trie using list compiler\n",
1575             (int)depth * 2 + 2, ""));
1576
1577         trie->states = (reg_trie_state *)
1578             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1579                                   sizeof(reg_trie_state) );
1580         TRIE_LIST_NEW(1);
1581         next_alloc = 2;
1582
1583         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1584
1585             regnode * const noper = NEXTOPER( cur );
1586             U8 *uc           = (U8*)STRING( noper );
1587             const U8 * const e = uc + STR_LEN( noper );
1588             U32 state        = 1;         /* required init */
1589             U16 charid       = 0;         /* sanity init */
1590             U8 *scan         = (U8*)NULL; /* sanity init */
1591             STRLEN foldlen   = 0;         /* required init */
1592             U32 wordlen      = 0;         /* required init */
1593             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1594
1595             if (OP(noper) != NOTHING) {
1596                 for ( ; uc < e ; uc += len ) {
1597
1598                     TRIE_READ_CHAR;
1599
1600                     if ( uvc < 256 ) {
1601                         charid = trie->charmap[ uvc ];
1602                     } else {
1603                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1604                         if ( !svpp ) {
1605                             charid = 0;
1606                         } else {
1607                             charid=(U16)SvIV( *svpp );
1608                         }
1609                     }
1610                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1611                     if ( charid ) {
1612
1613                         U16 check;
1614                         U32 newstate = 0;
1615
1616                         charid--;
1617                         if ( !trie->states[ state ].trans.list ) {
1618                             TRIE_LIST_NEW( state );
1619                         }
1620                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1621                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1622                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1623                                 break;
1624                             }
1625                         }
1626                         if ( ! newstate ) {
1627                             newstate = next_alloc++;
1628                             prev_states[newstate] = state;
1629                             TRIE_LIST_PUSH( state, charid, newstate );
1630                             transcount++;
1631                         }
1632                         state = newstate;
1633                     } else {
1634                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1635                     }
1636                 }
1637             }
1638             TRIE_HANDLE_WORD(state);
1639
1640         } /* end second pass */
1641
1642         /* next alloc is the NEXT state to be allocated */
1643         trie->statecount = next_alloc;
1644         trie->states = (reg_trie_state *)
1645             PerlMemShared_realloc( trie->states,
1646                                    next_alloc
1647                                    * sizeof(reg_trie_state) );
1648
1649         /* and now dump it out before we compress it */
1650         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1651                                                          revcharmap, next_alloc,
1652                                                          depth+1)
1653         );
1654
1655         trie->trans = (reg_trie_trans *)
1656             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1657         {
1658             U32 state;
1659             U32 tp = 0;
1660             U32 zp = 0;
1661
1662
1663             for( state=1 ; state < next_alloc ; state ++ ) {
1664                 U32 base=0;
1665
1666                 /*
1667                 DEBUG_TRIE_COMPILE_MORE_r(
1668                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1669                 );
1670                 */
1671
1672                 if (trie->states[state].trans.list) {
1673                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1674                     U16 maxid=minid;
1675                     U16 idx;
1676
1677                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1678                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1679                         if ( forid < minid ) {
1680                             minid=forid;
1681                         } else if ( forid > maxid ) {
1682                             maxid=forid;
1683                         }
1684                     }
1685                     if ( transcount < tp + maxid - minid + 1) {
1686                         transcount *= 2;
1687                         trie->trans = (reg_trie_trans *)
1688                             PerlMemShared_realloc( trie->trans,
1689                                                      transcount
1690                                                      * sizeof(reg_trie_trans) );
1691                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1692                     }
1693                     base = trie->uniquecharcount + tp - minid;
1694                     if ( maxid == minid ) {
1695                         U32 set = 0;
1696                         for ( ; zp < tp ; zp++ ) {
1697                             if ( ! trie->trans[ zp ].next ) {
1698                                 base = trie->uniquecharcount + zp - minid;
1699                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1700                                 trie->trans[ zp ].check = state;
1701                                 set = 1;
1702                                 break;
1703                             }
1704                         }
1705                         if ( !set ) {
1706                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1707                             trie->trans[ tp ].check = state;
1708                             tp++;
1709                             zp = tp;
1710                         }
1711                     } else {
1712                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1713                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1714                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1715                             trie->trans[ tid ].check = state;
1716                         }
1717                         tp += ( maxid - minid + 1 );
1718                     }
1719                     Safefree(trie->states[ state ].trans.list);
1720                 }
1721                 /*
1722                 DEBUG_TRIE_COMPILE_MORE_r(
1723                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1724                 );
1725                 */
1726                 trie->states[ state ].trans.base=base;
1727             }
1728             trie->lasttrans = tp + 1;
1729         }
1730     } else {
1731         /*
1732            Second Pass -- Flat Table Representation.
1733
1734            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1735            We know that we will need Charcount+1 trans at most to store the data
1736            (one row per char at worst case) So we preallocate both structures
1737            assuming worst case.
1738
1739            We then construct the trie using only the .next slots of the entry
1740            structs.
1741
1742            We use the .check field of the first entry of the node temporarily to
1743            make compression both faster and easier by keeping track of how many non
1744            zero fields are in the node.
1745
1746            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1747            transition.
1748
1749            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1750            number representing the first entry of the node, and state as a
1751            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1752            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1753            are 2 entrys per node. eg:
1754
1755              A B       A B
1756           1. 2 4    1. 3 7
1757           2. 0 3    3. 0 5
1758           3. 0 0    5. 0 0
1759           4. 0 0    7. 0 0
1760
1761            The table is internally in the right hand, idx form. However as we also
1762            have to deal with the states array which is indexed by nodenum we have to
1763            use TRIE_NODENUM() to convert.
1764
1765         */
1766         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1767             "%*sCompiling trie using table compiler\n",
1768             (int)depth * 2 + 2, ""));
1769
1770         trie->trans = (reg_trie_trans *)
1771             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1772                                   * trie->uniquecharcount + 1,
1773                                   sizeof(reg_trie_trans) );
1774         trie->states = (reg_trie_state *)
1775             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776                                   sizeof(reg_trie_state) );
1777         next_alloc = trie->uniquecharcount + 1;
1778
1779
1780         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1781
1782             regnode * const noper   = NEXTOPER( cur );
1783             const U8 *uc     = (U8*)STRING( noper );
1784             const U8 * const e = uc + STR_LEN( noper );
1785
1786             U32 state        = 1;         /* required init */
1787
1788             U16 charid       = 0;         /* sanity init */
1789             U32 accept_state = 0;         /* sanity init */
1790             U8 *scan         = (U8*)NULL; /* sanity init */
1791
1792             STRLEN foldlen   = 0;         /* required init */
1793             U32 wordlen      = 0;         /* required init */
1794             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1795
1796             if ( OP(noper) != NOTHING ) {
1797                 for ( ; uc < e ; uc += len ) {
1798
1799                     TRIE_READ_CHAR;
1800
1801                     if ( uvc < 256 ) {
1802                         charid = trie->charmap[ uvc ];
1803                     } else {
1804                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1805                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1806                     }
1807                     if ( charid ) {
1808                         charid--;
1809                         if ( !trie->trans[ state + charid ].next ) {
1810                             trie->trans[ state + charid ].next = next_alloc;
1811                             trie->trans[ state ].check++;
1812                             prev_states[TRIE_NODENUM(next_alloc)]
1813                                     = TRIE_NODENUM(state);
1814                             next_alloc += trie->uniquecharcount;
1815                         }
1816                         state = trie->trans[ state + charid ].next;
1817                     } else {
1818                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1819                     }
1820                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1821                 }
1822             }
1823             accept_state = TRIE_NODENUM( state );
1824             TRIE_HANDLE_WORD(accept_state);
1825
1826         } /* end second pass */
1827
1828         /* and now dump it out before we compress it */
1829         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1830                                                           revcharmap,
1831                                                           next_alloc, depth+1));
1832
1833         {
1834         /*
1835            * Inplace compress the table.*
1836
1837            For sparse data sets the table constructed by the trie algorithm will
1838            be mostly 0/FAIL transitions or to put it another way mostly empty.
1839            (Note that leaf nodes will not contain any transitions.)
1840
1841            This algorithm compresses the tables by eliminating most such
1842            transitions, at the cost of a modest bit of extra work during lookup:
1843
1844            - Each states[] entry contains a .base field which indicates the
1845            index in the state[] array wheres its transition data is stored.
1846
1847            - If .base is 0 there are no valid transitions from that node.
1848
1849            - If .base is nonzero then charid is added to it to find an entry in
1850            the trans array.
1851
1852            -If trans[states[state].base+charid].check!=state then the
1853            transition is taken to be a 0/Fail transition. Thus if there are fail
1854            transitions at the front of the node then the .base offset will point
1855            somewhere inside the previous nodes data (or maybe even into a node
1856            even earlier), but the .check field determines if the transition is
1857            valid.
1858
1859            XXX - wrong maybe?
1860            The following process inplace converts the table to the compressed
1861            table: We first do not compress the root node 1,and mark all its
1862            .check pointers as 1 and set its .base pointer as 1 as well. This
1863            allows us to do a DFA construction from the compressed table later,
1864            and ensures that any .base pointers we calculate later are greater
1865            than 0.
1866
1867            - We set 'pos' to indicate the first entry of the second node.
1868
1869            - We then iterate over the columns of the node, finding the first and
1870            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1871            and set the .check pointers accordingly, and advance pos
1872            appropriately and repreat for the next node. Note that when we copy
1873            the next pointers we have to convert them from the original
1874            NODEIDX form to NODENUM form as the former is not valid post
1875            compression.
1876
1877            - If a node has no transitions used we mark its base as 0 and do not
1878            advance the pos pointer.
1879
1880            - If a node only has one transition we use a second pointer into the
1881            structure to fill in allocated fail transitions from other states.
1882            This pointer is independent of the main pointer and scans forward
1883            looking for null transitions that are allocated to a state. When it
1884            finds one it writes the single transition into the "hole".  If the
1885            pointer doesnt find one the single transition is appended as normal.
1886
1887            - Once compressed we can Renew/realloc the structures to release the
1888            excess space.
1889
1890            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1891            specifically Fig 3.47 and the associated pseudocode.
1892
1893            demq
1894         */
1895         const U32 laststate = TRIE_NODENUM( next_alloc );
1896         U32 state, charid;
1897         U32 pos = 0, zp=0;
1898         trie->statecount = laststate;
1899
1900         for ( state = 1 ; state < laststate ; state++ ) {
1901             U8 flag = 0;
1902             const U32 stateidx = TRIE_NODEIDX( state );
1903             const U32 o_used = trie->trans[ stateidx ].check;
1904             U32 used = trie->trans[ stateidx ].check;
1905             trie->trans[ stateidx ].check = 0;
1906
1907             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1908                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1909                     if ( trie->trans[ stateidx + charid ].next ) {
1910                         if (o_used == 1) {
1911                             for ( ; zp < pos ; zp++ ) {
1912                                 if ( ! trie->trans[ zp ].next ) {
1913                                     break;
1914                                 }
1915                             }
1916                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1917                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1918                             trie->trans[ zp ].check = state;
1919                             if ( ++zp > pos ) pos = zp;
1920                             break;
1921                         }
1922                         used--;
1923                     }
1924                     if ( !flag ) {
1925                         flag = 1;
1926                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1927                     }
1928                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1929                     trie->trans[ pos ].check = state;
1930                     pos++;
1931                 }
1932             }
1933         }
1934         trie->lasttrans = pos + 1;
1935         trie->states = (reg_trie_state *)
1936             PerlMemShared_realloc( trie->states, laststate
1937                                    * sizeof(reg_trie_state) );
1938         DEBUG_TRIE_COMPILE_MORE_r(
1939                 PerlIO_printf( Perl_debug_log,
1940                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1941                     (int)depth * 2 + 2,"",
1942                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1943                     (IV)next_alloc,
1944                     (IV)pos,
1945                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1946             );
1947
1948         } /* end table compress */
1949     }
1950     DEBUG_TRIE_COMPILE_MORE_r(
1951             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1952                 (int)depth * 2 + 2, "",
1953                 (UV)trie->statecount,
1954                 (UV)trie->lasttrans)
1955     );
1956     /* resize the trans array to remove unused space */
1957     trie->trans = (reg_trie_trans *)
1958         PerlMemShared_realloc( trie->trans, trie->lasttrans
1959                                * sizeof(reg_trie_trans) );
1960
1961     {   /* Modify the program and insert the new TRIE node */
1962         U8 nodetype =(U8)(flags & 0xFF);
1963         char *str=NULL;
1964
1965 #ifdef DEBUGGING
1966         regnode *optimize = NULL;
1967 #ifdef RE_TRACK_PATTERN_OFFSETS
1968
1969         U32 mjd_offset = 0;
1970         U32 mjd_nodelen = 0;
1971 #endif /* RE_TRACK_PATTERN_OFFSETS */
1972 #endif /* DEBUGGING */
1973         /*
1974            This means we convert either the first branch or the first Exact,
1975            depending on whether the thing following (in 'last') is a branch
1976            or not and whther first is the startbranch (ie is it a sub part of
1977            the alternation or is it the whole thing.)
1978            Assuming its a sub part we convert the EXACT otherwise we convert
1979            the whole branch sequence, including the first.
1980          */
1981         /* Find the node we are going to overwrite */
1982         if ( first != startbranch || OP( last ) == BRANCH ) {
1983             /* branch sub-chain */
1984             NEXT_OFF( first ) = (U16)(last - first);
1985 #ifdef RE_TRACK_PATTERN_OFFSETS
1986             DEBUG_r({
1987                 mjd_offset= Node_Offset((convert));
1988                 mjd_nodelen= Node_Length((convert));
1989             });
1990 #endif
1991             /* whole branch chain */
1992         }
1993 #ifdef RE_TRACK_PATTERN_OFFSETS
1994         else {
1995             DEBUG_r({
1996                 const  regnode *nop = NEXTOPER( convert );
1997                 mjd_offset= Node_Offset((nop));
1998                 mjd_nodelen= Node_Length((nop));
1999             });
2000         }
2001         DEBUG_OPTIMISE_r(
2002             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2003                 (int)depth * 2 + 2, "",
2004                 (UV)mjd_offset, (UV)mjd_nodelen)
2005         );
2006 #endif
2007         /* But first we check to see if there is a common prefix we can
2008            split out as an EXACT and put in front of the TRIE node.  */
2009         trie->startstate= 1;
2010         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2011             U32 state;
2012             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2013                 U32 ofs = 0;
2014                 I32 idx = -1;
2015                 U32 count = 0;
2016                 const U32 base = trie->states[ state ].trans.base;
2017
2018                 if ( trie->states[state].wordnum )
2019                         count = 1;
2020
2021                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2022                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2023                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2024                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2025                     {
2026                         if ( ++count > 1 ) {
2027                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2028                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2029                             if ( state == 1 ) break;
2030                             if ( count == 2 ) {
2031                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2032                                 DEBUG_OPTIMISE_r(
2033                                     PerlIO_printf(Perl_debug_log,
2034                                         "%*sNew Start State=%"UVuf" Class: [",
2035                                         (int)depth * 2 + 2, "",
2036                                         (UV)state));
2037                                 if (idx >= 0) {
2038                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2039                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2040
2041                                     TRIE_BITMAP_SET(trie,*ch);
2042                                     if ( folder )
2043                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2044                                     DEBUG_OPTIMISE_r(
2045                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2046                                     );
2047                                 }
2048                             }
2049                             TRIE_BITMAP_SET(trie,*ch);
2050                             if ( folder )
2051                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2052                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2053                         }
2054                         idx = ofs;
2055                     }
2056                 }
2057                 if ( count == 1 ) {
2058                     SV **tmp = av_fetch( revcharmap, idx, 0);
2059                     STRLEN len;
2060                     char *ch = SvPV( *tmp, len );
2061                     DEBUG_OPTIMISE_r({
2062                         SV *sv=sv_newmortal();
2063                         PerlIO_printf( Perl_debug_log,
2064                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2065                             (int)depth * 2 + 2, "",
2066                             (UV)state, (UV)idx,
2067                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2068                                 PL_colors[0], PL_colors[1],
2069                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2070                                 PERL_PV_ESCAPE_FIRSTCHAR
2071                             )
2072                         );
2073                     });
2074                     if ( state==1 ) {
2075                         OP( convert ) = nodetype;
2076                         str=STRING(convert);
2077                         STR_LEN(convert)=0;
2078                     }
2079                     STR_LEN(convert) += len;
2080                     while (len--)
2081                         *str++ = *ch++;
2082                 } else {
2083 #ifdef DEBUGGING
2084                     if (state>1)
2085                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2086 #endif
2087                     break;
2088                 }
2089             }
2090             trie->prefixlen = (state-1);
2091             if (str) {
2092                 regnode *n = convert+NODE_SZ_STR(convert);
2093                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2094                 trie->startstate = state;
2095                 trie->minlen -= (state - 1);
2096                 trie->maxlen -= (state - 1);
2097 #ifdef DEBUGGING
2098                /* At least the UNICOS C compiler choked on this
2099                 * being argument to DEBUG_r(), so let's just have
2100                 * it right here. */
2101                if (
2102 #ifdef PERL_EXT_RE_BUILD
2103                    1
2104 #else
2105                    DEBUG_r_TEST
2106 #endif
2107                    ) {
2108                    regnode *fix = convert;
2109                    U32 word = trie->wordcount;
2110                    mjd_nodelen++;
2111                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2112                    while( ++fix < n ) {
2113                        Set_Node_Offset_Length(fix, 0, 0);
2114                    }
2115                    while (word--) {
2116                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2117                        if (tmp) {
2118                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2119                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2120                            else
2121                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2122                        }
2123                    }
2124                }
2125 #endif
2126                 if (trie->maxlen) {
2127                     convert = n;
2128                 } else {
2129                     NEXT_OFF(convert) = (U16)(tail - convert);
2130                     DEBUG_r(optimize= n);
2131                 }
2132             }
2133         }
2134         if (!jumper)
2135             jumper = last;
2136         if ( trie->maxlen ) {
2137             NEXT_OFF( convert ) = (U16)(tail - convert);
2138             ARG_SET( convert, data_slot );
2139             /* Store the offset to the first unabsorbed branch in
2140                jump[0], which is otherwise unused by the jump logic.
2141                We use this when dumping a trie and during optimisation. */
2142             if (trie->jump)
2143                 trie->jump[0] = (U16)(nextbranch - convert);
2144
2145             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2146              *   and there is a bitmap
2147              *   and the first "jump target" node we found leaves enough room
2148              * then convert the TRIE node into a TRIEC node, with the bitmap
2149              * embedded inline in the opcode - this is hypothetically faster.
2150              */
2151             if ( !trie->states[trie->startstate].wordnum
2152                  && trie->bitmap
2153                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2154             {
2155                 OP( convert ) = TRIEC;
2156                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2157                 PerlMemShared_free(trie->bitmap);
2158                 trie->bitmap= NULL;
2159             } else
2160                 OP( convert ) = TRIE;
2161
2162             /* store the type in the flags */
2163             convert->flags = nodetype;
2164             DEBUG_r({
2165             optimize = convert
2166                       + NODE_STEP_REGNODE
2167                       + regarglen[ OP( convert ) ];
2168             });
2169             /* XXX We really should free up the resource in trie now,
2170                    as we won't use them - (which resources?) dmq */
2171         }
2172         /* needed for dumping*/
2173         DEBUG_r(if (optimize) {
2174             regnode *opt = convert;
2175
2176             while ( ++opt < optimize) {
2177                 Set_Node_Offset_Length(opt,0,0);
2178             }
2179             /*
2180                 Try to clean up some of the debris left after the
2181                 optimisation.
2182              */
2183             while( optimize < jumper ) {
2184                 mjd_nodelen += Node_Length((optimize));
2185                 OP( optimize ) = OPTIMIZED;
2186                 Set_Node_Offset_Length(optimize,0,0);
2187                 optimize++;
2188             }
2189             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2190         });
2191     } /* end node insert */
2192
2193     /*  Finish populating the prev field of the wordinfo array.  Walk back
2194      *  from each accept state until we find another accept state, and if
2195      *  so, point the first word's .prev field at the second word. If the
2196      *  second already has a .prev field set, stop now. This will be the
2197      *  case either if we've already processed that word's accept state,
2198      *  or that state had multiple words, and the overspill words were
2199      *  already linked up earlier.
2200      */
2201     {
2202         U16 word;
2203         U32 state;
2204         U16 prev;
2205
2206         for (word=1; word <= trie->wordcount; word++) {
2207             prev = 0;
2208             if (trie->wordinfo[word].prev)
2209                 continue;
2210             state = trie->wordinfo[word].accept;
2211             while (state) {
2212                 state = prev_states[state];
2213                 if (!state)
2214                     break;
2215                 prev = trie->states[state].wordnum;
2216                 if (prev)
2217                     break;
2218             }
2219             trie->wordinfo[word].prev = prev;
2220         }
2221         Safefree(prev_states);
2222     }
2223
2224
2225     /* and now dump out the compressed format */
2226     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2227
2228     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2229 #ifdef DEBUGGING
2230     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2231     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2232 #else
2233     SvREFCNT_dec(revcharmap);
2234 #endif
2235     return trie->jump
2236            ? MADE_JUMP_TRIE
2237            : trie->startstate>1
2238              ? MADE_EXACT_TRIE
2239              : MADE_TRIE;
2240 }
2241
2242 STATIC void
2243 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2244 {
2245 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2246
2247    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2248    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2249    ISBN 0-201-10088-6
2250
2251    We find the fail state for each state in the trie, this state is the longest proper
2252    suffix of the current state's 'word' that is also a proper prefix of another word in our
2253    trie. State 1 represents the word '' and is thus the default fail state. This allows
2254    the DFA not to have to restart after its tried and failed a word at a given point, it
2255    simply continues as though it had been matching the other word in the first place.
2256    Consider
2257       'abcdgu'=~/abcdefg|cdgu/
2258    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2259    fail, which would bring us to the state representing 'd' in the second word where we would
2260    try 'g' and succeed, proceeding to match 'cdgu'.
2261  */
2262  /* add a fail transition */
2263     const U32 trie_offset = ARG(source);
2264     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2265     U32 *q;
2266     const U32 ucharcount = trie->uniquecharcount;
2267     const U32 numstates = trie->statecount;
2268     const U32 ubound = trie->lasttrans + ucharcount;
2269     U32 q_read = 0;
2270     U32 q_write = 0;
2271     U32 charid;
2272     U32 base = trie->states[ 1 ].trans.base;
2273     U32 *fail;
2274     reg_ac_data *aho;
2275     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2276     GET_RE_DEBUG_FLAGS_DECL;
2277
2278     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2279 #ifndef DEBUGGING
2280     PERL_UNUSED_ARG(depth);
2281 #endif
2282
2283
2284     ARG_SET( stclass, data_slot );
2285     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2286     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2287     aho->trie=trie_offset;
2288     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2289     Copy( trie->states, aho->states, numstates, reg_trie_state );
2290     Newxz( q, numstates, U32);
2291     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2292     aho->refcount = 1;
2293     fail = aho->fail;
2294     /* initialize fail[0..1] to be 1 so that we always have
2295        a valid final fail state */
2296     fail[ 0 ] = fail[ 1 ] = 1;
2297
2298     for ( charid = 0; charid < ucharcount ; charid++ ) {
2299         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2300         if ( newstate ) {
2301             q[ q_write ] = newstate;
2302             /* set to point at the root */
2303             fail[ q[ q_write++ ] ]=1;
2304         }
2305     }
2306     while ( q_read < q_write) {
2307         const U32 cur = q[ q_read++ % numstates ];
2308         base = trie->states[ cur ].trans.base;
2309
2310         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2311             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2312             if (ch_state) {
2313                 U32 fail_state = cur;
2314                 U32 fail_base;
2315                 do {
2316                     fail_state = fail[ fail_state ];
2317                     fail_base = aho->states[ fail_state ].trans.base;
2318                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2319
2320                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2321                 fail[ ch_state ] = fail_state;
2322                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2323                 {
2324                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2325                 }
2326                 q[ q_write++ % numstates] = ch_state;
2327             }
2328         }
2329     }
2330     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2331        when we fail in state 1, this allows us to use the
2332        charclass scan to find a valid start char. This is based on the principle
2333        that theres a good chance the string being searched contains lots of stuff
2334        that cant be a start char.
2335      */
2336     fail[ 0 ] = fail[ 1 ] = 0;
2337     DEBUG_TRIE_COMPILE_r({
2338         PerlIO_printf(Perl_debug_log,
2339                       "%*sStclass Failtable (%"UVuf" states): 0",
2340                       (int)(depth * 2), "", (UV)numstates
2341         );
2342         for( q_read=1; q_read<numstates; q_read++ ) {
2343             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2344         }
2345         PerlIO_printf(Perl_debug_log, "\n");
2346     });
2347     Safefree(q);
2348     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2349 }
2350
2351
2352 /*
2353  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2354  * These need to be revisited when a newer toolchain becomes available.
2355  */
2356 #if defined(__sparc64__) && defined(__GNUC__)
2357 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2358 #       undef  SPARC64_GCC_WORKAROUND
2359 #       define SPARC64_GCC_WORKAROUND 1
2360 #   endif
2361 #endif
2362
2363 #define DEBUG_PEEP(str,scan,depth) \
2364     DEBUG_OPTIMISE_r({if (scan){ \
2365        SV * const mysv=sv_newmortal(); \
2366        regnode *Next = regnext(scan); \
2367        regprop(RExC_rx, mysv, scan); \
2368        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2369        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2370        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2371    }});
2372
2373
2374
2375
2376
2377 #define JOIN_EXACT(scan,min,flags) \
2378     if (PL_regkind[OP(scan)] == EXACT) \
2379         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2380
2381 STATIC U32
2382 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2383     /* Merge several consecutive EXACTish nodes into one. */
2384     regnode *n = regnext(scan);
2385     U32 stringok = 1;
2386     regnode *next = scan + NODE_SZ_STR(scan);
2387     U32 merged = 0;
2388     U32 stopnow = 0;
2389 #ifdef DEBUGGING
2390     regnode *stop = scan;
2391     GET_RE_DEBUG_FLAGS_DECL;
2392 #else
2393     PERL_UNUSED_ARG(depth);
2394 #endif
2395
2396     PERL_ARGS_ASSERT_JOIN_EXACT;
2397 #ifndef EXPERIMENTAL_INPLACESCAN
2398     PERL_UNUSED_ARG(flags);
2399     PERL_UNUSED_ARG(val);
2400 #endif
2401     DEBUG_PEEP("join",scan,depth);
2402
2403     /* Skip NOTHING, merge EXACT*. */
2404     while (n &&
2405            ( PL_regkind[OP(n)] == NOTHING ||
2406              (stringok && (OP(n) == OP(scan))))
2407            && NEXT_OFF(n)
2408            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2409
2410         if (OP(n) == TAIL || n > next)
2411             stringok = 0;
2412         if (PL_regkind[OP(n)] == NOTHING) {
2413             DEBUG_PEEP("skip:",n,depth);
2414             NEXT_OFF(scan) += NEXT_OFF(n);
2415             next = n + NODE_STEP_REGNODE;
2416 #ifdef DEBUGGING
2417             if (stringok)
2418                 stop = n;
2419 #endif
2420             n = regnext(n);
2421         }
2422         else if (stringok) {
2423             const unsigned int oldl = STR_LEN(scan);
2424             regnode * const nnext = regnext(n);
2425
2426             DEBUG_PEEP("merg",n,depth);
2427
2428             merged++;
2429             if (oldl + STR_LEN(n) > U8_MAX)
2430                 break;
2431             NEXT_OFF(scan) += NEXT_OFF(n);
2432             STR_LEN(scan) += STR_LEN(n);
2433             next = n + NODE_SZ_STR(n);
2434             /* Now we can overwrite *n : */
2435             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2436 #ifdef DEBUGGING
2437             stop = next - 1;
2438 #endif
2439             n = nnext;
2440             if (stopnow) break;
2441         }
2442
2443 #ifdef EXPERIMENTAL_INPLACESCAN
2444         if (flags && !NEXT_OFF(n)) {
2445             DEBUG_PEEP("atch", val, depth);
2446             if (reg_off_by_arg[OP(n)]) {
2447                 ARG_SET(n, val - n);
2448             }
2449             else {
2450                 NEXT_OFF(n) = val - n;
2451             }
2452             stopnow = 1;
2453         }
2454 #endif
2455     }
2456 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2457 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2458 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2459 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2460
2461     if (UTF
2462         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2463         && ( STR_LEN(scan) >= 6 ) )
2464     {
2465     /*
2466     Two problematic code points in Unicode casefolding of EXACT nodes:
2467
2468     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2469     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2470
2471     which casefold to
2472
2473     Unicode                      UTF-8
2474
2475     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2476     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2477
2478     This means that in case-insensitive matching (or "loose matching",
2479     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2480     length of the above casefolded versions) can match a target string
2481     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2482     This would rather mess up the minimum length computation.
2483
2484     What we'll do is to look for the tail four bytes, and then peek
2485     at the preceding two bytes to see whether we need to decrease
2486     the minimum length by four (six minus two).
2487
2488     Thanks to the design of UTF-8, there cannot be false matches:
2489     A sequence of valid UTF-8 bytes cannot be a subsequence of
2490     another valid sequence of UTF-8 bytes.
2491
2492     */
2493          char * const s0 = STRING(scan), *s, *t;
2494          char * const s1 = s0 + STR_LEN(scan) - 1;
2495          char * const s2 = s1 - 4;
2496 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2497          const char t0[] = "\xaf\x49\xaf\x42";
2498 #else
2499          const char t0[] = "\xcc\x88\xcc\x81";
2500 #endif
2501          const char * const t1 = t0 + 3;
2502
2503          for (s = s0 + 2;
2504               s < s2 && (t = ninstr(s, s1, t0, t1));
2505               s = t + 4) {
2506 #ifdef EBCDIC
2507               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2508                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2509 #else
2510               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2511                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2512 #endif
2513                    *min -= 4;
2514          }
2515     }
2516
2517 #ifdef DEBUGGING
2518     /* Allow dumping */
2519     n = scan + NODE_SZ_STR(scan);
2520     while (n <= stop) {
2521         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2522             OP(n) = OPTIMIZED;
2523             NEXT_OFF(n) = 0;
2524         }
2525         n++;
2526     }
2527 #endif
2528     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2529     return stopnow;
2530 }
2531
2532 /* REx optimizer.  Converts nodes into quicker variants "in place".
2533    Finds fixed substrings.  */
2534
2535 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2536    to the position after last scanned or to NULL. */
2537
2538 #define INIT_AND_WITHP \
2539     assert(!and_withp); \
2540     Newx(and_withp,1,struct regnode_charclass_class); \
2541     SAVEFREEPV(and_withp)
2542
2543 /* this is a chain of data about sub patterns we are processing that
2544    need to be handled separately/specially in study_chunk. Its so
2545    we can simulate recursion without losing state.  */
2546 struct scan_frame;
2547 typedef struct scan_frame {
2548     regnode *last;  /* last node to process in this frame */
2549     regnode *next;  /* next node to process when last is reached */
2550     struct scan_frame *prev; /*previous frame*/
2551     I32 stop; /* what stopparen do we use */
2552 } scan_frame;
2553
2554
2555 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2556
2557 #define CASE_SYNST_FNC(nAmE)                                       \
2558 case nAmE:                                                         \
2559     if (flags & SCF_DO_STCLASS_AND) {                              \
2560             for (value = 0; value < 256; value++)                  \
2561                 if (!is_ ## nAmE ## _cp(value))                       \
2562                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2563     }                                                              \
2564     else {                                                         \
2565             for (value = 0; value < 256; value++)                  \
2566                 if (is_ ## nAmE ## _cp(value))                        \
2567                     ANYOF_BITMAP_SET(data->start_class, value);    \
2568     }                                                              \
2569     break;                                                         \
2570 case N ## nAmE:                                                    \
2571     if (flags & SCF_DO_STCLASS_AND) {                              \
2572             for (value = 0; value < 256; value++)                   \
2573                 if (is_ ## nAmE ## _cp(value))                         \
2574                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2575     }                                                               \
2576     else {                                                          \
2577             for (value = 0; value < 256; value++)                   \
2578                 if (!is_ ## nAmE ## _cp(value))                        \
2579                     ANYOF_BITMAP_SET(data->start_class, value);     \
2580     }                                                               \
2581     break
2582
2583
2584
2585 STATIC I32
2586 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2587                         I32 *minlenp, I32 *deltap,
2588                         regnode *last,
2589                         scan_data_t *data,
2590                         I32 stopparen,
2591                         U8* recursed,
2592                         struct regnode_charclass_class *and_withp,
2593                         U32 flags, U32 depth)
2594                         /* scanp: Start here (read-write). */
2595                         /* deltap: Write maxlen-minlen here. */
2596                         /* last: Stop before this one. */
2597                         /* data: string data about the pattern */
2598                         /* stopparen: treat close N as END */
2599                         /* recursed: which subroutines have we recursed into */
2600                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2601 {
2602     dVAR;
2603     I32 min = 0, pars = 0, code;
2604     regnode *scan = *scanp, *next;
2605     I32 delta = 0;
2606     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2607     int is_inf_internal = 0;            /* The studied chunk is infinite */
2608     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2609     scan_data_t data_fake;
2610     SV *re_trie_maxbuff = NULL;
2611     regnode *first_non_open = scan;
2612     I32 stopmin = I32_MAX;
2613     scan_frame *frame = NULL;
2614     GET_RE_DEBUG_FLAGS_DECL;
2615
2616     PERL_ARGS_ASSERT_STUDY_CHUNK;
2617
2618 #ifdef DEBUGGING
2619     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2620 #endif
2621
2622     if ( depth == 0 ) {
2623         while (first_non_open && OP(first_non_open) == OPEN)
2624             first_non_open=regnext(first_non_open);
2625     }
2626
2627
2628   fake_study_recurse:
2629     while ( scan && OP(scan) != END && scan < last ){
2630         /* Peephole optimizer: */
2631         DEBUG_STUDYDATA("Peep:", data,depth);
2632         DEBUG_PEEP("Peep",scan,depth);
2633         JOIN_EXACT(scan,&min,0);
2634
2635         /* Follow the next-chain of the current node and optimize
2636            away all the NOTHINGs from it.  */
2637         if (OP(scan) != CURLYX) {
2638             const int max = (reg_off_by_arg[OP(scan)]
2639                        ? I32_MAX
2640                        /* I32 may be smaller than U16 on CRAYs! */
2641                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2642             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2643             int noff;
2644             regnode *n = scan;
2645
2646             /* Skip NOTHING and LONGJMP. */
2647             while ((n = regnext(n))
2648                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2649                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2650                    && off + noff < max)
2651                 off += noff;
2652             if (reg_off_by_arg[OP(scan)])
2653                 ARG(scan) = off;
2654             else
2655                 NEXT_OFF(scan) = off;
2656         }
2657
2658
2659
2660         /* The principal pseudo-switch.  Cannot be a switch, since we
2661            look into several different things.  */
2662         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2663                    || OP(scan) == IFTHEN) {
2664             next = regnext(scan);
2665             code = OP(scan);
2666             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2667
2668             if (OP(next) == code || code == IFTHEN) {
2669                 /* NOTE - There is similar code to this block below for handling
2670                    TRIE nodes on a re-study.  If you change stuff here check there
2671                    too. */
2672                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2673                 struct regnode_charclass_class accum;
2674                 regnode * const startbranch=scan;
2675
2676                 if (flags & SCF_DO_SUBSTR)
2677                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2678                 if (flags & SCF_DO_STCLASS)
2679                     cl_init_zero(pRExC_state, &accum);
2680
2681                 while (OP(scan) == code) {
2682                     I32 deltanext, minnext, f = 0, fake;
2683                     struct regnode_charclass_class this_class;
2684
2685                     num++;
2686                     data_fake.flags = 0;
2687                     if (data) {
2688                         data_fake.whilem_c = data->whilem_c;
2689                         data_fake.last_closep = data->last_closep;
2690                     }
2691                     else
2692                         data_fake.last_closep = &fake;
2693
2694                     data_fake.pos_delta = delta;
2695                     next = regnext(scan);
2696                     scan = NEXTOPER(scan);
2697                     if (code != BRANCH)
2698                         scan = NEXTOPER(scan);
2699                     if (flags & SCF_DO_STCLASS) {
2700                         cl_init(pRExC_state, &this_class);
2701                         data_fake.start_class = &this_class;
2702                         f = SCF_DO_STCLASS_AND;
2703                     }
2704                     if (flags & SCF_WHILEM_VISITED_POS)
2705                         f |= SCF_WHILEM_VISITED_POS;
2706
2707                     /* we suppose the run is continuous, last=next...*/
2708                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2709                                           next, &data_fake,
2710                                           stopparen, recursed, NULL, f,depth+1);
2711                     if (min1 > minnext)
2712                         min1 = minnext;
2713                     if (max1 < minnext + deltanext)
2714                         max1 = minnext + deltanext;
2715                     if (deltanext == I32_MAX)
2716                         is_inf = is_inf_internal = 1;
2717                     scan = next;
2718                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2719                         pars++;
2720                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2721                         if ( stopmin > minnext)
2722                             stopmin = min + min1;
2723                         flags &= ~SCF_DO_SUBSTR;
2724                         if (data)
2725                             data->flags |= SCF_SEEN_ACCEPT;
2726                     }
2727                     if (data) {
2728                         if (data_fake.flags & SF_HAS_EVAL)
2729                             data->flags |= SF_HAS_EVAL;
2730                         data->whilem_c = data_fake.whilem_c;
2731                     }
2732                     if (flags & SCF_DO_STCLASS)
2733                         cl_or(pRExC_state, &accum, &this_class);
2734                 }
2735                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2736                     min1 = 0;
2737                 if (flags & SCF_DO_SUBSTR) {
2738                     data->pos_min += min1;
2739                     data->pos_delta += max1 - min1;
2740                     if (max1 != min1 || is_inf)
2741                         data->longest = &(data->longest_float);
2742                 }
2743                 min += min1;
2744                 delta += max1 - min1;
2745                 if (flags & SCF_DO_STCLASS_OR) {
2746                     cl_or(pRExC_state, data->start_class, &accum);
2747                     if (min1) {
2748                         cl_and(data->start_class, and_withp);
2749                         flags &= ~SCF_DO_STCLASS;
2750                     }
2751                 }
2752                 else if (flags & SCF_DO_STCLASS_AND) {
2753                     if (min1) {
2754                         cl_and(data->start_class, &accum);
2755                         flags &= ~SCF_DO_STCLASS;
2756                     }
2757                     else {
2758                         /* Switch to OR mode: cache the old value of
2759                          * data->start_class */
2760                         INIT_AND_WITHP;
2761                         StructCopy(data->start_class, and_withp,
2762                                    struct regnode_charclass_class);
2763                         flags &= ~SCF_DO_STCLASS_AND;
2764                         StructCopy(&accum, data->start_class,
2765                                    struct regnode_charclass_class);
2766                         flags |= SCF_DO_STCLASS_OR;
2767                         data->start_class->flags |= ANYOF_EOS;
2768                     }
2769                 }
2770
2771                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2772                 /* demq.
2773
2774                    Assuming this was/is a branch we are dealing with: 'scan' now
2775                    points at the item that follows the branch sequence, whatever
2776                    it is. We now start at the beginning of the sequence and look
2777                    for subsequences of
2778
2779                    BRANCH->EXACT=>x1
2780                    BRANCH->EXACT=>x2
2781                    tail
2782
2783                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2784
2785                    If we can find such a subsequence we need to turn the first
2786                    element into a trie and then add the subsequent branch exact
2787                    strings to the trie.
2788
2789                    We have two cases
2790
2791                      1. patterns where the whole set of branches can be converted.
2792
2793                      2. patterns where only a subset can be converted.
2794
2795                    In case 1 we can replace the whole set with a single regop
2796                    for the trie. In case 2 we need to keep the start and end
2797                    branches so
2798
2799                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2800                      becomes BRANCH TRIE; BRANCH X;
2801
2802                   There is an additional case, that being where there is a
2803                   common prefix, which gets split out into an EXACT like node
2804                   preceding the TRIE node.
2805
2806                   If x(1..n)==tail then we can do a simple trie, if not we make
2807                   a "jump" trie, such that when we match the appropriate word
2808                   we "jump" to the appropriate tail node. Essentially we turn
2809                   a nested if into a case structure of sorts.
2810
2811                 */
2812
2813                     int made=0;
2814                     if (!re_trie_maxbuff) {
2815                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2816                         if (!SvIOK(re_trie_maxbuff))
2817                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2818                     }
2819                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2820                         regnode *cur;
2821                         regnode *first = (regnode *)NULL;
2822                         regnode *last = (regnode *)NULL;
2823                         regnode *tail = scan;
2824                         U8 optype = 0;
2825                         U32 count=0;
2826
2827 #ifdef DEBUGGING
2828                         SV * const mysv = sv_newmortal();       /* for dumping */
2829 #endif
2830                         /* var tail is used because there may be a TAIL
2831                            regop in the way. Ie, the exacts will point to the
2832                            thing following the TAIL, but the last branch will
2833                            point at the TAIL. So we advance tail. If we
2834                            have nested (?:) we may have to move through several
2835                            tails.
2836                          */
2837
2838                         while ( OP( tail ) == TAIL ) {
2839                             /* this is the TAIL generated by (?:) */
2840                             tail = regnext( tail );
2841                         }
2842
2843
2844                         DEBUG_OPTIMISE_r({
2845                             regprop(RExC_rx, mysv, tail );
2846                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2847                                 (int)depth * 2 + 2, "",
2848                                 "Looking for TRIE'able sequences. Tail node is: ",
2849                                 SvPV_nolen_const( mysv )
2850                             );
2851                         });
2852
2853                         /*
2854
2855                            step through the branches, cur represents each
2856                            branch, noper is the first thing to be matched
2857                            as part of that branch and noper_next is the
2858                            regnext() of that node. if noper is an EXACT
2859                            and noper_next is the same as scan (our current
2860                            position in the regex) then the EXACT branch is
2861                            a possible optimization target. Once we have
2862                            two or more consecutive such branches we can
2863                            create a trie of the EXACT's contents and stich
2864                            it in place. If the sequence represents all of
2865                            the branches we eliminate the whole thing and
2866                            replace it with a single TRIE. If it is a
2867                            subsequence then we need to stitch it in. This
2868                            means the first branch has to remain, and needs
2869                            to be repointed at the item on the branch chain
2870                            following the last branch optimized. This could
2871                            be either a BRANCH, in which case the
2872                            subsequence is internal, or it could be the
2873                            item following the branch sequence in which
2874                            case the subsequence is at the end.
2875
2876                         */
2877
2878                         /* dont use tail as the end marker for this traverse */
2879                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2880                             regnode * const noper = NEXTOPER( cur );
2881 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2882                             regnode * const noper_next = regnext( noper );
2883 #endif
2884
2885                             DEBUG_OPTIMISE_r({
2886                                 regprop(RExC_rx, mysv, cur);
2887                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2888                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2889
2890                                 regprop(RExC_rx, mysv, noper);
2891                                 PerlIO_printf( Perl_debug_log, " -> %s",
2892                                     SvPV_nolen_const(mysv));
2893
2894                                 if ( noper_next ) {
2895                                   regprop(RExC_rx, mysv, noper_next );
2896                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2897                                     SvPV_nolen_const(mysv));
2898                                 }
2899                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2900                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2901                             });
2902                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2903                                          : PL_regkind[ OP( noper ) ] == EXACT )
2904                                   || OP(noper) == NOTHING )
2905 #ifdef NOJUMPTRIE
2906                                   && noper_next == tail
2907 #endif
2908                                   && count < U16_MAX)
2909                             {
2910                                 count++;
2911                                 if ( !first || optype == NOTHING ) {
2912                                     if (!first) first = cur;
2913                                     optype = OP( noper );
2914                                 } else {
2915                                     last = cur;
2916                                 }
2917                             } else {
2918 /*
2919     Currently we do not believe that the trie logic can
2920     handle case insensitive matching properly when the
2921     pattern is not unicode (thus forcing unicode semantics).
2922
2923     If/when this is fixed the following define can be swapped
2924     in below to fully enable trie logic.
2925
2926 #define TRIE_TYPE_IS_SAFE 1
2927
2928 */
2929 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2930
2931                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2932                                     make_trie( pRExC_state,
2933                                             startbranch, first, cur, tail, count,
2934                                             optype, depth+1 );
2935                                 }
2936                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2937 #ifdef NOJUMPTRIE
2938                                      && noper_next == tail
2939 #endif
2940                                 ){
2941                                     count = 1;
2942                                     first = cur;
2943                                     optype = OP( noper );
2944                                 } else {
2945                                     count = 0;
2946                                     first = NULL;
2947                                     optype = 0;
2948                                 }
2949                                 last = NULL;
2950                             }
2951                         }
2952                         DEBUG_OPTIMISE_r({
2953                             regprop(RExC_rx, mysv, cur);
2954                             PerlIO_printf( Perl_debug_log,
2955                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2956                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2957
2958                         });
2959
2960                         if ( last && TRIE_TYPE_IS_SAFE ) {
2961                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2962 #ifdef TRIE_STUDY_OPT
2963                             if ( ((made == MADE_EXACT_TRIE &&
2964                                  startbranch == first)
2965                                  || ( first_non_open == first )) &&
2966                                  depth==0 ) {
2967                                 flags |= SCF_TRIE_RESTUDY;
2968                                 if ( startbranch == first
2969                                      && scan == tail )
2970                                 {
2971                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2972                                 }
2973                             }
2974 #endif
2975                         }
2976                     }
2977
2978                 } /* do trie */
2979
2980             }
2981             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2982                 scan = NEXTOPER(NEXTOPER(scan));
2983             } else                      /* single branch is optimized. */
2984                 scan = NEXTOPER(scan);
2985             continue;
2986         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2987             scan_frame *newframe = NULL;
2988             I32 paren;
2989             regnode *start;
2990             regnode *end;
2991
2992             if (OP(scan) != SUSPEND) {
2993             /* set the pointer */
2994                 if (OP(scan) == GOSUB) {
2995                     paren = ARG(scan);
2996                     RExC_recurse[ARG2L(scan)] = scan;
2997                     start = RExC_open_parens[paren-1];
2998                     end   = RExC_close_parens[paren-1];
2999                 } else {
3000                     paren = 0;
3001                     start = RExC_rxi->program + 1;
3002                     end   = RExC_opend;
3003                 }
3004                 if (!recursed) {
3005                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3006                     SAVEFREEPV(recursed);
3007                 }
3008                 if (!PAREN_TEST(recursed,paren+1)) {
3009                     PAREN_SET(recursed,paren+1);
3010                     Newx(newframe,1,scan_frame);
3011                 } else {
3012                     if (flags & SCF_DO_SUBSTR) {
3013                         SCAN_COMMIT(pRExC_state,data,minlenp);
3014                         data->longest = &(data->longest_float);
3015                     }
3016                     is_inf = is_inf_internal = 1;
3017                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3018                         cl_anything(pRExC_state, data->start_class);
3019                     flags &= ~SCF_DO_STCLASS;
3020                 }
3021             } else {
3022                 Newx(newframe,1,scan_frame);
3023                 paren = stopparen;
3024                 start = scan+2;
3025                 end = regnext(scan);
3026             }
3027             if (newframe) {
3028                 assert(start);
3029                 assert(end);
3030                 SAVEFREEPV(newframe);
3031                 newframe->next = regnext(scan);
3032                 newframe->last = last;
3033                 newframe->stop = stopparen;
3034                 newframe->prev = frame;
3035
3036                 frame = newframe;
3037                 scan =  start;
3038                 stopparen = paren;
3039                 last = end;
3040
3041                 continue;
3042             }
3043         }
3044         else if (OP(scan) == EXACT) {
3045             I32 l = STR_LEN(scan);
3046             UV uc;
3047             if (UTF) {
3048                 const U8 * const s = (U8*)STRING(scan);
3049                 l = utf8_length(s, s + l);
3050                 uc = utf8_to_uvchr(s, NULL);
3051             } else {
3052                 uc = *((U8*)STRING(scan));
3053             }
3054             min += l;
3055             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3056                 /* The code below prefers earlier match for fixed
3057                    offset, later match for variable offset.  */
3058                 if (data->last_end == -1) { /* Update the start info. */
3059                     data->last_start_min = data->pos_min;
3060                     data->last_start_max = is_inf
3061                         ? I32_MAX : data->pos_min + data->pos_delta;
3062                 }
3063                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3064                 if (UTF)
3065                     SvUTF8_on(data->last_found);
3066                 {
3067                     SV * const sv = data->last_found;
3068                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3069                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3070                     if (mg && mg->mg_len >= 0)
3071                         mg->mg_len += utf8_length((U8*)STRING(scan),
3072                                                   (U8*)STRING(scan)+STR_LEN(scan));
3073                 }
3074                 data->last_end = data->pos_min + l;
3075                 data->pos_min += l; /* As in the first entry. */
3076                 data->flags &= ~SF_BEFORE_EOL;
3077             }
3078             if (flags & SCF_DO_STCLASS_AND) {
3079                 /* Check whether it is compatible with what we know already! */
3080                 int compat = 1;
3081
3082
3083                 /* If compatible, we or it in below.  It is compatible if is
3084                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3085                  * it's for a locale.  Even if there isn't unicode semantics
3086                  * here, at runtime there may be because of matching against a
3087                  * utf8 string, so accept a possible false positive for
3088                  * latin1-range folds */
3089                 if (uc >= 0x100 ||
3090                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3091                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3092                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3093                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3094                     )
3095                     compat = 0;
3096                 ANYOF_CLASS_ZERO(data->start_class);
3097                 ANYOF_BITMAP_ZERO(data->start_class);
3098                 if (compat)
3099                     ANYOF_BITMAP_SET(data->start_class, uc);
3100                 data->start_class->flags &= ~ANYOF_EOS;
3101                 if (uc < 0x100)
3102                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3103             }
3104             else if (flags & SCF_DO_STCLASS_OR) {
3105                 /* false positive possible if the class is case-folded */
3106                 if (uc < 0x100)
3107                     ANYOF_BITMAP_SET(data->start_class, uc);
3108                 else
3109                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3110                 data->start_class->flags &= ~ANYOF_EOS;
3111                 cl_and(data->start_class, and_withp);
3112             }
3113             flags &= ~SCF_DO_STCLASS;
3114         }
3115         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3116             I32 l = STR_LEN(scan);
3117             UV uc = *((U8*)STRING(scan));
3118
3119             /* Search for fixed substrings supports EXACT only. */
3120             if (flags & SCF_DO_SUBSTR) {
3121                 assert(data);
3122                 SCAN_COMMIT(pRExC_state, data, minlenp);
3123             }
3124             if (UTF) {
3125                 const U8 * const s = (U8 *)STRING(scan);
3126                 l = utf8_length(s, s + l);
3127                 uc = utf8_to_uvchr(s, NULL);
3128             }
3129             min += l;
3130             if (flags & SCF_DO_SUBSTR)
3131                 data->pos_min += l;
3132             if (flags & SCF_DO_STCLASS_AND) {
3133                 /* Check whether it is compatible with what we know already! */
3134                 int compat = 1;
3135                 if (uc >= 0x100 ||
3136                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3137                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3138                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3139                 {
3140                     compat = 0;
3141                 }
3142                 ANYOF_CLASS_ZERO(data->start_class);
3143                 ANYOF_BITMAP_ZERO(data->start_class);
3144                 if (compat) {
3145                     ANYOF_BITMAP_SET(data->start_class, uc);
3146                     data->start_class->flags &= ~ANYOF_EOS;
3147                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3148                     if (OP(scan) == EXACTFL) {
3149                         data->start_class->flags |= ANYOF_LOCALE;
3150                     }
3151                     else {
3152
3153                         /* Also set the other member of the fold pair.  In case
3154                          * that unicode semantics is called for at runtime, use
3155                          * the full latin1 fold.  (Can't do this for locale,
3156                          * because not known until runtime */
3157                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3158                     }
3159                 }
3160             }
3161             else if (flags & SCF_DO_STCLASS_OR) {
3162                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3163                     /* false positive possible if the class is case-folded.
3164                        Assume that the locale settings are the same... */
3165                     if (uc < 0x100) {
3166                         ANYOF_BITMAP_SET(data->start_class, uc);
3167                         if (OP(scan) != EXACTFL) {
3168
3169                             /* And set the other member of the fold pair, but
3170                              * can't do that in locale because not known until
3171                              * run-time */
3172                             ANYOF_BITMAP_SET(data->start_class,
3173                                              PL_fold_latin1[uc]);
3174                         }
3175                     }
3176                     data->start_class->flags &= ~ANYOF_EOS;
3177                 }
3178                 cl_and(data->start_class, and_withp);
3179             }
3180             flags &= ~SCF_DO_STCLASS;
3181         }
3182         else if (REGNODE_VARIES(OP(scan))) {
3183             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3184             I32 f = flags, pos_before = 0;
3185             regnode * const oscan = scan;
3186             struct regnode_charclass_class this_class;
3187             struct regnode_charclass_class *oclass = NULL;
3188             I32 next_is_eval = 0;
3189
3190             switch (PL_regkind[OP(scan)]) {
3191             case WHILEM:                /* End of (?:...)* . */
3192                 scan = NEXTOPER(scan);
3193                 goto finish;
3194             case PLUS:
3195                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3196                     next = NEXTOPER(scan);
3197                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3198                         mincount = 1;
3199                         maxcount = REG_INFTY;
3200                         next = regnext(scan);
3201                         scan = NEXTOPER(scan);
3202                         goto do_curly;
3203                     }
3204                 }
3205                 if (flags & SCF_DO_SUBSTR)
3206                     data->pos_min++;
3207                 min++;
3208                 /* Fall through. */
3209             case STAR:
3210                 if (flags & SCF_DO_STCLASS) {
3211                     mincount = 0;
3212                     maxcount = REG_INFTY;
3213                     next = regnext(scan);
3214                     scan = NEXTOPER(scan);
3215                     goto do_curly;
3216                 }
3217                 is_inf = is_inf_internal = 1;
3218                 scan = regnext(scan);
3219                 if (flags & SCF_DO_SUBSTR) {
3220                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3221                     data->longest = &(data->longest_float);
3222                 }
3223                 goto optimize_curly_tail;
3224             case CURLY:
3225                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3226                     && (scan->flags == stopparen))
3227                 {
3228                     mincount = 1;
3229                     maxcount = 1;
3230                 } else {
3231                     mincount = ARG1(scan);
3232                     maxcount = ARG2(scan);
3233                 }
3234                 next = regnext(scan);
3235                 if (OP(scan) == CURLYX) {
3236                     I32 lp = (data ? *(data->last_closep) : 0);
3237                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3238                 }
3239                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3240                 next_is_eval = (OP(scan) == EVAL);
3241               do_curly:
3242                 if (flags & SCF_DO_SUBSTR) {
3243                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3244                     pos_before = data->pos_min;
3245                 }
3246                 if (data) {
3247                     fl = data->flags;
3248                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3249                     if (is_inf)
3250                         data->flags |= SF_IS_INF;
3251                 }
3252                 if (flags & SCF_DO_STCLASS) {
3253                     cl_init(pRExC_state, &this_class);
3254                     oclass = data->start_class;
3255                     data->start_class = &this_class;
3256                     f |= SCF_DO_STCLASS_AND;
3257                     f &= ~SCF_DO_STCLASS_OR;
3258                 }
3259                 /* Exclude from super-linear cache processing any {n,m}
3260                    regops for which the combination of input pos and regex
3261                    pos is not enough information to determine if a match
3262                    will be possible.
3263
3264                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3265                    regex pos at the \s*, the prospects for a match depend not
3266                    only on the input position but also on how many (bar\s*)
3267                    repeats into the {4,8} we are. */
3268                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3269                     f &= ~SCF_WHILEM_VISITED_POS;
3270
3271                 /* This will finish on WHILEM, setting scan, or on NULL: */
3272                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3273                                       last, data, stopparen, recursed, NULL,
3274                                       (mincount == 0
3275                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3276
3277                 if (flags & SCF_DO_STCLASS)
3278                     data->start_class = oclass;
3279                 if (mincount == 0 || minnext == 0) {
3280                     if (flags & SCF_DO_STCLASS_OR) {
3281                         cl_or(pRExC_state, data->start_class, &this_class);
3282                     }
3283                     else if (flags & SCF_DO_STCLASS_AND) {
3284                         /* Switch to OR mode: cache the old value of
3285                          * data->start_class */
3286                         INIT_AND_WITHP;
3287                         StructCopy(data->start_class, and_withp,
3288                                    struct regnode_charclass_class);
3289                         flags &= ~SCF_DO_STCLASS_AND;
3290                         StructCopy(&this_class, data->start_class,
3291                                    struct regnode_charclass_class);
3292                         flags |= SCF_DO_STCLASS_OR;
3293                         data->start_class->flags |= ANYOF_EOS;
3294                     }
3295                 } else {                /* Non-zero len */
3296                     if (flags & SCF_DO_STCLASS_OR) {
3297                         cl_or(pRExC_state, data->start_class, &this_class);
3298                         cl_and(data->start_class, and_withp);
3299                     }
3300                     else if (flags & SCF_DO_STCLASS_AND)
3301                         cl_and(data->start_class, &this_class);
3302                     flags &= ~SCF_DO_STCLASS;
3303                 }
3304                 if (!scan)              /* It was not CURLYX, but CURLY. */
3305                     scan = next;
3306                 if ( /* ? quantifier ok, except for (?{ ... }) */
3307                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3308                     && (minnext == 0) && (deltanext == 0)
3309                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3310                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3311                 {
3312                     ckWARNreg(RExC_parse,
3313                               "Quantifier unexpected on zero-length expression");
3314                 }
3315
3316                 min += minnext * mincount;
3317                 is_inf_internal |= ((maxcount == REG_INFTY
3318                                      && (minnext + deltanext) > 0)
3319                                     || deltanext == I32_MAX);
3320                 is_inf |= is_inf_internal;
3321                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3322
3323                 /* Try powerful optimization CURLYX => CURLYN. */
3324                 if (  OP(oscan) == CURLYX && data
3325                       && data->flags & SF_IN_PAR
3326                       && !(data->flags & SF_HAS_EVAL)
3327                       && !deltanext && minnext == 1 ) {
3328                     /* Try to optimize to CURLYN.  */
3329                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3330                     regnode * const nxt1 = nxt;
3331 #ifdef DEBUGGING
3332                     regnode *nxt2;
3333 #endif
3334
3335                     /* Skip open. */
3336                     nxt = regnext(nxt);
3337                     if (!REGNODE_SIMPLE(OP(nxt))
3338                         && !(PL_regkind[OP(nxt)] == EXACT
3339                              && STR_LEN(nxt) == 1))
3340                         goto nogo;
3341 #ifdef DEBUGGING
3342                     nxt2 = nxt;
3343 #endif
3344                     nxt = regnext(nxt);
3345                     if (OP(nxt) != CLOSE)
3346                         goto nogo;
3347                     if (RExC_open_parens) {
3348                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3349                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3350                     }
3351                     /* Now we know that nxt2 is the only contents: */
3352                     oscan->flags = (U8)ARG(nxt);
3353                     OP(oscan) = CURLYN;
3354                     OP(nxt1) = NOTHING; /* was OPEN. */
3355
3356 #ifdef DEBUGGING
3357                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3358                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3359                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3360                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3361                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3362                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3363 #endif
3364                 }
3365               nogo:
3366
3367                 /* Try optimization CURLYX => CURLYM. */
3368                 if (  OP(oscan) == CURLYX && data
3369                       && !(data->flags & SF_HAS_PAR)
3370                       && !(data->flags & SF_HAS_EVAL)
3371                       && !deltanext     /* atom is fixed width */
3372                       && minnext != 0   /* CURLYM can't handle zero width */
3373                 ) {
3374                     /* XXXX How to optimize if data == 0? */
3375                     /* Optimize to a simpler form.  */
3376                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3377                     regnode *nxt2;
3378
3379                     OP(oscan) = CURLYM;
3380                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3381                             && (OP(nxt2) != WHILEM))
3382                         nxt = nxt2;
3383                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3384                     /* Need to optimize away parenths. */
3385                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3386                         /* Set the parenth number.  */
3387                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3388
3389                         oscan->flags = (U8)ARG(nxt);
3390                         if (RExC_open_parens) {
3391                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3392                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3393                         }
3394                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3395                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3396
3397 #ifdef DEBUGGING
3398                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3399                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3400                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3401                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3402 #endif
3403 #if 0
3404                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3405                             regnode *nnxt = regnext(nxt1);
3406                             if (nnxt == nxt) {
3407                                 if (reg_off_by_arg[OP(nxt1)])
3408                                     ARG_SET(nxt1, nxt2 - nxt1);
3409                                 else if (nxt2 - nxt1 < U16_MAX)
3410                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3411                                 else
3412                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3413                             }
3414                             nxt1 = nnxt;
3415                         }
3416 #endif
3417                         /* Optimize again: */
3418                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3419                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3420                     }
3421                     else
3422                         oscan->flags = 0;
3423                 }
3424                 else if ((OP(oscan) == CURLYX)
3425                          && (flags & SCF_WHILEM_VISITED_POS)
3426                          /* See the comment on a similar expression above.
3427                             However, this time it's not a subexpression
3428                             we care about, but the expression itself. */
3429                          && (maxcount == REG_INFTY)
3430                          && data && ++data->whilem_c < 16) {
3431                     /* This stays as CURLYX, we can put the count/of pair. */
3432                     /* Find WHILEM (as in regexec.c) */
3433                     regnode *nxt = oscan + NEXT_OFF(oscan);
3434
3435                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3436                         nxt += ARG(nxt);
3437                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3438                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3439                 }
3440                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3441                     pars++;
3442                 if (flags & SCF_DO_SUBSTR) {
3443                     SV *last_str = NULL;
3444                     int counted = mincount != 0;
3445
3446                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3447 #if defined(SPARC64_GCC_WORKAROUND)
3448                         I32 b = 0;
3449                         STRLEN l = 0;
3450                         const char *s = NULL;
3451                         I32 old = 0;
3452
3453                         if (pos_before >= data->last_start_min)
3454                             b = pos_before;
3455                         else
3456                             b = data->last_start_min;
3457
3458                         l = 0;
3459                         s = SvPV_const(data->last_found, l);
3460                         old = b - data->last_start_min;
3461
3462 #else
3463                         I32 b = pos_before >= data->last_start_min
3464                             ? pos_before : data->last_start_min;
3465                         STRLEN l;
3466                         const char * const s = SvPV_const(data->last_found, l);
3467                         I32 old = b - data->last_start_min;
3468 #endif
3469
3470                         if (UTF)
3471                             old = utf8_hop((U8*)s, old) - (U8*)s;
3472                         l -= old;
3473                         /* Get the added string: */
3474                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3475                         if (deltanext == 0 && pos_before == b) {
3476                             /* What was added is a constant string */
3477                             if (mincount > 1) {
3478                                 SvGROW(last_str, (mincount * l) + 1);
3479                                 repeatcpy(SvPVX(last_str) + l,
3480                                           SvPVX_const(last_str), l, mincount - 1);
3481                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3482                                 /* Add additional parts. */
3483                                 SvCUR_set(data->last_found,
3484                                           SvCUR(data->last_found) - l);
3485                                 sv_catsv(data->last_found, last_str);
3486                                 {
3487                                     SV * sv = data->last_found;
3488                                     MAGIC *mg =
3489                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3490                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3491                                     if (mg && mg->mg_len >= 0)
3492                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3493                                 }
3494                                 data->last_end += l * (mincount - 1);
3495                             }
3496                         } else {
3497                             /* start offset must point into the last copy */
3498                             data->last_start_min += minnext * (mincount - 1);
3499                             data->last_start_max += is_inf ? I32_MAX
3500                                 : (maxcount - 1) * (minnext + data->pos_delta);
3501                         }
3502                     }
3503                     /* It is counted once already... */
3504                     data->pos_min += minnext * (mincount - counted);
3505                     data->pos_delta += - counted * deltanext +
3506                         (minnext + deltanext) * maxcount - minnext * mincount;
3507                     if (mincount != maxcount) {
3508                          /* Cannot extend fixed substrings found inside
3509                             the group.  */
3510                         SCAN_COMMIT(pRExC_state,data,minlenp);
3511                         if (mincount && last_str) {
3512                             SV * const sv = data->last_found;
3513                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3514                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3515
3516                             if (mg)
3517                                 mg->mg_len = -1;
3518                             sv_setsv(sv, last_str);
3519                             data->last_end = data->pos_min;
3520                             data->last_start_min =
3521                                 data->pos_min - CHR_SVLEN(last_str);
3522                             data->last_start_max = is_inf
3523                                 ? I32_MAX
3524                                 : data->pos_min + data->pos_delta
3525                                 - CHR_SVLEN(last_str);
3526                         }
3527                         data->longest = &(data->longest_float);
3528                     }
3529                     SvREFCNT_dec(last_str);
3530                 }
3531                 if (data && (fl & SF_HAS_EVAL))
3532                     data->flags |= SF_HAS_EVAL;
3533               optimize_curly_tail:
3534                 if (OP(oscan) != CURLYX) {
3535                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3536                            && NEXT_OFF(next))
3537                         NEXT_OFF(oscan) += NEXT_OFF(next);
3538                 }
3539                 continue;
3540             default:                    /* REF, ANYOFV, and CLUMP only? */
3541                 if (flags & SCF_DO_SUBSTR) {
3542                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3543                     data->longest = &(data->longest_float);
3544                 }
3545                 is_inf = is_inf_internal = 1;
3546                 if (flags & SCF_DO_STCLASS_OR)
3547                     cl_anything(pRExC_state, data->start_class);
3548                 flags &= ~SCF_DO_STCLASS;
3549                 break;
3550             }
3551         }
3552         else if (OP(scan) == LNBREAK) {
3553             if (flags & SCF_DO_STCLASS) {
3554                 int value = 0;
3555                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3556                 if (flags & SCF_DO_STCLASS_AND) {
3557                     for (value = 0; value < 256; value++)
3558                         if (!is_VERTWS_cp(value))
3559                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3560                 }
3561                 else {
3562                     for (value = 0; value < 256; value++)
3563                         if (is_VERTWS_cp(value))
3564                             ANYOF_BITMAP_SET(data->start_class, value);
3565                 }
3566                 if (flags & SCF_DO_STCLASS_OR)
3567                     cl_and(data->start_class, and_withp);
3568                 flags &= ~SCF_DO_STCLASS;
3569             }
3570             min += 1;
3571             delta += 1;
3572             if (flags & SCF_DO_SUBSTR) {
3573                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3574                 data->pos_min += 1;
3575                 data->pos_delta += 1;
3576                 data->longest = &(data->longest_float);
3577             }
3578         }
3579         else if (OP(scan) == FOLDCHAR) {
3580             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3581             flags &= ~SCF_DO_STCLASS;
3582             min += 1;
3583             delta += d;
3584             if (flags & SCF_DO_SUBSTR) {
3585                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3586                 data->pos_min += 1;
3587                 data->pos_delta += d;
3588                 data->longest = &(data->longest_float);
3589             }
3590         }
3591         else if (REGNODE_SIMPLE(OP(scan))) {
3592             int value = 0;
3593
3594             if (flags & SCF_DO_SUBSTR) {
3595                 SCAN_COMMIT(pRExC_state,data,minlenp);
3596                 data->pos_min++;
3597             }
3598             min++;
3599             if (flags & SCF_DO_STCLASS) {
3600                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3601
3602                 /* Some of the logic below assumes that switching
3603                    locale on will only add false positives. */
3604                 switch (PL_regkind[OP(scan)]) {
3605                 case SANY:
3606                 default:
3607                   do_default:
3608                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3609                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3610                         cl_anything(pRExC_state, data->start_class);
3611                     break;
3612                 case REG_ANY:
3613                     if (OP(scan) == SANY)
3614                         goto do_default;
3615                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3616                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3617                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3618                         cl_anything(pRExC_state, data->start_class);
3619                     }
3620                     if (flags & SCF_DO_STCLASS_AND || !value)
3621                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3622                     break;
3623                 case ANYOF:
3624                     if (flags & SCF_DO_STCLASS_AND)
3625                         cl_and(data->start_class,
3626                                (struct regnode_charclass_class*)scan);
3627                     else
3628                         cl_or(pRExC_state, data->start_class,
3629                               (struct regnode_charclass_class*)scan);
3630                     break;
3631                 case ALNUM:
3632                     if (flags & SCF_DO_STCLASS_AND) {
3633                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3634                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3635                             if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3636                                 for (value = 0; value < 256; value++) {
3637                                     if (!isWORDCHAR_L1(value)) {
3638                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3639                                     }
3640                                 }
3641                             } else {
3642                                 for (value = 0; value < 256; value++) {
3643                                     if (!isALNUM(value)) {
3644                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3645                                     }
3646                                 }
3647                             }
3648                         }
3649                     }
3650                     else {
3651                         if (data->start_class->flags & ANYOF_LOCALE)
3652                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3653                         else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3654                             for (value = 0; value < 256; value++) {
3655                                 if (isWORDCHAR_L1(value)) {
3656                                     ANYOF_BITMAP_SET(data->start_class, value);
3657                                 }
3658                             }
3659                         } else {
3660                             for (value = 0; value < 256; value++) {
3661                                 if (isALNUM(value)) {
3662                                     ANYOF_BITMAP_SET(data->start_class, value);
3663                                 }
3664                             }
3665                         }
3666                     }
3667                     break;
3668                 case NALNUM:
3669                     if (flags & SCF_DO_STCLASS_AND) {
3670                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3671                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3672                             if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3673                                 for (value = 0; value < 256; value++) {
3674                                     if (isWORDCHAR_L1(value)) {
3675                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3676                                     }
3677                                 }
3678                             } else {
3679                                 for (value = 0; value < 256; value++) {
3680                                     if (isALNUM(value)) {
3681                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3682                                     }
3683                                 }
3684                             }
3685                         }
3686                     }
3687                     else {
3688                         if (data->start_class->flags & ANYOF_LOCALE)
3689                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3690                         else {
3691                             if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3692                                 for (value = 0; value < 256; value++) {
3693                                     if (! isWORDCHAR_L1(value)) {
3694                                         ANYOF_BITMAP_SET(data->start_class, value);
3695                                     }
3696                                 }
3697                             } else {
3698                                 for (value = 0; value < 256; value++) {
3699                                     if (! isALNUM(value)) {
3700                                         ANYOF_BITMAP_SET(data->start_class, value);
3701                                     }
3702                                 }
3703                             }
3704                         }
3705                     }
3706                     break;
3707                 case SPACE:
3708                     if (flags & SCF_DO_STCLASS_AND) {
3709                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3710                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3711                             if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3712                                 for (value = 0; value < 256; value++) {
3713                                     if (!isSPACE_L1(value)) {
3714                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3715                                     }
3716                                 }
3717                             } else {
3718                                 for (value = 0; value < 256; value++) {
3719                                     if (!isSPACE(value)) {
3720                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3721                                     }
3722                                 }
3723                             }
3724                         }
3725                     }
3726                     else {
3727                         if (data->start_class->flags & ANYOF_LOCALE) {
3728                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3729                         }
3730                         else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3731                             for (value = 0; value < 256; value++) {
3732                                 if (isSPACE_L1(value)) {
3733                                     ANYOF_BITMAP_SET(data->start_class, value);
3734                                 }
3735                             }
3736                         } else {
3737                             for (value = 0; value < 256; value++) {
3738                                 if (isSPACE(value)) {
3739                                     ANYOF_BITMAP_SET(data->start_class, value);
3740                                 }
3741                             }
3742                         }
3743                     }
3744                     break;
3745                 case NSPACE:
3746                     if (flags & SCF_DO_STCLASS_AND) {
3747                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3748                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3749                             if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3750                                 for (value = 0; value < 256; value++) {
3751                                     if (isSPACE_L1(value)) {
3752                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3753                                     }
3754                                 }
3755                             } else {
3756                                 for (value = 0; value < 256; value++) {
3757                                     if (isSPACE(value)) {
3758                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3759                                     }
3760                                 }
3761                             }
3762                         }
3763                     }
3764                     else {
3765                         if (data->start_class->flags & ANYOF_LOCALE)
3766                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3767                         else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
3768                             for (value = 0; value < 256; value++) {
3769                                 if (!isSPACE_L1(value)) {
3770                                     ANYOF_BITMAP_SET(data->start_class, value);
3771                                 }
3772                             }
3773                         }
3774                         else {
3775                             for (value = 0; value < 256; value++) {
3776                                 if (!isSPACE(value)) {
3777                                     ANYOF_BITMAP_SET(data->start_class, value);
3778                                 }
3779                             }
3780                         }
3781                     }
3782                     break;
3783                 case DIGIT:
3784                     if (flags & SCF_DO_STCLASS_AND) {
3785                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3786                         for (value = 0; value < 256; value++)
3787                             if (!isDIGIT(value))
3788                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3789                     }
3790                     else {
3791                         if (data->start_class->flags & ANYOF_LOCALE)
3792                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3793                         else {
3794                             for (value = 0; value < 256; value++)
3795                                 if (isDIGIT(value))
3796                                     ANYOF_BITMAP_SET(data->start_class, value);
3797                         }
3798                     }
3799                     break;
3800                 case NDIGIT:
3801                     if (flags & SCF_DO_STCLASS_AND) {
3802                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3803                         for (value = 0; value < 256; value++)
3804                             if (isDIGIT(value))
3805                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3806                     }
3807                     else {
3808                         if (data->start_class->flags & ANYOF_LOCALE)
3809                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3810                         else {
3811                             for (value = 0; value < 256; value++)
3812                                 if (!isDIGIT(value))
3813                                     ANYOF_BITMAP_SET(data->start_class, value);
3814                         }
3815                     }
3816                     break;
3817                 CASE_SYNST_FNC(VERTWS);
3818                 CASE_SYNST_FNC(HORIZWS);
3819
3820                 }
3821                 if (flags & SCF_DO_STCLASS_OR)
3822                     cl_and(data->start_class, and_withp);
3823                 flags &= ~SCF_DO_STCLASS;
3824             }
3825         }
3826         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3827             data->flags |= (OP(scan) == MEOL
3828                             ? SF_BEFORE_MEOL
3829                             : SF_BEFORE_SEOL);
3830         }
3831         else if (  PL_regkind[OP(scan)] == BRANCHJ
3832                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3833                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3834                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3835             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3836                 || OP(scan) == UNLESSM )
3837             {
3838                 /* Negative Lookahead/lookbehind
3839                    In this case we can't do fixed string optimisation.
3840                 */
3841
3842                 I32 deltanext, minnext, fake = 0;
3843                 regnode *nscan;
3844                 struct regnode_charclass_class intrnl;
3845                 int f = 0;
3846
3847                 data_fake.flags = 0;
3848                 if (data) {
3849                     data_fake.whilem_c = data->whilem_c;
3850                     data_fake.last_closep = data->last_closep;
3851                 }
3852                 else
3853                     data_fake.last_closep = &fake;
3854                 data_fake.pos_delta = delta;
3855                 if ( flags & SCF_DO_STCLASS && !scan->flags
3856                      && OP(scan) == IFMATCH ) { /* Lookahead */
3857                     cl_init(pRExC_state, &intrnl);
3858                     data_fake.start_class = &intrnl;
3859                     f |= SCF_DO_STCLASS_AND;
3860                 }
3861                 if (flags & SCF_WHILEM_VISITED_POS)
3862                     f |= SCF_WHILEM_VISITED_POS;
3863                 next = regnext(scan);
3864                 nscan = NEXTOPER(NEXTOPER(scan));
3865                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3866                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3867                 if (scan->flags) {
3868                     if (deltanext) {
3869                         FAIL("Variable length lookbehind not implemented");
3870                     }
3871                     else if (minnext > (I32)U8_MAX) {
3872                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3873                     }
3874                     scan->flags = (U8)minnext;
3875                 }
3876                 if (data) {
3877                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3878                         pars++;
3879                     if (data_fake.flags & SF_HAS_EVAL)
3880                         data->flags |= SF_HAS_EVAL;
3881                     data->whilem_c = data_fake.whilem_c;
3882                 }
3883                 if (f & SCF_DO_STCLASS_AND) {
3884                     if (flags & SCF_DO_STCLASS_OR) {
3885                         /* OR before, AND after: ideally we would recurse with
3886                          * data_fake to get the AND applied by study of the
3887                          * remainder of the pattern, and then derecurse;
3888                          * *** HACK *** for now just treat as "no information".
3889                          * See [perl #56690].
3890                          */
3891                         cl_init(pRExC_state, data->start_class);
3892                     }  else {
3893                         /* AND before and after: combine and continue */
3894                         const int was = (data->start_class->flags & ANYOF_EOS);
3895
3896                         cl_and(data->start_class, &intrnl);
3897                         if (was)
3898                             data->start_class->flags |= ANYOF_EOS;
3899                     }
3900                 }
3901             }
3902 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3903             else {
3904                 /* Positive Lookahead/lookbehind
3905                    In this case we can do fixed string optimisation,
3906                    but we must be careful about it. Note in the case of
3907                    lookbehind the positions will be offset by the minimum
3908                    length of the pattern, something we won't know about
3909                    until after the recurse.
3910                 */
3911                 I32 deltanext, fake = 0;
3912                 regnode *nscan;
3913                 struct regnode_charclass_class intrnl;
3914                 int f = 0;
3915                 /* We use SAVEFREEPV so that when the full compile
3916                     is finished perl will clean up the allocated
3917                     minlens when it's all done. This way we don't
3918                     have to worry about freeing them when we know
3919                     they wont be used, which would be a pain.
3920                  */
3921                 I32 *minnextp;
3922                 Newx( minnextp, 1, I32 );
3923                 SAVEFREEPV(minnextp);
3924
3925                 if (data) {
3926                     StructCopy(data, &data_fake, scan_data_t);
3927                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3928                         f |= SCF_DO_SUBSTR;
3929                         if (scan->flags)
3930                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3931                         data_fake.last_found=newSVsv(data->last_found);
3932                     }
3933                 }
3934                 else
3935                     data_fake.last_closep = &fake;
3936                 data_fake.flags = 0;
3937                 data_fake.pos_delta = delta;
3938                 if (is_inf)
3939                     data_fake.flags |= SF_IS_INF;
3940                 if ( flags & SCF_DO_STCLASS && !scan->flags
3941                      && OP(scan) == IFMATCH ) { /* Lookahead */
3942                     cl_init(pRExC_state, &intrnl);
3943                     data_fake.start_class = &intrnl;
3944                     f |= SCF_DO_STCLASS_AND;
3945                 }
3946                 if (flags & SCF_WHILEM_VISITED_POS)
3947                     f |= SCF_WHILEM_VISITED_POS;
3948                 next = regnext(scan);
3949                 nscan = NEXTOPER(NEXTOPER(scan));
3950
3951                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3952                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3953                 if (scan->flags) {
3954                     if (deltanext) {
3955                         FAIL("Variable length lookbehind not implemented");
3956                     }
3957                     else if (*minnextp > (I32)U8_MAX) {
3958                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3959                     }
3960                     scan->flags = (U8)*minnextp;
3961                 }
3962
3963                 *minnextp += min;
3964
3965                 if (f & SCF_DO_STCLASS_AND) {
3966                     const int was = (data->start_class->flags & ANYOF_EOS);
3967
3968                     cl_and(data->start_class, &intrnl);
3969                     if (was)
3970                         data->start_class->flags |= ANYOF_EOS;
3971                 }
3972                 if (data) {
3973                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3974                         pars++;
3975                     if (data_fake.flags & SF_HAS_EVAL)
3976                         data->flags |= SF_HAS_EVAL;
3977                     data->whilem_c = data_fake.whilem_c;
3978                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3979                         if (RExC_rx->minlen<*minnextp)
3980                             RExC_rx->minlen=*minnextp;
3981                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3982                         SvREFCNT_dec(data_fake.last_found);
3983
3984                         if ( data_fake.minlen_fixed != minlenp )
3985                         {
3986                             data->offset_fixed= data_fake.offset_fixed;
3987                             data->minlen_fixed= data_fake.minlen_fixed;
3988                             data->lookbehind_fixed+= scan->flags;
3989                         }
3990                         if ( data_fake.minlen_float != minlenp )
3991                         {
3992                             data->minlen_float= data_fake.minlen_float;
3993                             data->offset_float_min=data_fake.offset_float_min;
3994                             data->offset_float_max=data_fake.offset_float_max;
3995                             data->lookbehind_float+= scan->flags;
3996                         }
3997                     }
3998                 }
3999
4000
4001             }
4002 #endif
4003         }
4004         else if (OP(scan) == OPEN) {
4005             if (stopparen != (I32)ARG(scan))
4006                 pars++;
4007         }
4008         else if (OP(scan) == CLOSE) {
4009             if (stopparen == (I32)ARG(scan)) {
4010                 break;
4011             }
4012             if ((I32)ARG(scan) == is_par) {
4013                 next = regnext(scan);
4014
4015                 if ( next && (OP(next) != WHILEM) && next < last)
4016                     is_par = 0;         /* Disable optimization */
4017             }
4018             if (data)
4019                 *(data->last_closep) = ARG(scan);
4020         }
4021         else if (OP(scan) == EVAL) {
4022                 if (data)
4023                     data->flags |= SF_HAS_EVAL;
4024         }
4025         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4026             if (flags & SCF_DO_SUBSTR) {
4027                 SCAN_COMMIT(pRExC_state,data,minlenp);
4028                 flags &= ~SCF_DO_SUBSTR;
4029             }
4030             if (data && OP(scan)==ACCEPT) {
4031                 data->flags |= SCF_SEEN_ACCEPT;
4032                 if (stopmin > min)
4033                     stopmin = min;
4034             }
4035         }
4036         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4037         {
4038                 if (flags & SCF_DO_SUBSTR) {
4039                     SCAN_COMMIT(pRExC_state,data,minlenp);
4040                     data->longest = &(data->longest_float);
4041                 }
4042                 is_inf = is_inf_internal = 1;
4043                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4044                     cl_anything(pRExC_state, data->start_class);
4045                 flags &= ~SCF_DO_STCLASS;
4046         }
4047         else if (OP(scan) == GPOS) {
4048             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4049                 !(delta || is_inf || (data && data->pos_delta)))
4050             {
4051                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4052                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4053                 if (RExC_rx->gofs < (U32)min)
4054                     RExC_rx->gofs = min;
4055             } else {
4056                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4057                 RExC_rx->gofs = 0;
4058             }
4059         }
4060 #ifdef TRIE_STUDY_OPT
4061 #ifdef FULL_TRIE_STUDY
4062         else if (PL_regkind[OP(scan)] == TRIE) {
4063             /* NOTE - There is similar code to this block above for handling
4064                BRANCH nodes on the initial study.  If you change stuff here
4065                check there too. */
4066             regnode *trie_node= scan;
4067             regnode *tail= regnext(scan);
4068             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4069             I32 max1 = 0, min1 = I32_MAX;
4070             struct regnode_charclass_class accum;
4071
4072             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4073                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4074             if (flags & SCF_DO_STCLASS)
4075                 cl_init_zero(pRExC_state, &accum);
4076
4077             if (!trie->jump) {
4078                 min1= trie->minlen;
4079                 max1= trie->maxlen;
4080             } else {
4081                 const regnode *nextbranch= NULL;
4082                 U32 word;
4083
4084                 for ( word=1 ; word <= trie->wordcount ; word++)
4085                 {
4086                     I32 deltanext=0, minnext=0, f = 0, fake;
4087                     struct regnode_charclass_class this_class;
4088
4089                     data_fake.flags = 0;
4090                     if (data) {
4091                         data_fake.whilem_c = data->whilem_c;
4092                         data_fake.last_closep = data->last_closep;
4093                     }
4094                     else
4095                         data_fake.last_closep = &fake;
4096                     data_fake.pos_delta = delta;
4097                     if (flags & SCF_DO_STCLASS) {
4098                         cl_init(pRExC_state, &this_class);
4099                         data_fake.start_class = &this_class;
4100                         f = SCF_DO_STCLASS_AND;
4101                     }
4102                     if (flags & SCF_WHILEM_VISITED_POS)
4103                         f |= SCF_WHILEM_VISITED_POS;
4104
4105                     if (trie->jump[word]) {
4106                         if (!nextbranch)
4107                             nextbranch = trie_node + trie->jump[0];
4108                         scan= trie_node + trie->jump[word];
4109                         /* We go from the jump point to the branch that follows
4110                            it. Note this means we need the vestigal unused branches
4111                            even though they arent otherwise used.
4112                          */
4113                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4114                             &deltanext, (regnode *)nextbranch, &data_fake,
4115                             stopparen, recursed, NULL, f,depth+1);
4116                     }
4117                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4118                         nextbranch= regnext((regnode*)nextbranch);
4119
4120                     if (min1 > (I32)(minnext + trie->minlen))
4121                         min1 = minnext + trie->minlen;
4122                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4123                         max1 = minnext + deltanext + trie->maxlen;
4124                     if (deltanext == I32_MAX)
4125                         is_inf = is_inf_internal = 1;
4126
4127                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4128                         pars++;
4129                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4130                         if ( stopmin > min + min1)
4131                             stopmin = min + min1;
4132                         flags &= ~SCF_DO_SUBSTR;
4133                         if (data)
4134                             data->flags |= SCF_SEEN_ACCEPT;
4135                     }
4136                     if (data) {
4137                         if (data_fake.flags & SF_HAS_EVAL)
4138                             data->flags |= SF_HAS_EVAL;
4139                         data->whilem_c = data_fake.whilem_c;
4140                     }
4141                     if (flags & SCF_DO_STCLASS)
4142                         cl_or(pRExC_state, &accum, &this_class);
4143                 }
4144             }
4145             if (flags & SCF_DO_SUBSTR) {
4146                 data->pos_min += min1;
4147                 data->pos_delta += max1 - min1;
4148                 if (max1 != min1 || is_inf)
4149                     data->longest = &(data->longest_float);
4150             }
4151             min += min1;
4152             delta += max1 - min1;
4153             if (flags & SCF_DO_STCLASS_OR) {
4154                 cl_or(pRExC_state, data->start_class, &accum);
4155                 if (min1) {
4156                     cl_and(data->start_class, and_withp);
4157                     flags &= ~SCF_DO_STCLASS;
4158                 }
4159             }
4160             else if (flags & SCF_DO_STCLASS_AND) {
4161                 if (min1) {
4162                     cl_and(data->start_class, &accum);
4163                     flags &= ~SCF_DO_STCLASS;
4164                 }
4165                 else {
4166                     /* Switch to OR mode: cache the old value of
4167                      * data->start_class */
4168                     INIT_AND_WITHP;
4169                     StructCopy(data->start_class, and_withp,
4170                                struct regnode_charclass_class);
4171                     flags &= ~SCF_DO_STCLASS_AND;
4172                     StructCopy(&accum, data->start_class,
4173                                struct regnode_charclass_class);
4174                     flags |= SCF_DO_STCLASS_OR;
4175                     data->start_class->flags |= ANYOF_EOS;
4176                 }
4177             }
4178             scan= tail;
4179             continue;
4180         }
4181 #else
4182         else if (PL_regkind[OP(scan)] == TRIE) {
4183             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4184             U8*bang=NULL;
4185
4186             min += trie->minlen;
4187             delta += (trie->maxlen - trie->minlen);
4188             flags &= ~SCF_DO_STCLASS; /* xxx */
4189             if (flags & SCF_DO_SUBSTR) {
4190                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4191                 data->pos_min += trie->minlen;
4192                 data->pos_delta += (trie->maxlen - trie->minlen);
4193                 if (trie->maxlen != trie->minlen)
4194                     data->longest = &(data->longest_float);
4195             }
4196             if (trie->jump) /* no more substrings -- for now /grr*/
4197                 flags &= ~SCF_DO_SUBSTR;
4198         }
4199 #endif /* old or new */
4200 #endif /* TRIE_STUDY_OPT */
4201
4202         /* Else: zero-length, ignore. */
4203         scan = regnext(scan);
4204     }
4205     if (frame) {
4206         last = frame->last;
4207         scan = frame->next;
4208         stopparen = frame->stop;
4209         frame = frame->prev;
4210         goto fake_study_recurse;
4211     }
4212
4213   finish:
4214     assert(!frame);
4215     DEBUG_STUDYDATA("pre-fin:",data,depth);
4216
4217     *scanp = scan;
4218     *deltap = is_inf_internal ? I32_MAX : delta;
4219     if (flags & SCF_DO_SUBSTR && is_inf)
4220         data->pos_delta = I32_MAX - data->pos_min;
4221     if (is_par > (I32)U8_MAX)
4222         is_par = 0;
4223     if (is_par && pars==1 && data) {
4224         data->flags |= SF_IN_PAR;
4225         data->flags &= ~SF_HAS_PAR;
4226     }
4227     else if (pars && data) {
4228         data->flags |= SF_HAS_PAR;
4229         data->flags &= ~SF_IN_PAR;
4230     }
4231     if (flags & SCF_DO_STCLASS_OR)
4232         cl_and(data->start_class, and_withp);
4233     if (flags & SCF_TRIE_RESTUDY)
4234         data->flags |=  SCF_TRIE_RESTUDY;
4235
4236     DEBUG_STUDYDATA("post-fin:",data,depth);
4237
4238     return min < stopmin ? min : stopmin;
4239 }
4240
4241 STATIC U32
4242 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4243 {
4244     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4245
4246     PERL_ARGS_ASSERT_ADD_DATA;
4247
4248     Renewc(RExC_rxi->data,
4249            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4250            char, struct reg_data);
4251     if(count)
4252         Renew(RExC_rxi->data->what, count + n, U8);
4253     else
4254         Newx(RExC_rxi->data->what, n, U8);
4255     RExC_rxi->data->count = count + n;
4256     Copy(s, RExC_rxi->data->what + count, n, U8);
4257     return count;
4258 }
4259
4260 /*XXX: todo make this not included in a non debugging perl */
4261 #ifndef PERL_IN_XSUB_RE
4262 void
4263 Perl_reginitcolors(pTHX)
4264 {
4265     dVAR;
4266     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4267     if (s) {
4268         char *t = savepv(s);
4269         int i = 0;
4270         PL_colors[0] = t;
4271         while (++i < 6) {
4272             t = strchr(t, '\t');
4273             if (t) {
4274                 *t = '\0';
4275                 PL_colors[i] = ++t;
4276             }
4277             else
4278                 PL_colors[i] = t = (char *)"";
4279         }
4280     } else {
4281         int i = 0;
4282         while (i < 6)
4283             PL_colors[i++] = (char *)"";
4284     }
4285     PL_colorset = 1;
4286 }
4287 #endif
4288
4289
4290 #ifdef TRIE_STUDY_OPT
4291 #define CHECK_RESTUDY_GOTO                                  \
4292         if (                                                \
4293               (data.flags & SCF_TRIE_RESTUDY)               \
4294               && ! restudied++                              \
4295         )     goto reStudy
4296 #else
4297 #define CHECK_RESTUDY_GOTO
4298 #endif
4299
4300 /*
4301  - pregcomp - compile a regular expression into internal code
4302  *
4303  * We can't allocate space until we know how big the compiled form will be,
4304  * but we can't compile it (and thus know how big it is) until we've got a
4305  * place to put the code.  So we cheat:  we compile it twice, once with code
4306  * generation turned off and size counting turned on, and once "for real".
4307  * This also means that we don't allocate space until we are sure that the
4308  * thing really will compile successfully, and we never have to move the
4309  * code and thus invalidate pointers into it.  (Note that it has to be in
4310  * one piece because free() must be able to free it all.) [NB: not true in perl]
4311  *
4312  * Beware that the optimization-preparation code in here knows about some
4313  * of the structure of the compiled regexp.  [I'll say.]
4314  */
4315
4316
4317
4318 #ifndef PERL_IN_XSUB_RE
4319 #define RE_ENGINE_PTR &PL_core_reg_engine
4320 #else
4321 extern const struct regexp_engine my_reg_engine;
4322 #define RE_ENGINE_PTR &my_reg_engine
4323 #endif
4324
4325 #ifndef PERL_IN_XSUB_RE
4326 REGEXP *
4327 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4328 {
4329     dVAR;
4330     HV * const table = GvHV(PL_hintgv);
4331
4332     PERL_ARGS_ASSERT_PREGCOMP;
4333
4334     /* Dispatch a request to compile a regexp to correct
4335        regexp engine. */
4336     if (table) {
4337         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4338         GET_RE_DEBUG_FLAGS_DECL;
4339         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4340             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4341             DEBUG_COMPILE_r({
4342                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4343                     SvIV(*ptr));
4344             });
4345             return CALLREGCOMP_ENG(eng, pattern, flags);
4346         }
4347     }
4348     return Perl_re_compile(aTHX_ pattern, flags);
4349 }
4350 #endif
4351
4352 REGEXP *
4353 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4354 {
4355     dVAR;
4356     REGEXP *rx;
4357     struct regexp *r;
4358     register regexp_internal *ri;
4359     STRLEN plen;
4360     char  *exp;
4361     char* xend;
4362     regnode *scan;
4363     I32 flags;
4364     I32 minlen = 0;
4365     U32 pm_flags;
4366
4367     /* these are all flags - maybe they should be turned
4368      * into a single int with different bit masks */
4369     I32 sawlookahead = 0;
4370     I32 sawplus = 0;
4371     I32 sawopen = 0;
4372     bool used_setjump = FALSE;
4373
4374     U8 jump_ret = 0;
4375     dJMPENV;
4376     scan_data_t data;
4377     RExC_state_t RExC_state;
4378     RExC_state_t * const pRExC_state = &RExC_state;
4379 #ifdef TRIE_STUDY_OPT
4380     int restudied;
4381     RExC_state_t copyRExC_state;
4382 #endif
4383     GET_RE_DEBUG_FLAGS_DECL;
4384
4385     PERL_ARGS_ASSERT_RE_COMPILE;
4386
4387     DEBUG_r(if (!PL_colorset) reginitcolors());
4388
4389     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4390
4391     /****************** LONG JUMP TARGET HERE***********************/
4392     /* Longjmp back to here if have to switch in midstream to utf8 */
4393     if (! RExC_orig_utf8) {
4394         JMPENV_PUSH(jump_ret);
4395         used_setjump = TRUE;
4396     }
4397
4398     if (jump_ret == 0) {    /* First time through */
4399         exp = SvPV(pattern, plen);
4400         xend = exp + plen;
4401         /* ignore the utf8ness if the pattern is 0 length */
4402         if (plen == 0) {
4403             RExC_utf8 = RExC_orig_utf8 = 0;
4404         }
4405
4406         DEBUG_COMPILE_r({
4407             SV *dsv= sv_newmortal();
4408             RE_PV_QUOTED_DECL(s, RExC_utf8,
4409                 dsv, exp, plen, 60);
4410             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4411                            PL_colors[4],PL_colors[5],s);
4412         });
4413     }
4414     else {  /* longjumped back */
4415         STRLEN len = plen;
4416
4417         /* If the cause for the longjmp was other than changing to utf8, pop
4418          * our own setjmp, and longjmp to the correct handler */
4419         if (jump_ret != UTF8_LONGJMP) {
4420             JMPENV_POP;
4421             JMPENV_JUMP(jump_ret);
4422         }
4423
4424         GET_RE_DEBUG_FLAGS;
4425
4426         /* It's possible to write a regexp in ascii that represents Unicode
4427         codepoints outside of the byte range, such as via \x{100}. If we
4428         detect such a sequence we have to convert the entire pattern to utf8
4429         and then recompile, as our sizing calculation will have been based
4430         on 1 byte == 1 character, but we will need to use utf8 to encode
4431         at least some part of the pattern, and therefore must convert the whole
4432         thing.
4433         -- dmq */
4434         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4435             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4436         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4437         xend = exp + len;
4438         RExC_orig_utf8 = RExC_utf8 = 1;
4439         SAVEFREEPV(exp);
4440     }
4441
4442 #ifdef TRIE_STUDY_OPT
4443     restudied = 0;
4444 #endif
4445
4446     /* Set to use unicode semantics if the pattern is in utf8 and has the
4447      * 'depends' charset specified, as it means unicode when utf8  */
4448     pm_flags = orig_pm_flags;
4449
4450     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4451         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4452     }
4453
4454     RExC_precomp = exp;
4455     RExC_flags = pm_flags;
4456     RExC_sawback = 0;
4457
4458     RExC_seen = 0;
4459     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4460     RExC_seen_evals = 0;
4461     RExC_extralen = 0;
4462
4463     /* First pass: determine size, legality. */
4464     RExC_parse = exp;
4465     RExC_start = exp;
4466     RExC_end = xend;
4467     RExC_naughty = 0;
4468     RExC_npar = 1;
4469     RExC_nestroot = 0;
4470     RExC_size = 0L;
4471     RExC_emit = &PL_regdummy;
4472     RExC_whilem_seen = 0;
4473     RExC_open_parens = NULL;
4474     RExC_close_parens = NULL;
4475     RExC_opend = NULL;
4476     RExC_paren_names = NULL;
4477 #ifdef DEBUGGING
4478     RExC_paren_name_list = NULL;
4479 #endif
4480     RExC_recurse = NULL;
4481     RExC_recurse_count = 0;
4482
4483 #if 0 /* REGC() is (currently) a NOP at the first pass.
4484        * Clever compilers notice this and complain. --jhi */
4485     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4486 #endif
4487     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4488     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4489         RExC_precomp = NULL;
4490         return(NULL);
4491     }
4492
4493     /* Here, finished first pass.  Get rid of any added setjmp */
4494     if (used_setjump) {
4495         JMPENV_POP;
4496     }
4497     DEBUG_PARSE_r({
4498         PerlIO_printf(Perl_debug_log,
4499             "Required size %"IVdf" nodes\n"
4500             "Starting second pass (creation)\n",
4501             (IV)RExC_size);
4502         RExC_lastnum=0;
4503         RExC_lastparse=NULL;
4504     });
4505     /* Small enough for pointer-storage convention?
4506        If extralen==0, this means that we will not need long jumps. */
4507     if (RExC_size >= 0x10000L && RExC_extralen)
4508         RExC_size += RExC_extralen;
4509     else
4510         RExC_extralen = 0;
4511     if (RExC_whilem_seen > 15)
4512         RExC_whilem_seen = 15;
4513
4514     /* Allocate space and zero-initialize. Note, the two step process
4515        of zeroing when in debug mode, thus anything assigned has to
4516        happen after that */
4517     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4518     r = (struct regexp*)SvANY(rx);
4519     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4520          char, regexp_internal);
4521     if ( r == NULL || ri == NULL )
4522         FAIL("Regexp out of space");
4523 #ifdef DEBUGGING
4524     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4525     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4526 #else
4527     /* bulk initialize base fields with 0. */
4528     Zero(ri, sizeof(regexp_internal), char);
4529 #endif
4530
4531     /* non-zero initialization begins here */
4532     RXi_SET( r, ri );
4533     r->engine= RE_ENGINE_PTR;
4534     r->extflags = pm_flags;
4535     {
4536         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4537         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4538
4539         /* The caret is output if there are any defaults: if not all the STD
4540          * flags are set, or if no character set specifier is needed */
4541         bool has_default =
4542                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4543                     || ! has_charset);
4544         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4545         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4546                             >> RXf_PMf_STD_PMMOD_SHIFT);
4547         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4548         char *p;
4549         /* Allocate for the worst case, which is all the std flags are turned
4550          * on.  If more precision is desired, we could do a population count of
4551          * the flags set.  This could be done with a small lookup table, or by
4552          * shifting, masking and adding, or even, when available, assembly
4553          * language for a machine-language population count.
4554          * We never output a minus, as all those are defaults, so are
4555          * covered by the caret */
4556         const STRLEN wraplen = plen + has_p + has_runon
4557             + has_default       /* If needs a caret */
4558
4559                 /* If needs a character set specifier */
4560             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4561             + (sizeof(STD_PAT_MODS) - 1)
4562             + (sizeof("(?:)") - 1);
4563
4564         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4565         SvPOK_on(rx);
4566         SvFLAGS(rx) |= SvUTF8(pattern);
4567         *p++='('; *p++='?';
4568
4569         /* If a default, cover it using the caret */
4570         if (has_default) {
4571             *p++= DEFAULT_PAT_MOD;
4572         }
4573         if (has_charset) {
4574             STRLEN len;
4575             const char* const name = get_regex_charset_name(r->extflags, &len);
4576             Copy(name, p, len, char);
4577             p += len;
4578         }
4579         if (has_p)
4580             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4581         {
4582             char ch;
4583             while((ch = *fptr++)) {
4584                 if(reganch & 1)
4585                     *p++ = ch;
4586                 reganch >>= 1;
4587             }
4588         }
4589
4590         *p++ = ':';
4591         Copy(RExC_precomp, p, plen, char);
4592         assert ((RX_WRAPPED(rx) - p) < 16);
4593         r->pre_prefix = p - RX_WRAPPED(rx);
4594         p += plen;
4595         if (has_runon)
4596             *p++ = '\n';
4597         *p++ = ')';
4598         *p = 0;
4599         SvCUR_set(rx, p - SvPVX_const(rx));
4600     }
4601
4602     r->intflags = 0;
4603     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4604
4605     if (RExC_seen & REG_SEEN_RECURSE) {
4606         Newxz(RExC_open_parens, RExC_npar,regnode *);
4607         SAVEFREEPV(RExC_open_parens);
4608         Newxz(RExC_close_parens,RExC_npar,regnode *);
4609         SAVEFREEPV(RExC_close_parens);
4610     }
4611
4612     /* Useful during FAIL. */
4613 #ifdef RE_TRACK_PATTERN_OFFSETS
4614     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4615     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4616                           "%s %"UVuf" bytes for offset annotations.\n",
4617                           ri->u.offsets ? "Got" : "Couldn't get",
4618                           (UV)((2*RExC_size+1) * sizeof(U32))));
4619 #endif
4620     SetProgLen(ri,RExC_size);
4621     RExC_rx_sv = rx;
4622     RExC_rx = r;
4623     RExC_rxi = ri;
4624
4625     /* Second pass: emit code. */
4626     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4627     RExC_parse = exp;
4628     RExC_end = xend;
4629     RExC_naughty = 0;
4630     RExC_npar = 1;
4631     RExC_emit_start = ri->program;
4632     RExC_emit = ri->program;
4633     RExC_emit_bound = ri->program + RExC_size + 1;
4634
4635     /* Store the count of eval-groups for security checks: */
4636     RExC_rx->seen_evals = RExC_seen_evals;
4637     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4638     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4639         ReREFCNT_dec(rx);
4640         return(NULL);
4641     }
4642     /* XXXX To minimize changes to RE engine we always allocate
4643        3-units-long substrs field. */
4644     Newx(r->substrs, 1, struct reg_substr_data);
4645     if (RExC_recurse_count) {
4646         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4647         SAVEFREEPV(RExC_recurse);
4648     }
4649
4650 reStudy:
4651     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4652     Zero(r->substrs, 1, struct reg_substr_data);
4653
4654 #ifdef TRIE_STUDY_OPT
4655     if (!restudied) {
4656         StructCopy(&zero_scan_data, &data, scan_data_t);
4657         copyRExC_state = RExC_state;
4658     } else {
4659         U32 seen=RExC_seen;
4660         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4661
4662         RExC_state = copyRExC_state;
4663         if (seen & REG_TOP_LEVEL_BRANCHES)
4664             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4665         else
4666             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4667         if (data.last_found) {
4668             SvREFCNT_dec(data.longest_fixed);
4669             SvREFCNT_dec(data.longest_float);
4670             SvREFCNT_dec(data.last_found);
4671         }
4672         StructCopy(&zero_scan_data, &data, scan_data_t);
4673     }
4674 #else
4675     StructCopy(&zero_scan_data, &data, scan_data_t);
4676 #endif
4677
4678     /* Dig out information for optimizations. */
4679     r->extflags = RExC_flags; /* was pm_op */
4680     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4681
4682     if (UTF)
4683         SvUTF8_on(rx);  /* Unicode in it? */
4684     ri->regstclass = NULL;
4685     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4686         r->intflags |= PREGf_NAUGHTY;
4687     scan = ri->program + 1;             /* First BRANCH. */
4688
4689     /* testing for BRANCH here tells us whether there is "must appear"
4690        data in the pattern. If there is then we can use it for optimisations */
4691     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4692         I32 fake;
4693         STRLEN longest_float_length, longest_fixed_length;
4694         struct regnode_charclass_class ch_class; /* pointed to by data */
4695         int stclass_flag;
4696         I32 last_close = 0; /* pointed to by data */
4697         regnode *first= scan;
4698         regnode *first_next= regnext(first);
4699         /*
4700          * Skip introductions and multiplicators >= 1
4701          * so that we can extract the 'meat' of the pattern that must
4702          * match in the large if() sequence following.
4703          * NOTE that EXACT is NOT covered here, as it is normally
4704          * picked up by the optimiser separately.
4705          *
4706          * This is unfortunate as the optimiser isnt handling lookahead
4707          * properly currently.
4708          *
4709          */
4710         while ((OP(first) == OPEN && (sawopen = 1)) ||
4711                /* An OR of *one* alternative - should not happen now. */
4712             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4713             /* for now we can't handle lookbehind IFMATCH*/
4714             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4715             (OP(first) == PLUS) ||
4716             (OP(first) == MINMOD) ||
4717                /* An {n,m} with n>0 */
4718             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4719             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4720         {
4721                 /*
4722                  * the only op that could be a regnode is PLUS, all the rest
4723                  * will be regnode_1 or regnode_2.
4724                  *
4725                  */
4726                 if (OP(first) == PLUS)
4727                     sawplus = 1;
4728                 else
4729                     first += regarglen[OP(first)];
4730
4731                 first = NEXTOPER(first);
4732                 first_next= regnext(first);
4733         }
4734
4735         /* Starting-point info. */
4736       again:
4737         DEBUG_PEEP("first:",first,0);
4738         /* Ignore EXACT as we deal with it later. */
4739         if (PL_regkind[OP(first)] == EXACT) {
4740             if (OP(first) == EXACT)
4741                 NOOP;   /* Empty, get anchored substr later. */
4742             else
4743                 ri->regstclass = first;
4744         }
4745 #ifdef TRIE_STCLASS
4746         else if (PL_regkind[OP(first)] == TRIE &&
4747                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4748         {
4749             regnode *trie_op;
4750             /* this can happen only on restudy */
4751             if ( OP(first) == TRIE ) {
4752                 struct regnode_1 *trieop = (struct regnode_1 *)
4753                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4754                 StructCopy(first,trieop,struct regnode_1);
4755                 trie_op=(regnode *)trieop;
4756             } else {
4757                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4758                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4759                 StructCopy(first,trieop,struct regnode_charclass);
4760                 trie_op=(regnode *)trieop;
4761             }
4762             OP(trie_op)+=2;
4763             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4764             ri->regstclass = trie_op;
4765         }
4766 #endif
4767         else if (REGNODE_SIMPLE(OP(first)))
4768             ri->regstclass = first;
4769         else if (PL_regkind[OP(first)] == BOUND ||
4770                  PL_regkind[OP(first)] == NBOUND)
4771             ri->regstclass = first;
4772         else if (PL_regkind[OP(first)] == BOL) {
4773             r->extflags |= (OP(first) == MBOL
4774                            ? RXf_ANCH_MBOL
4775                            : (OP(first) == SBOL
4776                               ? RXf_ANCH_SBOL
4777                               : RXf_ANCH_BOL));
4778             first = NEXTOPER(first);
4779             goto again;
4780         }
4781         else if (OP(first) == GPOS) {
4782             r->extflags |= RXf_ANCH_GPOS;
4783             first = NEXTOPER(first);
4784             goto again;
4785         }
4786         else if ((!sawopen || !RExC_sawback) &&
4787             (OP(first) == STAR &&
4788             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4789             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4790         {
4791             /* turn .* into ^.* with an implied $*=1 */
4792             const int type =
4793                 (OP(NEXTOPER(first)) == REG_ANY)
4794                     ? RXf_ANCH_MBOL
4795                     : RXf_ANCH_SBOL;
4796             r->extflags |= type;
4797             r->intflags |= PREGf_IMPLICIT;
4798             first = NEXTOPER(first);
4799             goto again;
4800         }
4801         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4802             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4803             /* x+ must match at the 1st pos of run of x's */
4804             r->intflags |= PREGf_SKIP;
4805
4806         /* Scan is after the zeroth branch, first is atomic matcher. */
4807 #ifdef TRIE_STUDY_OPT
4808         DEBUG_PARSE_r(
4809             if (!restudied)
4810                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4811                               (IV)(first - scan + 1))
4812         );
4813 #else
4814         DEBUG_PARSE_r(
4815             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4816                 (IV)(first - scan + 1))
4817         );
4818 #endif
4819
4820
4821         /*
4822         * If there's something expensive in the r.e., find the
4823         * longest literal string that must appear and make it the
4824         * regmust.  Resolve ties in favor of later strings, since
4825         * the regstart check works with the beginning of the r.e.
4826         * and avoiding duplication strengthens checking.  Not a
4827         * strong reason, but sufficient in the absence of others.
4828         * [Now we resolve ties in favor of the earlier string if
4829         * it happens that c_offset_min has been invalidated, since the
4830         * earlier string may buy us something the later one won't.]
4831         */
4832
4833         data.longest_fixed = newSVpvs("");
4834         data.longest_float = newSVpvs("");
4835         data.last_found = newSVpvs("");
4836         data.longest = &(data.longest_fixed);
4837         first = scan;
4838         if (!ri->regstclass) {
4839             cl_init(pRExC_state, &ch_class);
4840             data.start_class = &ch_class;
4841             stclass_flag = SCF_DO_STCLASS_AND;
4842         } else                          /* XXXX Check for BOUND? */
4843             stclass_flag = 0;
4844         data.last_closep = &last_close;
4845
4846         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4847             &data, -1, NULL, NULL,
4848             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4849
4850
4851         CHECK_RESTUDY_GOTO;
4852
4853
4854         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4855              && data.last_start_min == 0 && data.last_end > 0
4856              && !RExC_seen_zerolen
4857              && !(RExC_seen & REG_SEEN_VERBARG)
4858              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4859             r->extflags |= RXf_CHECK_ALL;
4860         scan_commit(pRExC_state, &data,&minlen,0);
4861         SvREFCNT_dec(data.last_found);
4862
4863         /* Note that code very similar to this but for anchored string
4864            follows immediately below, changes may need to be made to both.
4865            Be careful.
4866          */
4867         longest_float_length = CHR_SVLEN(data.longest_float);
4868         if (longest_float_length
4869             || (data.flags & SF_FL_BEFORE_EOL
4870                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4871                     || (RExC_flags & RXf_PMf_MULTILINE))))
4872         {
4873             I32 t,ml;
4874
4875             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4876                 && data.offset_fixed == data.offset_float_min
4877                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4878                     goto remove_float;          /* As in (a)+. */
4879
4880             /* copy the information about the longest float from the reg_scan_data
4881                over to the program. */
4882             if (SvUTF8(data.longest_float)) {
4883                 r->float_utf8 = data.longest_float;
4884                 r->float_substr = NULL;
4885             } else {
4886                 r->float_substr = data.longest_float;
4887                 r->float_utf8 = NULL;
4888             }
4889             /* float_end_shift is how many chars that must be matched that
4890                follow this item. We calculate it ahead of time as once the
4891                lookbehind offset is added in we lose the ability to correctly
4892                calculate it.*/
4893             ml = data.minlen_float ? *(data.minlen_float)
4894                                    : (I32)longest_float_length;
4895             r->float_end_shift = ml - data.offset_float_min
4896                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4897                 + data.lookbehind_float;
4898             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4899             r->float_max_offset = data.offset_float_max;
4900             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4901                 r->float_max_offset -= data.lookbehind_float;
4902
4903             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4904                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4905                            || (RExC_flags & RXf_PMf_MULTILINE)));
4906             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4907         }
4908         else {
4909           remove_float:
4910             r->float_substr = r->float_utf8 = NULL;
4911             SvREFCNT_dec(data.longest_float);
4912             longest_float_length = 0;
4913         }
4914
4915         /* Note that code very similar to this but for floating string
4916            is immediately above, changes may need to be made to both.
4917            Be careful.
4918          */
4919         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4920         if (longest_fixed_length
4921             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4922                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4923                     || (RExC_flags & RXf_PMf_MULTILINE))))
4924         {
4925             I32 t,ml;
4926
4927             /* copy the information about the longest fixed
4928                from the reg_scan_data over to the program. */
4929             if (SvUTF8(data.longest_fixed)) {
4930                 r->anchored_utf8 = data.longest_fixed;
4931                 r->anchored_substr = NULL;
4932             } else {
4933                 r->anchored_substr = data.longest_fixed;
4934                 r->anchored_utf8 = NULL;
4935             }
4936             /* fixed_end_shift is how many chars that must be matched that
4937                follow this item. We calculate it ahead of time as once the
4938                lookbehind offset is added in we lose the ability to correctly
4939                calculate it.*/
4940             ml = data.minlen_fixed ? *(data.minlen_fixed)
4941                                    : (I32)longest_fixed_length;
4942             r->anchored_end_shift = ml - data.offset_fixed
4943                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4944                 + data.lookbehind_fixed;
4945             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4946
4947             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4948                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4949                      || (RExC_flags & RXf_PMf_MULTILINE)));
4950             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4951         }
4952         else {
4953             r->anchored_substr = r->anchored_utf8 = NULL;
4954             SvREFCNT_dec(data.longest_fixed);
4955             longest_fixed_length = 0;
4956         }
4957         if (ri->regstclass
4958             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4959             ri->regstclass = NULL;
4960
4961         /* If the synthetic start class were to ever be used when EOS is set,
4962          * that bit would have to be cleared, as it is shared with another */
4963         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4964             && stclass_flag
4965             && !(data.start_class->flags & ANYOF_EOS)
4966             && !cl_is_anything(data.start_class))
4967         {
4968             const U32 n = add_data(pRExC_state, 1, "f");
4969
4970             Newx(RExC_rxi->data->data[n], 1,
4971                 struct regnode_charclass_class);
4972             StructCopy(data.start_class,
4973                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4974                        struct regnode_charclass_class);
4975             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4976             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4977             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4978                       regprop(r, sv, (regnode*)data.start_class);
4979                       PerlIO_printf(Perl_debug_log,
4980                                     "synthetic stclass \"%s\".\n",
4981                                     SvPVX_const(sv));});
4982         }
4983
4984         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4985         if (longest_fixed_length > longest_float_length) {
4986             r->check_end_shift = r->anchored_end_shift;
4987             r->check_substr = r->anchored_substr;
4988             r->check_utf8 = r->anchored_utf8;
4989             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4990             if (r->extflags & RXf_ANCH_SINGLE)
4991                 r->extflags |= RXf_NOSCAN;
4992         }
4993         else {
4994             r->check_end_shift = r->float_end_shift;
4995             r->check_substr = r->float_substr;
4996             r->check_utf8 = r->float_utf8;
4997             r->check_offset_min = r->float_min_offset;
4998             r->check_offset_max = r->float_max_offset;
4999         }
5000         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5001            This should be changed ASAP!  */
5002         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5003             r->extflags |= RXf_USE_INTUIT;
5004             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5005                 r->extflags |= RXf_INTUIT_TAIL;
5006         }
5007         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5008         if ( (STRLEN)minlen < longest_float_length )
5009             minlen= longest_float_length;
5010         if ( (STRLEN)minlen < longest_fixed_length )
5011             minlen= longest_fixed_length;
5012         */
5013     }
5014     else {
5015         /* Several toplevels. Best we can is to set minlen. */
5016         I32 fake;
5017         struct regnode_charclass_class ch_class;
5018         I32 last_close = 0;
5019
5020         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5021
5022         scan = ri->program + 1;
5023         cl_init(pRExC_state, &ch_class);
5024         data.start_class = &ch_class;
5025         data.last_closep = &last_close;
5026
5027
5028         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5029             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5030
5031         CHECK_RESTUDY_GOTO;
5032
5033         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5034                 = r->float_substr = r->float_utf8 = NULL;
5035
5036         /* If the synthetic start class were to ever be used when EOS is set,
5037          * that bit would have to be cleared, as it is shared with another */
5038         if (!(data.start_class->flags & ANYOF_EOS)
5039             && !cl_is_anything(data.start_class))
5040         {
5041             const U32 n = add_data(pRExC_state, 1, "f");
5042
5043             Newx(RExC_rxi->data->data[n], 1,
5044                 struct regnode_charclass_class);
5045             StructCopy(data.start_class,
5046                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5047                        struct regnode_charclass_class);
5048             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5049             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5050             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5051                       regprop(r, sv, (regnode*)data.start_class);
5052                       PerlIO_printf(Perl_debug_log,
5053                                     "synthetic stclass \"%s\".\n",
5054                                     SvPVX_const(sv));});
5055         }
5056     }
5057
5058     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5059        the "real" pattern. */
5060     DEBUG_OPTIMISE_r({
5061         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5062                       (IV)minlen, (IV)r->minlen);
5063     });
5064     r->minlenret = minlen;
5065     if (r->minlen < minlen)
5066         r->minlen = minlen;
5067
5068     if (RExC_seen & REG_SEEN_GPOS)
5069         r->extflags |= RXf_GPOS_SEEN;
5070     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5071         r->extflags |= RXf_LOOKBEHIND_SEEN;
5072     if (RExC_seen & REG_SEEN_EVAL)
5073         r->extflags |= RXf_EVAL_SEEN;
5074     if (RExC_seen & REG_SEEN_CANY)
5075         r->extflags |= RXf_CANY_SEEN;
5076     if (RExC_seen & REG_SEEN_VERBARG)
5077         r->intflags |= PREGf_VERBARG_SEEN;
5078     if (RExC_seen & REG_SEEN_CUTGROUP)
5079         r->intflags |= PREGf_CUTGROUP_SEEN;
5080     if (RExC_paren_names)
5081         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5082     else
5083         RXp_PAREN_NAMES(r) = NULL;
5084
5085 #ifdef STUPID_PATTERN_CHECKS
5086     if (RX_PRELEN(rx) == 0)
5087         r->extflags |= RXf_NULL;
5088     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5089         /* XXX: this should happen BEFORE we compile */
5090         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5091     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5092         r->extflags |= RXf_WHITE;
5093     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5094         r->extflags |= RXf_START_ONLY;
5095 #else
5096     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5097             /* XXX: this should happen BEFORE we compile */
5098             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5099     else {
5100         regnode *first = ri->program + 1;
5101         U8 fop = OP(first);
5102         U8 nop = OP(NEXTOPER(first));
5103
5104         if (PL_regkind[fop] == NOTHING && nop == END)
5105             r->extflags |= RXf_NULL;
5106         else if (PL_regkind[fop] == BOL && nop == END)
5107             r->extflags |= RXf_START_ONLY;
5108         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5109             r->extflags |= RXf_WHITE;
5110     }
5111 #endif
5112 #ifdef DEBUGGING
5113     if (RExC_paren_names) {
5114         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5115         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5116     } else
5117 #endif
5118         ri->name_list_idx = 0;
5119
5120     if (RExC_recurse_count) {
5121         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5122             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5123             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5124         }
5125     }
5126     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5127     /* assume we don't need to swap parens around before we match */
5128
5129     DEBUG_DUMP_r({
5130         PerlIO_printf(Perl_debug_log,"Final program:\n");
5131         regdump(r);
5132     });
5133 #ifdef RE_TRACK_PATTERN_OFFSETS
5134     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5135         const U32 len = ri->u.offsets[0];
5136         U32 i;
5137         GET_RE_DEBUG_FLAGS_DECL;
5138         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5139         for (i = 1; i <= len; i++) {
5140             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5141                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5142                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5143             }
5144         PerlIO_printf(Perl_debug_log, "\n");
5145     });
5146 #endif
5147     return rx;
5148 }
5149
5150 #undef RE_ENGINE_PTR
5151
5152
5153 SV*
5154 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5155                     const U32 flags)
5156 {
5157     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5158
5159     PERL_UNUSED_ARG(value);
5160
5161     if (flags & RXapif_FETCH) {
5162         return reg_named_buff_fetch(rx, key, flags);
5163     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5164         Perl_croak_no_modify(aTHX);
5165         return NULL;
5166     } else if (flags & RXapif_EXISTS) {
5167         return reg_named_buff_exists(rx, key, flags)
5168             ? &PL_sv_yes
5169             : &PL_sv_no;
5170     } else if (flags & RXapif_REGNAMES) {
5171         return reg_named_buff_all(rx, flags);
5172     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5173         return reg_named_buff_scalar(rx, flags);
5174     } else {
5175         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5176         return NULL;
5177     }
5178 }
5179
5180 SV*
5181 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5182                          const U32 flags)
5183 {
5184     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5185     PERL_UNUSED_ARG(lastkey);
5186
5187     if (flags & RXapif_FIRSTKEY)
5188         return reg_named_buff_firstkey(rx, flags);
5189     else if (flags & RXapif_NEXTKEY)
5190         return reg_named_buff_nextkey(rx, flags);
5191     else {
5192         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5193         return NULL;
5194     }
5195 }
5196
5197 SV*
5198 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5199                           const U32 flags)
5200 {
5201     AV *retarray = NULL;
5202     SV *ret;
5203     struct regexp *const rx = (struct regexp *)SvANY(r);
5204
5205     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5206
5207     if (flags & RXapif_ALL)
5208         retarray=newAV();
5209
5210     if (rx && RXp_PAREN_NAMES(rx)) {
5211         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5212         if (he_str) {
5213             IV i;
5214             SV* sv_dat=HeVAL(he_str);
5215             I32 *nums=(I32*)SvPVX(sv_dat);
5216             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5217                 if ((I32)(rx->nparens) >= nums[i]
5218                     && rx->offs[nums[i]].start != -1
5219                     && rx->offs[nums[i]].end != -1)
5220                 {
5221                     ret = newSVpvs("");
5222                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5223                     if (!retarray)
5224                         return ret;
5225                 } else {
5226                     ret = newSVsv(&PL_sv_undef);
5227                 }
5228                 if (retarray)
5229                     av_push(retarray, ret);
5230             }
5231             if (retarray)
5232                 return newRV_noinc(MUTABLE_SV(retarray));
5233         }
5234     }
5235     return NULL;
5236 }
5237
5238 bool
5239 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5240                            const U32 flags)
5241 {
5242     struct regexp *const rx = (struct regexp *)SvANY(r);
5243
5244     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5245
5246     if (rx && RXp_PAREN_NAMES(rx)) {
5247         if (flags & RXapif_ALL) {
5248             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5249         } else {
5250             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5251             if (sv) {
5252                 SvREFCNT_dec(sv);
5253                 return TRUE;
5254             } else {
5255                 return FALSE;
5256             }
5257         }
5258     } else {
5259         return FALSE;
5260     }
5261 }
5262
5263 SV*
5264 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5265 {
5266     struct regexp *const rx = (struct regexp *)SvANY(r);
5267
5268     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5269
5270     if ( rx && RXp_PAREN_NAMES(rx) ) {
5271         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5272
5273         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5274     } else {
5275         return FALSE;
5276     }
5277 }
5278
5279 SV*
5280 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5281 {
5282     struct regexp *const rx = (struct regexp *)SvANY(r);
5283     GET_RE_DEBUG_FLAGS_DECL;
5284
5285     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5286
5287     if (rx && RXp_PAREN_NAMES(rx)) {
5288         HV *hv = RXp_PAREN_NAMES(rx);
5289         HE *temphe;
5290         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5291             IV i;
5292             IV parno = 0;
5293             SV* sv_dat = HeVAL(temphe);
5294             I32 *nums = (I32*)SvPVX(sv_dat);
5295             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5296                 if ((I32)(rx->lastparen) >= nums[i] &&
5297                     rx->offs[nums[i]].start != -1 &&
5298                     rx->offs[nums[i]].end != -1)
5299                 {
5300                     parno = nums[i];
5301                     break;
5302                 }
5303             }
5304             if (parno || flags & RXapif_ALL) {
5305                 return newSVhek(HeKEY_hek(temphe));
5306             }
5307         }
5308     }
5309     return NULL;
5310 }
5311
5312 SV*
5313 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5314 {
5315     SV *ret;
5316     AV *av;
5317     I32 length;
5318     struct regexp *const rx = (struct regexp *)SvANY(r);
5319
5320     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5321
5322     if (rx && RXp_PAREN_NAMES(rx)) {
5323         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5324             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5325         } else if (flags & RXapif_ONE) {
5326             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5327             av = MUTABLE_AV(SvRV(ret));
5328             length = av_len(av);
5329             SvREFCNT_dec(ret);
5330             return newSViv(length + 1);
5331         } else {
5332             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5333             return NULL;
5334         }
5335     }
5336     return &PL_sv_undef;
5337 }
5338
5339 SV*
5340 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5341 {
5342     struct regexp *const rx = (struct regexp *)SvANY(r);
5343     AV *av = newAV();
5344
5345     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5346
5347     if (rx && RXp_PAREN_NAMES(rx)) {
5348         HV *hv= RXp_PAREN_NAMES(rx);
5349         HE *temphe;
5350         (void)hv_iterinit(hv);
5351         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5352             IV i;
5353             IV parno = 0;
5354             SV* sv_dat = HeVAL(temphe);
5355             I32 *nums = (I32*)SvPVX(sv_dat);
5356             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5357                 if ((I32)(rx->lastparen) >= nums[i] &&
5358                     rx->offs[nums[i]].start != -1 &&
5359                     rx->offs[nums[i]].end != -1)
5360                 {
5361                     parno = nums[i];
5362                     break;
5363                 }
5364             }
5365             if (parno || flags & RXapif_ALL) {
5366                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5367             }
5368         }
5369     }
5370
5371     return newRV_noinc(MUTABLE_SV(av));
5372 }
5373
5374 void
5375 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5376                              SV * const sv)
5377 {
5378     struct regexp *const rx = (struct regexp *)SvANY(r);
5379     char *s = NULL;
5380     I32 i = 0;
5381     I32 s1, t1;
5382
5383     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5384
5385     if (!rx->subbeg) {
5386         sv_setsv(sv,&PL_sv_undef);
5387         return;
5388     }
5389     else
5390     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5391         /* $` */
5392         i = rx->offs[0].start;
5393         s = rx->subbeg;
5394     }
5395     else
5396     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5397         /* $' */
5398         s = rx->subbeg + rx->offs[0].end;
5399         i = rx->sublen - rx->offs[0].end;
5400     }
5401     else
5402     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5403         (s1 = rx->offs[paren].start) != -1 &&
5404         (t1 = rx->offs[paren].end) != -1)
5405     {
5406         /* $& $1 ... */
5407         i = t1 - s1;
5408         s = rx->subbeg + s1;
5409     } else {
5410         sv_setsv(sv,&PL_sv_undef);
5411         return;
5412     }
5413     assert(rx->sublen >= (s - rx->subbeg) + i );
5414     if (i >= 0) {
5415         const int oldtainted = PL_tainted;
5416         TAINT_NOT;
5417         sv_setpvn(sv, s, i);
5418         PL_tainted = oldtainted;
5419         if ( (rx->extflags & RXf_CANY_SEEN)
5420             ? (RXp_MATCH_UTF8(rx)
5421                         && (!i || is_utf8_string((U8*)s, i)))
5422             : (RXp_MATCH_UTF8(rx)) )
5423         {
5424             SvUTF8_on(sv);
5425         }
5426         else
5427             SvUTF8_off(sv);
5428         if (PL_tainting) {
5429             if (RXp_MATCH_TAINTED(rx)) {
5430                 if (SvTYPE(sv) >= SVt_PVMG) {
5431                     MAGIC* const mg = SvMAGIC(sv);
5432                     MAGIC* mgt;
5433                     PL_tainted = 1;
5434                     SvMAGIC_set(sv, mg->mg_moremagic);
5435                     SvTAINT(sv);
5436                     if ((mgt = SvMAGIC(sv))) {
5437                         mg->mg_moremagic = mgt;
5438                         SvMAGIC_set(sv, mg);
5439                     }
5440                 } else {
5441                     PL_tainted = 1;
5442                     SvTAINT(sv);
5443                 }
5444             } else
5445                 SvTAINTED_off(sv);
5446         }
5447     } else {
5448         sv_setsv(sv,&PL_sv_undef);
5449         return;
5450     }
5451 }
5452
5453 void
5454 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5455                                                          SV const * const value)
5456 {
5457     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5458
5459     PERL_UNUSED_ARG(rx);
5460     PERL_UNUSED_ARG(paren);
5461     PERL_UNUSED_ARG(value);
5462
5463     if (!PL_localizing)
5464         Perl_croak_no_modify(aTHX);
5465 }
5466
5467 I32
5468 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5469                               const I32 paren)
5470 {
5471     struct regexp *const rx = (struct regexp *)SvANY(r);
5472     I32 i;
5473     I32 s1, t1;
5474
5475     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5476
5477     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5478         switch (paren) {
5479       /* $` / ${^PREMATCH} */
5480       case RX_BUFF_IDX_PREMATCH:
5481         if (rx->offs[0].start != -1) {
5482                         i = rx->offs[0].start;
5483                         if (i > 0) {
5484                                 s1 = 0;
5485                                 t1 = i;
5486                                 goto getlen;
5487                         }
5488             }
5489         return 0;
5490       /* $' / ${^POSTMATCH} */
5491       case RX_BUFF_IDX_POSTMATCH:
5492             if (rx->offs[0].end != -1) {
5493                         i = rx->sublen - rx->offs[0].end;
5494                         if (i > 0) {
5495                                 s1 = rx->offs[0].end;
5496                                 t1 = rx->sublen;
5497                                 goto getlen;
5498                         }
5499             }
5500         return 0;
5501       /* $& / ${^MATCH}, $1, $2, ... */
5502       default:
5503             if (paren <= (I32)rx->nparens &&
5504             (s1 = rx->offs[paren].start) != -1 &&
5505             (t1 = rx->offs[paren].end) != -1)
5506             {
5507             i = t1 - s1;
5508             goto getlen;
5509         } else {
5510             if (ckWARN(WARN_UNINITIALIZED))
5511                 report_uninit((const SV *)sv);
5512             return 0;
5513         }
5514     }
5515   getlen:
5516     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5517         const char * const s = rx->subbeg + s1;
5518         const U8 *ep;
5519         STRLEN el;
5520
5521         i = t1 - s1;
5522         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5523                         i = el;
5524     }
5525     return i;
5526 }
5527
5528 SV*
5529 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5530 {
5531     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5532         PERL_UNUSED_ARG(rx);
5533         if (0)
5534             return NULL;
5535         else
5536             return newSVpvs("Regexp");
5537 }
5538
5539 /* Scans the name of a named buffer from the pattern.
5540  * If flags is REG_RSN_RETURN_NULL returns null.
5541  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5542  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5543  * to the parsed name as looked up in the RExC_paren_names hash.
5544  * If there is an error throws a vFAIL().. type exception.
5545  */
5546
5547 #define REG_RSN_RETURN_NULL    0
5548 #define REG_RSN_RETURN_NAME    1
5549 #define REG_RSN_RETURN_DATA    2
5550
5551 STATIC SV*
5552 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5553 {
5554     char *name_start = RExC_parse;
5555
5556     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5557
5558     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5559          /* skip IDFIRST by using do...while */
5560         if (UTF)
5561             do {
5562                 RExC_parse += UTF8SKIP(RExC_parse);
5563             } while (isALNUM_utf8((U8*)RExC_parse));
5564         else
5565             do {
5566                 RExC_parse++;
5567             } while (isALNUM(*RExC_parse));
5568     }
5569
5570     if ( flags ) {
5571         SV* sv_name
5572             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5573                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5574         if ( flags == REG_RSN_RETURN_NAME)
5575             return sv_name;
5576         else if (flags==REG_RSN_RETURN_DATA) {
5577             HE *he_str = NULL;
5578             SV *sv_dat = NULL;
5579             if ( ! sv_name )      /* should not happen*/
5580                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5581             if (RExC_paren_names)
5582                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5583             if ( he_str )
5584                 sv_dat = HeVAL(he_str);
5585             if ( ! sv_dat )
5586                 vFAIL("Reference to nonexistent named group");
5587             return sv_dat;
5588         }
5589         else {
5590             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5591         }
5592         /* NOT REACHED */
5593     }
5594     return NULL;
5595 }
5596
5597 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5598     int rem=(int)(RExC_end - RExC_parse);                       \
5599     int cut;                                                    \
5600     int num;                                                    \
5601     int iscut=0;                                                \
5602     if (rem>10) {                                               \
5603         rem=10;                                                 \
5604         iscut=1;                                                \
5605     }                                                           \
5606     cut=10-rem;                                                 \
5607     if (RExC_lastparse!=RExC_parse)                             \
5608         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5609             rem, RExC_parse,                                    \
5610             cut + 4,                                            \
5611             iscut ? "..." : "<"                                 \
5612         );                                                      \
5613     else                                                        \
5614         PerlIO_printf(Perl_debug_log,"%16s","");                \
5615                                                                 \
5616     if (SIZE_ONLY)                                              \
5617        num = RExC_size + 1;                                     \
5618     else                                                        \
5619        num=REG_NODE_NUM(RExC_emit);                             \
5620     if (RExC_lastnum!=num)                                      \
5621        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5622     else                                                        \
5623        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5624     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5625         (int)((depth*2)), "",                                   \
5626         (funcname)                                              \
5627     );                                                          \
5628     RExC_lastnum=num;                                           \
5629     RExC_lastparse=RExC_parse;                                  \
5630 })
5631
5632
5633
5634 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5635     DEBUG_PARSE_MSG((funcname));                            \
5636     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5637 })
5638 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5639     DEBUG_PARSE_MSG((funcname));                            \
5640     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5641 })
5642 /*
5643  - reg - regular expression, i.e. main body or parenthesized thing
5644  *
5645  * Caller must absorb opening parenthesis.
5646  *
5647  * Combining parenthesis handling with the base level of regular expression
5648  * is a trifle forced, but the need to tie the tails of the branches to what
5649  * follows makes it hard to avoid.
5650  */
5651 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5652 #ifdef DEBUGGING
5653 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5654 #else
5655 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5656 #endif
5657
5658 STATIC regnode *
5659 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5660     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5661 {
5662     dVAR;
5663     register regnode *ret;              /* Will be the head of the group. */
5664     register regnode *br;
5665     register regnode *lastbr;
5666     register regnode *ender = NULL;
5667     register I32 parno = 0;
5668     I32 flags;
5669     U32 oregflags = RExC_flags;
5670     bool have_branch = 0;
5671     bool is_open = 0;
5672     I32 freeze_paren = 0;
5673     I32 after_freeze = 0;
5674
5675     /* for (?g), (?gc), and (?o) warnings; warning
5676        about (?c) will warn about (?g) -- japhy    */
5677
5678 #define WASTED_O  0x01
5679 #define WASTED_G  0x02
5680 #define WASTED_C  0x04
5681 #define WASTED_GC (0x02|0x04)
5682     I32 wastedflags = 0x00;
5683
5684     char * parse_start = RExC_parse; /* MJD */
5685     char * const oregcomp_parse = RExC_parse;
5686
5687     GET_RE_DEBUG_FLAGS_DECL;
5688
5689     PERL_ARGS_ASSERT_REG;
5690     DEBUG_PARSE("reg ");
5691
5692     *flagp = 0;                         /* Tentatively. */
5693
5694
5695     /* Make an OPEN node, if parenthesized. */
5696     if (paren) {
5697         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5698             char *start_verb = RExC_parse;
5699             STRLEN verb_len = 0;
5700             char *start_arg = NULL;
5701             unsigned char op = 0;
5702             int argok = 1;
5703             int internal_argval = 0; /* internal_argval is only useful if !argok */
5704             while ( *RExC_parse && *RExC_parse != ')' ) {
5705                 if ( *RExC_parse == ':' ) {
5706                     start_arg = RExC_parse + 1;
5707                     break;
5708                 }
5709                 RExC_parse++;
5710             }
5711             ++start_verb;
5712             verb_len = RExC_parse - start_verb;
5713             if ( start_arg ) {
5714                 RExC_parse++;
5715                 while ( *RExC_parse && *RExC_parse != ')' )
5716                     RExC_parse++;
5717                 if ( *RExC_parse != ')' )
5718                     vFAIL("Unterminated verb pattern argument");
5719                 if ( RExC_parse == start_arg )
5720                     start_arg = NULL;
5721             } else {
5722                 if ( *RExC_parse != ')' )
5723                     vFAIL("Unterminated verb pattern");
5724             }
5725
5726             switch ( *start_verb ) {
5727             case 'A':  /* (*ACCEPT) */
5728                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5729                     op = ACCEPT;
5730                     internal_argval = RExC_nestroot;
5731                 }
5732                 break;
5733             case 'C':  /* (*COMMIT) */
5734                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5735                     op = COMMIT;
5736                 break;
5737             case 'F':  /* (*FAIL) */
5738                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5739                     op = OPFAIL;
5740                     argok = 0;
5741                 }
5742                 break;
5743             case ':':  /* (*:NAME) */
5744             case 'M':  /* (*MARK:NAME) */
5745                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5746                     op = MARKPOINT;
5747                     argok = -1;
5748                 }
5749                 break;
5750             case 'P':  /* (*PRUNE) */
5751                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5752                     op = PRUNE;
5753                 break;
5754             case 'S':   /* (*SKIP) */
5755                 if ( memEQs(start_verb,verb_len,"SKIP") )
5756                     op = SKIP;
5757                 break;
5758             case 'T':  /* (*THEN) */
5759                 /* [19:06] <TimToady> :: is then */
5760                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5761                     op = CUTGROUP;
5762                     RExC_seen |= REG_SEEN_CUTGROUP;
5763                 }
5764                 break;
5765             }
5766             if ( ! op ) {
5767                 RExC_parse++;
5768                 vFAIL3("Unknown verb pattern '%.*s'",
5769                     verb_len, start_verb);
5770             }
5771             if ( argok ) {
5772                 if ( start_arg && internal_argval ) {
5773                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5774                         verb_len, start_verb);
5775                 } else if ( argok < 0 && !start_arg ) {
5776                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5777                         verb_len, start_verb);
5778                 } else {
5779                     ret = reganode(pRExC_state, op, internal_argval);
5780                     if ( ! internal_argval && ! SIZE_ONLY ) {
5781                         if (start_arg) {
5782                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5783                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5784                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5785                             ret->flags = 0;
5786                         } else {
5787                             ret->flags = 1;
5788                         }
5789                     }
5790                 }
5791                 if (!internal_argval)
5792                     RExC_seen |= REG_SEEN_VERBARG;
5793             } else if ( start_arg ) {
5794                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5795                         verb_len, start_verb);
5796             } else {
5797                 ret = reg_node(pRExC_state, op);
5798             }
5799             nextchar(pRExC_state);
5800             return ret;
5801         } else
5802         if (*RExC_parse == '?') { /* (?...) */
5803             bool is_logical = 0;
5804             const char * const seqstart = RExC_parse;
5805             bool has_use_defaults = FALSE;
5806
5807             RExC_parse++;
5808             paren = *RExC_parse++;
5809             ret = NULL;                 /* For look-ahead/behind. */
5810             switch (paren) {
5811
5812             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5813                 paren = *RExC_parse++;
5814                 if ( paren == '<')         /* (?P<...>) named capture */
5815                     goto named_capture;
5816                 else if (paren == '>') {   /* (?P>name) named recursion */
5817                     goto named_recursion;
5818                 }
5819                 else if (paren == '=') {   /* (?P=...)  named backref */
5820                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5821                        you change this make sure you change that */
5822                     char* name_start = RExC_parse;
5823                     U32 num = 0;
5824                     SV *sv_dat = reg_scan_name(pRExC_state,
5825                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5826                     if (RExC_parse == name_start || *RExC_parse != ')')
5827                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5828
5829                     if (!SIZE_ONLY) {
5830                         num = add_data( pRExC_state, 1, "S" );
5831                         RExC_rxi->data->data[num]=(void*)sv_dat;
5832                         SvREFCNT_inc_simple_void(sv_dat);
5833                     }
5834                     RExC_sawback = 1;
5835                     ret = reganode(pRExC_state,
5836                                    ((! FOLD)
5837                                      ? NREF
5838                                      : (UNI_SEMANTICS)
5839                                        ? NREFFU
5840                                        : (LOC)
5841                                          ? NREFFL
5842                                          : NREFF),
5843                                     num);
5844                     *flagp |= HASWIDTH;
5845
5846                     Set_Node_Offset(ret, parse_start+1);
5847                     Set_Node_Cur_Length(ret); /* MJD */
5848
5849                     nextchar(pRExC_state);
5850                     return ret;
5851                 }
5852                 RExC_parse++;
5853                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5854                 /*NOTREACHED*/
5855             case '<':           /* (?<...) */
5856                 if (*RExC_parse == '!')
5857                     paren = ',';
5858                 else if (*RExC_parse != '=')
5859               named_capture:
5860                 {               /* (?<...>) */
5861                     char *name_start;
5862                     SV *svname;
5863                     paren= '>';
5864             case '\'':          /* (?'...') */
5865                     name_start= RExC_parse;
5866                     svname = reg_scan_name(pRExC_state,
5867                         SIZE_ONLY ?  /* reverse test from the others */
5868                         REG_RSN_RETURN_NAME :
5869                         REG_RSN_RETURN_NULL);
5870                     if (RExC_parse == name_start) {
5871                         RExC_parse++;
5872                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5873                         /*NOTREACHED*/
5874                     }
5875                     if (*RExC_parse != paren)
5876                         vFAIL2("Sequence (?%c... not terminated",
5877                             paren=='>' ? '<' : paren);
5878                     if (SIZE_ONLY) {
5879                         HE *he_str;
5880                         SV *sv_dat = NULL;
5881                         if (!svname) /* shouldn't happen */
5882                             Perl_croak(aTHX_
5883                                 "panic: reg_scan_name returned NULL");
5884                         if (!RExC_paren_names) {
5885                             RExC_paren_names= newHV();
5886                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5887 #ifdef DEBUGGING
5888                             RExC_paren_name_list= newAV();
5889                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5890 #endif
5891                         }
5892                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5893                         if ( he_str )
5894                             sv_dat = HeVAL(he_str);
5895                         if ( ! sv_dat ) {
5896                             /* croak baby croak */
5897                             Perl_croak(aTHX_
5898                                 "panic: paren_name hash element allocation failed");
5899                         } else if ( SvPOK(sv_dat) ) {
5900                             /* (?|...) can mean we have dupes so scan to check
5901                                its already been stored. Maybe a flag indicating
5902                                we are inside such a construct would be useful,
5903                                but the arrays are likely to be quite small, so
5904                                for now we punt -- dmq */
5905                             IV count = SvIV(sv_dat);
5906                             I32 *pv = (I32*)SvPVX(sv_dat);
5907                             IV i;
5908                             for ( i = 0 ; i < count ; i++ ) {
5909                                 if ( pv[i] == RExC_npar ) {
5910                                     count = 0;
5911                                     break;
5912                                 }
5913                             }
5914                             if ( count ) {
5915                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5916                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5917                                 pv[count] = RExC_npar;
5918                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5919                             }
5920                         } else {
5921                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5922                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5923                             SvIOK_on(sv_dat);
5924                             SvIV_set(sv_dat, 1);
5925                         }
5926 #ifdef DEBUGGING
5927                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5928                             SvREFCNT_dec(svname);
5929 #endif
5930
5931                         /*sv_dump(sv_dat);*/
5932                     }
5933                     nextchar(pRExC_state);
5934                     paren = 1;
5935                     goto capturing_parens;
5936                 }
5937                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5938                 RExC_parse++;
5939             case '=':           /* (?=...) */
5940                 RExC_seen_zerolen++;
5941                 break;
5942             case '!':           /* (?!...) */
5943                 RExC_seen_zerolen++;
5944                 if (*RExC_parse == ')') {
5945                     ret=reg_node(pRExC_state, OPFAIL);
5946                     nextchar(pRExC_state);
5947                     return ret;
5948                 }
5949                 break;
5950             case '|':           /* (?|...) */
5951                 /* branch reset, behave like a (?:...) except that
5952                    buffers in alternations share the same numbers */
5953                 paren = ':';
5954                 after_freeze = freeze_paren = RExC_npar;
5955                 break;
5956             case ':':           /* (?:...) */
5957             case '>':           /* (?>...) */
5958                 break;
5959             case '$':           /* (?$...) */
5960             case '@':           /* (?@...) */
5961                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5962                 break;
5963             case '#':           /* (?#...) */
5964                 while (*RExC_parse && *RExC_parse != ')')
5965                     RExC_parse++;
5966                 if (*RExC_parse != ')')
5967                     FAIL("Sequence (?#... not terminated");
5968                 nextchar(pRExC_state);
5969                 *flagp = TRYAGAIN;
5970                 return NULL;
5971             case '0' :           /* (?0) */
5972             case 'R' :           /* (?R) */
5973                 if (*RExC_parse != ')')
5974                     FAIL("Sequence (?R) not terminated");
5975                 ret = reg_node(pRExC_state, GOSTART);
5976                 *flagp |= POSTPONED;
5977                 nextchar(pRExC_state);
5978                 return ret;
5979                 /*notreached*/
5980             { /* named and numeric backreferences */
5981                 I32 num;
5982             case '&':            /* (?&NAME) */
5983                 parse_start = RExC_parse - 1;
5984               named_recursion:
5985                 {
5986                     SV *sv_dat = reg_scan_name(pRExC_state,
5987                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5988                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5989                 }
5990                 goto gen_recurse_regop;
5991                 /* NOT REACHED */
5992             case '+':
5993                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5994                     RExC_parse++;
5995                     vFAIL("Illegal pattern");
5996                 }
5997                 goto parse_recursion;
5998                 /* NOT REACHED*/
5999             case '-': /* (?-1) */
6000                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6001                     RExC_parse--; /* rewind to let it be handled later */
6002                     goto parse_flags;
6003                 }
6004                 /*FALLTHROUGH */
6005             case '1': case '2': case '3': case '4': /* (?1) */
6006             case '5': case '6': case '7': case '8': case '9':
6007                 RExC_parse--;
6008               parse_recursion:
6009                 num = atoi(RExC_parse);
6010                 parse_start = RExC_parse - 1; /* MJD */
6011                 if (*RExC_parse == '-')
6012                     RExC_parse++;
6013                 while (isDIGIT(*RExC_parse))
6014                         RExC_parse++;
6015                 if (*RExC_parse!=')')
6016                     vFAIL("Expecting close bracket");
6017
6018               gen_recurse_regop:
6019                 if ( paren == '-' ) {
6020                     /*
6021                     Diagram of capture buffer numbering.
6022                     Top line is the normal capture buffer numbers
6023                     Bottom line is the negative indexing as from
6024                     the X (the (?-2))
6025
6026                     +   1 2    3 4 5 X          6 7
6027                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6028                     -   5 4    3 2 1 X          x x
6029
6030                     */
6031                     num = RExC_npar + num;
6032                     if (num < 1)  {
6033                         RExC_parse++;
6034                         vFAIL("Reference to nonexistent group");
6035                     }
6036                 } else if ( paren == '+' ) {
6037                     num = RExC_npar + num - 1;
6038                 }
6039
6040                 ret = reganode(pRExC_state, GOSUB, num);
6041                 if (!SIZE_ONLY) {
6042                     if (num > (I32)RExC_rx->nparens) {
6043                         RExC_parse++;
6044                         vFAIL("Reference to nonexistent group");
6045                     }
6046                     ARG2L_SET( ret, RExC_recurse_count++);
6047                     RExC_emit++;
6048                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6049                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6050                 } else {
6051                     RExC_size++;
6052                 }
6053                 RExC_seen |= REG_SEEN_RECURSE;
6054                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6055                 Set_Node_Offset(ret, parse_start); /* MJD */
6056
6057                 *flagp |= POSTPONED;
6058                 nextchar(pRExC_state);
6059                 return ret;
6060             } /* named and numeric backreferences */
6061             /* NOT REACHED */
6062
6063             case '?':           /* (??...) */
6064                 is_logical = 1;
6065                 if (*RExC_parse != '{') {
6066                     RExC_parse++;
6067                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6068                     /*NOTREACHED*/
6069                 }
6070                 *flagp |= POSTPONED;
6071                 paren = *RExC_parse++;
6072                 /* FALL THROUGH */
6073             case '{':           /* (?{...}) */
6074             {
6075                 I32 count = 1;
6076                 U32 n = 0;
6077                 char c;
6078                 char *s = RExC_parse;
6079
6080                 RExC_seen_zerolen++;
6081                 RExC_seen |= REG_SEEN_EVAL;
6082                 while (count && (c = *RExC_parse)) {
6083                     if (c == '\\') {
6084                         if (RExC_parse[1])
6085                             RExC_parse++;
6086                     }
6087                     else if (c == '{')
6088                         count++;
6089                     else if (c == '}')
6090                         count--;
6091                     RExC_parse++;
6092                 }
6093                 if (*RExC_parse != ')') {
6094                     RExC_parse = s;
6095                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6096                 }
6097                 if (!SIZE_ONLY) {
6098                     PAD *pad;
6099                     OP_4tree *sop, *rop;
6100                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6101
6102                     ENTER;
6103                     Perl_save_re_context(aTHX);
6104                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6105                     sop->op_private |= OPpREFCOUNTED;
6106                     /* re_dup will OpREFCNT_inc */
6107                     OpREFCNT_set(sop, 1);
6108                     LEAVE;
6109
6110                     n = add_data(pRExC_state, 3, "nop");
6111                     RExC_rxi->data->data[n] = (void*)rop;
6112                     RExC_rxi->data->data[n+1] = (void*)sop;
6113                     RExC_rxi->data->data[n+2] = (void*)pad;
6114                     SvREFCNT_dec(sv);
6115                 }
6116                 else {                                          /* First pass */
6117                     if (PL_reginterp_cnt < ++RExC_seen_evals
6118                         && IN_PERL_RUNTIME)
6119                         /* No compiled RE interpolated, has runtime
6120                            components ===> unsafe.  */
6121                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6122                     if (PL_tainting && PL_tainted)
6123                         FAIL("Eval-group in insecure regular expression");
6124 #if PERL_VERSION > 8
6125                     if (IN_PERL_COMPILETIME)
6126                         PL_cv_has_eval = 1;
6127 #endif
6128                 }
6129
6130                 nextchar(pRExC_state);
6131                 if (is_logical) {
6132                     ret = reg_node(pRExC_state, LOGICAL);
6133                     if (!SIZE_ONLY)
6134                         ret->flags = 2;
6135                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6136                     /* deal with the length of this later - MJD */
6137                     return ret;
6138                 }
6139                 ret = reganode(pRExC_state, EVAL, n);
6140                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6141                 Set_Node_Offset(ret, parse_start);
6142                 return ret;
6143             }
6144             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6145             {
6146                 int is_define= 0;
6147                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6148                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6149                         || RExC_parse[1] == '<'
6150                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6151                         I32 flag;
6152
6153                         ret = reg_node(pRExC_state, LOGICAL);
6154                         if (!SIZE_ONLY)
6155                             ret->flags = 1;
6156                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6157                         goto insert_if;
6158                     }
6159                 }
6160                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6161                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6162                 {
6163                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6164                     char *name_start= RExC_parse++;
6165                     U32 num = 0;
6166                     SV *sv_dat=reg_scan_name(pRExC_state,
6167                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6168                     if (RExC_parse == name_start || *RExC_parse != ch)
6169                         vFAIL2("Sequence (?(%c... not terminated",
6170                             (ch == '>' ? '<' : ch));
6171                     RExC_parse++;
6172                     if (!SIZE_ONLY) {
6173                         num = add_data( pRExC_state, 1, "S" );
6174                         RExC_rxi->data->data[num]=(void*)sv_dat;
6175                         SvREFCNT_inc_simple_void(sv_dat);
6176                     }
6177                     ret = reganode(pRExC_state,NGROUPP,num);
6178                     goto insert_if_check_paren;
6179                 }
6180                 else if (RExC_parse[0] == 'D' &&
6181                          RExC_parse[1] == 'E' &&
6182                          RExC_parse[2] == 'F' &&
6183                          RExC_parse[3] == 'I' &&
6184                          RExC_parse[4] == 'N' &&
6185                          RExC_parse[5] == 'E')
6186                 {
6187                     ret = reganode(pRExC_state,DEFINEP,0);
6188                     RExC_parse +=6 ;
6189                     is_define = 1;
6190                     goto insert_if_check_paren;
6191                 }
6192                 else if (RExC_parse[0] == 'R') {
6193                     RExC_parse++;
6194                     parno = 0;
6195                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6196                         parno = atoi(RExC_parse++);
6197                         while (isDIGIT(*RExC_parse))
6198                             RExC_parse++;
6199                     } else if (RExC_parse[0] == '&') {
6200                         SV *sv_dat;
6201                         RExC_parse++;
6202                         sv_dat = reg_scan_name(pRExC_state,
6203                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6204                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6205                     }
6206                     ret = reganode(pRExC_state,INSUBP,parno);
6207                     goto insert_if_check_paren;
6208                 }
6209                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6210                     /* (?(1)...) */
6211                     char c;
6212                     parno = atoi(RExC_parse++);
6213
6214                     while (isDIGIT(*RExC_parse))
6215                         RExC_parse++;
6216                     ret = reganode(pRExC_state, GROUPP, parno);
6217
6218                  insert_if_check_paren:
6219                     if ((c = *nextchar(pRExC_state)) != ')')
6220                         vFAIL("Switch condition not recognized");
6221                   insert_if:
6222                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6223                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6224                     if (br == NULL)
6225                         br = reganode(pRExC_state, LONGJMP, 0);
6226                     else
6227                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6228                     c = *nextchar(pRExC_state);
6229                     if (flags&HASWIDTH)
6230                         *flagp |= HASWIDTH;
6231                     if (c == '|') {
6232                         if (is_define)
6233                             vFAIL("(?(DEFINE)....) does not allow branches");
6234                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6235                         regbranch(pRExC_state, &flags, 1,depth+1);
6236                         REGTAIL(pRExC_state, ret, lastbr);
6237                         if (flags&HASWIDTH)
6238                             *flagp |= HASWIDTH;
6239                         c = *nextchar(pRExC_state);
6240                     }
6241                     else
6242                         lastbr = NULL;
6243                     if (c != ')')
6244                         vFAIL("Switch (?(condition)... contains too many branches");
6245                     ender = reg_node(pRExC_state, TAIL);
6246                     REGTAIL(pRExC_state, br, ender);
6247                     if (lastbr) {
6248                         REGTAIL(pRExC_state, lastbr, ender);
6249                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6250                     }
6251                     else
6252                         REGTAIL(pRExC_state, ret, ender);
6253                     RExC_size++; /* XXX WHY do we need this?!!
6254                                     For large programs it seems to be required
6255                                     but I can't figure out why. -- dmq*/
6256                     return ret;
6257                 }
6258                 else {
6259                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6260                 }
6261             }
6262             case 0:
6263                 RExC_parse--; /* for vFAIL to print correctly */
6264                 vFAIL("Sequence (? incomplete");
6265                 break;
6266             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6267                                        that follow */
6268                 has_use_defaults = TRUE;
6269                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6270                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6271                                        unicode semantics */
6272                     set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
6273                 }
6274                 goto parse_flags;
6275             default:
6276                 --RExC_parse;
6277                 parse_flags:      /* (?i) */
6278             {
6279                 U32 posflags = 0, negflags = 0;
6280                 U32 *flagsp = &posflags;
6281                 bool has_charset_modifier = 0;
6282                 regex_charset cs = REGEX_DEPENDS_CHARSET;
6283
6284                 while (*RExC_parse) {
6285                     /* && strchr("iogcmsx", *RExC_parse) */
6286                     /* (?g), (?gc) and (?o) are useless here
6287                        and must be globally applied -- japhy */
6288                     switch (*RExC_parse) {
6289                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6290                     case LOCALE_PAT_MOD:
6291                         if (has_charset_modifier || flagsp == &negflags) {
6292                             goto fail_modifiers;
6293                         }
6294                         cs = REGEX_LOCALE_CHARSET;
6295                         has_charset_modifier = 1;
6296                         break;
6297                     case UNICODE_PAT_MOD:
6298                         if (has_charset_modifier || flagsp == &negflags) {
6299                             goto fail_modifiers;
6300                         }
6301                         cs = REGEX_UNICODE_CHARSET;
6302                         has_charset_modifier = 1;
6303                         break;
6304                     case DEPENDS_PAT_MOD:
6305                         if (has_use_defaults
6306                             || has_charset_modifier
6307                             || flagsp == &negflags)
6308                         {
6309                             goto fail_modifiers;
6310                         }
6311
6312                         /* The dual charset means unicode semantics if the
6313                          * pattern (or target, not known until runtime) are
6314                          * utf8 */
6315                         cs = (RExC_utf8)
6316                              ? REGEX_UNICODE_CHARSET
6317                              : REGEX_DEPENDS_CHARSET;
6318                         has_charset_modifier = 1;
6319                         break;
6320                     case ONCE_PAT_MOD: /* 'o' */
6321                     case GLOBAL_PAT_MOD: /* 'g' */
6322                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6323                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6324                             if (! (wastedflags & wflagbit) ) {
6325                                 wastedflags |= wflagbit;
6326                                 vWARN5(
6327                                     RExC_parse + 1,
6328                                     "Useless (%s%c) - %suse /%c modifier",
6329                                     flagsp == &negflags ? "?-" : "?",
6330                                     *RExC_parse,
6331                                     flagsp == &negflags ? "don't " : "",
6332                                     *RExC_parse
6333                                 );
6334                             }
6335                         }
6336                         break;
6337
6338                     case CONTINUE_PAT_MOD: /* 'c' */
6339                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6340                             if (! (wastedflags & WASTED_C) ) {
6341                                 wastedflags |= WASTED_GC;
6342                                 vWARN3(
6343                                     RExC_parse + 1,
6344                                     "Useless (%sc) - %suse /gc modifier",
6345                                     flagsp == &negflags ? "?-" : "?",
6346                                     flagsp == &negflags ? "don't " : ""
6347                                 );
6348                             }
6349                         }
6350                         break;
6351                     case KEEPCOPY_PAT_MOD: /* 'p' */
6352                         if (flagsp == &negflags) {
6353                             if (SIZE_ONLY)
6354                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6355                         } else {
6356                             *flagsp |= RXf_PMf_KEEPCOPY;
6357                         }
6358                         break;
6359                     case '-':
6360                         /* A flag is a default iff it is following a minus, so
6361                          * if there is a minus, it means will be trying to
6362                          * re-specify a default which is an error */
6363                         if (has_use_defaults || flagsp == &negflags) {
6364             fail_modifiers:
6365                             RExC_parse++;
6366                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6367                             /*NOTREACHED*/
6368                         }
6369                         flagsp = &negflags;
6370                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6371                         break;
6372                     case ':':
6373                         paren = ':';
6374                         /*FALLTHROUGH*/
6375                     case ')':
6376                         RExC_flags |= posflags;
6377                         RExC_flags &= ~negflags;
6378                         set_regex_charset(&RExC_flags, cs);
6379                         if (paren != ':') {
6380                             oregflags |= posflags;
6381                             oregflags &= ~negflags;
6382                             set_regex_charset(&oregflags, cs);
6383                         }
6384                         nextchar(pRExC_state);
6385                         if (paren != ':') {
6386                             *flagp = TRYAGAIN;
6387                             return NULL;
6388                         } else {
6389                             ret = NULL;
6390                             goto parse_rest;
6391                         }
6392                         /*NOTREACHED*/
6393                     default:
6394                         RExC_parse++;
6395                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6396                         /*NOTREACHED*/
6397                     }
6398                     ++RExC_parse;
6399                 }
6400             }} /* one for the default block, one for the switch */
6401         }
6402         else {                  /* (...) */
6403           capturing_parens:
6404             parno = RExC_npar;
6405             RExC_npar++;
6406
6407             ret = reganode(pRExC_state, OPEN, parno);
6408             if (!SIZE_ONLY ){
6409                 if (!RExC_nestroot)
6410                     RExC_nestroot = parno;
6411                 if (RExC_seen & REG_SEEN_RECURSE
6412                     && !RExC_open_parens[parno-1])
6413                 {
6414                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6415                         "Setting open paren #%"IVdf" to %d\n",
6416                         (IV)parno, REG_NODE_NUM(ret)));
6417                     RExC_open_parens[parno-1]= ret;
6418                 }
6419             }
6420             Set_Node_Length(ret, 1); /* MJD */
6421             Set_Node_Offset(ret, RExC_parse); /* MJD */
6422             is_open = 1;
6423         }
6424     }
6425     else                        /* ! paren */
6426         ret = NULL;
6427
6428    parse_rest:
6429     /* Pick up the branches, linking them together. */
6430     parse_start = RExC_parse;   /* MJD */
6431     br = regbranch(pRExC_state, &flags, 1,depth+1);
6432
6433     if (freeze_paren) {
6434         if (RExC_npar > after_freeze)
6435             after_freeze = RExC_npar;
6436         RExC_npar = freeze_paren;
6437     }
6438
6439     /*     branch_len = (paren != 0); */
6440
6441     if (br == NULL)
6442         return(NULL);
6443     if (*RExC_parse == '|') {
6444         if (!SIZE_ONLY && RExC_extralen) {
6445             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6446         }
6447         else {                  /* MJD */
6448             reginsert(pRExC_state, BRANCH, br, depth+1);
6449             Set_Node_Length(br, paren != 0);
6450             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6451         }
6452         have_branch = 1;
6453         if (SIZE_ONLY)
6454             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6455     }
6456     else if (paren == ':') {
6457         *flagp |= flags&SIMPLE;
6458     }
6459     if (is_open) {                              /* Starts with OPEN. */
6460         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6461     }
6462     else if (paren != '?')              /* Not Conditional */
6463         ret = br;
6464     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6465     lastbr = br;
6466     while (*RExC_parse == '|') {
6467         if (!SIZE_ONLY && RExC_extralen) {
6468             ender = reganode(pRExC_state, LONGJMP,0);
6469             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6470         }
6471         if (SIZE_ONLY)
6472             RExC_extralen += 2;         /* Account for LONGJMP. */
6473         nextchar(pRExC_state);
6474         if (freeze_paren) {
6475             if (RExC_npar > after_freeze)
6476                 after_freeze = RExC_npar;
6477             RExC_npar = freeze_paren;
6478         }
6479         br = regbranch(pRExC_state, &flags, 0, depth+1);
6480
6481         if (br == NULL)
6482             return(NULL);
6483         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6484         lastbr = br;
6485         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6486     }
6487
6488     if (have_branch || paren != ':') {
6489         /* Make a closing node, and hook it on the end. */
6490         switch (paren) {
6491         case ':':
6492             ender = reg_node(pRExC_state, TAIL);
6493             break;
6494         case 1:
6495             ender = reganode(pRExC_state, CLOSE, parno);
6496             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6497                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6498                         "Setting close paren #%"IVdf" to %d\n",
6499                         (IV)parno, REG_NODE_NUM(ender)));
6500                 RExC_close_parens[parno-1]= ender;
6501                 if (RExC_nestroot == parno)
6502                     RExC_nestroot = 0;
6503             }
6504             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6505             Set_Node_Length(ender,1); /* MJD */
6506             break;
6507         case '<':
6508         case ',':
6509         case '=':
6510         case '!':
6511             *flagp &= ~HASWIDTH;
6512             /* FALL THROUGH */
6513         case '>':
6514             ender = reg_node(pRExC_state, SUCCEED);
6515             break;
6516         case 0:
6517             ender = reg_node(pRExC_state, END);
6518             if (!SIZE_ONLY) {
6519                 assert(!RExC_opend); /* there can only be one! */
6520                 RExC_opend = ender;
6521             }
6522             break;
6523         }
6524         REGTAIL(pRExC_state, lastbr, ender);
6525
6526         if (have_branch && !SIZE_ONLY) {
6527             if (depth==1)
6528                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6529
6530             /* Hook the tails of the branches to the closing node. */
6531             for (br = ret; br; br = regnext(br)) {
6532                 const U8 op = PL_regkind[OP(br)];
6533                 if (op == BRANCH) {
6534                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6535                 }
6536                 else if (op == BRANCHJ) {
6537                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6538                 }
6539             }
6540         }
6541     }
6542
6543     {
6544         const char *p;
6545         static const char parens[] = "=!<,>";
6546
6547         if (paren && (p = strchr(parens, paren))) {
6548             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6549             int flag = (p - parens) > 1;
6550
6551             if (paren == '>')
6552                 node = SUSPEND, flag = 0;
6553             reginsert(pRExC_state, node,ret, depth+1);
6554             Set_Node_Cur_Length(ret);
6555             Set_Node_Offset(ret, parse_start + 1);
6556             ret->flags = flag;
6557             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6558         }
6559     }
6560
6561     /* Check for proper termination. */
6562     if (paren) {
6563         RExC_flags = oregflags;
6564         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6565             RExC_parse = oregcomp_parse;
6566             vFAIL("Unmatched (");
6567         }
6568     }
6569     else if (!paren && RExC_parse < RExC_end) {
6570         if (*RExC_parse == ')') {
6571             RExC_parse++;
6572             vFAIL("Unmatched )");
6573         }
6574         else
6575             FAIL("Junk on end of regexp");      /* "Can't happen". */
6576         /* NOTREACHED */
6577     }
6578     if (after_freeze)
6579         RExC_npar = after_freeze;
6580     return(ret);
6581 }
6582
6583 /*
6584  - regbranch - one alternative of an | operator
6585  *
6586  * Implements the concatenation operator.
6587  */
6588 STATIC regnode *
6589 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6590 {
6591     dVAR;
6592     register regnode *ret;
6593     register regnode *chain = NULL;
6594     register regnode *latest;
6595     I32 flags = 0, c = 0;
6596     GET_RE_DEBUG_FLAGS_DECL;
6597
6598     PERL_ARGS_ASSERT_REGBRANCH;
6599
6600     DEBUG_PARSE("brnc");
6601
6602     if (first)
6603         ret = NULL;
6604     else {
6605         if (!SIZE_ONLY && RExC_extralen)
6606             ret = reganode(pRExC_state, BRANCHJ,0);
6607         else {
6608             ret = reg_node(pRExC_state, BRANCH);
6609             Set_Node_Length(ret, 1);
6610         }
6611     }
6612
6613     if (!first && SIZE_ONLY)
6614         RExC_extralen += 1;                     /* BRANCHJ */
6615
6616     *flagp = WORST;                     /* Tentatively. */
6617
6618     RExC_parse--;
6619     nextchar(pRExC_state);
6620     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6621         flags &= ~TRYAGAIN;
6622         latest = regpiece(pRExC_state, &flags,depth+1);
6623         if (latest == NULL) {
6624             if (flags & TRYAGAIN)
6625                 continue;
6626             return(NULL);
6627         }
6628         else if (ret == NULL)
6629             ret = latest;
6630         *flagp |= flags&(HASWIDTH|POSTPONED);
6631         if (chain == NULL)      /* First piece. */
6632             *flagp |= flags&SPSTART;
6633         else {
6634             RExC_naughty++;
6635             REGTAIL(pRExC_state, chain, latest);
6636         }
6637         chain = latest;
6638         c++;
6639     }
6640     if (chain == NULL) {        /* Loop ran zero times. */
6641         chain = reg_node(pRExC_state, NOTHING);
6642         if (ret == NULL)
6643             ret = chain;
6644     }
6645     if (c == 1) {
6646         *flagp |= flags&SIMPLE;
6647     }
6648
6649     return ret;
6650 }
6651
6652 /*
6653  - regpiece - something followed by possible [*+?]
6654  *
6655  * Note that the branching code sequences used for ? and the general cases
6656  * of * and + are somewhat optimized:  they use the same NOTHING node as
6657  * both the endmarker for their branch list and the body of the last branch.
6658  * It might seem that this node could be dispensed with entirely, but the
6659  * endmarker role is not redundant.
6660  */
6661 STATIC regnode *
6662 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6663 {
6664     dVAR;
6665     register regnode *ret;
6666     register char op;
6667     register char *next;
6668     I32 flags;
6669     const char * const origparse = RExC_parse;
6670     I32 min;
6671     I32 max = REG_INFTY;
6672     char *parse_start;
6673     const char *maxpos = NULL;
6674     GET_RE_DEBUG_FLAGS_DECL;
6675
6676     PERL_ARGS_ASSERT_REGPIECE;
6677
6678     DEBUG_PARSE("piec");
6679
6680     ret = regatom(pRExC_state, &flags,depth+1);
6681     if (ret == NULL) {
6682         if (flags & TRYAGAIN)
6683             *flagp |= TRYAGAIN;
6684         return(NULL);
6685     }
6686
6687     op = *RExC_parse;
6688
6689     if (op == '{' && regcurly(RExC_parse)) {
6690         maxpos = NULL;
6691         parse_start = RExC_parse; /* MJD */
6692         next = RExC_parse + 1;
6693         while (isDIGIT(*next) || *next == ',') {
6694             if (*next == ',') {
6695                 if (maxpos)
6696                     break;
6697                 else
6698                     maxpos = next;
6699             }
6700             next++;
6701         }
6702         if (*next == '}') {             /* got one */
6703             if (!maxpos)
6704                 maxpos = next;
6705             RExC_parse++;
6706             min = atoi(RExC_parse);
6707             if (*maxpos == ',')
6708                 maxpos++;
6709             else
6710                 maxpos = RExC_parse;
6711             max = atoi(maxpos);
6712             if (!max && *maxpos != '0')
6713                 max = REG_INFTY;                /* meaning "infinity" */
6714             else if (max >= REG_INFTY)
6715                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6716             RExC_parse = next;
6717             nextchar(pRExC_state);
6718
6719         do_curly:
6720             if ((flags&SIMPLE)) {
6721                 RExC_naughty += 2 + RExC_naughty / 2;
6722                 reginsert(pRExC_state, CURLY, ret, depth+1);
6723                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6724                 Set_Node_Cur_Length(ret);
6725             }
6726             else {
6727                 regnode * const w = reg_node(pRExC_state, WHILEM);
6728
6729                 w->flags = 0;
6730                 REGTAIL(pRExC_state, ret, w);
6731                 if (!SIZE_ONLY && RExC_extralen) {
6732                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6733                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6734                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6735                 }
6736                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6737                                 /* MJD hk */
6738                 Set_Node_Offset(ret, parse_start+1);
6739                 Set_Node_Length(ret,
6740                                 op == '{' ? (RExC_parse - parse_start) : 1);
6741
6742                 if (!SIZE_ONLY && RExC_extralen)
6743                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6744                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6745                 if (SIZE_ONLY)
6746                     RExC_whilem_seen++, RExC_extralen += 3;
6747                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6748             }
6749             ret->flags = 0;
6750
6751             if (min > 0)
6752                 *flagp = WORST;
6753             if (max > 0)
6754                 *flagp |= HASWIDTH;
6755             if (max < min)
6756                 vFAIL("Can't do {n,m} with n > m");
6757             if (!SIZE_ONLY) {
6758                 ARG1_SET(ret, (U16)min);
6759                 ARG2_SET(ret, (U16)max);
6760             }
6761
6762             goto nest_check;
6763         }
6764     }
6765
6766     if (!ISMULT1(op)) {
6767         *flagp = flags;
6768         return(ret);
6769     }
6770
6771 #if 0                           /* Now runtime fix should be reliable. */
6772
6773     /* if this is reinstated, don't forget to put this back into perldiag:
6774
6775             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6776
6777            (F) The part of the regexp subject to either the * or + quantifier
6778            could match an empty string. The {#} shows in the regular
6779            expression about where the problem was discovered.
6780
6781     */
6782
6783     if (!(flags&HASWIDTH) && op != '?')
6784       vFAIL("Regexp *+ operand could be empty");
6785 #endif
6786
6787     parse_start = RExC_parse;
6788     nextchar(pRExC_state);
6789
6790     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6791
6792     if (op == '*' && (flags&SIMPLE)) {
6793         reginsert(pRExC_state, STAR, ret, depth+1);
6794         ret->flags = 0;
6795         RExC_naughty += 4;
6796     }
6797     else if (op == '*') {
6798         min = 0;
6799         goto do_curly;
6800     }
6801     else if (op == '+' && (flags&SIMPLE)) {
6802         reginsert(pRExC_state, PLUS, ret, depth+1);
6803         ret->flags = 0;
6804         RExC_naughty += 3;
6805     }
6806     else if (op == '+') {
6807         min = 1;
6808         goto do_curly;
6809     }
6810     else if (op == '?') {
6811         min = 0; max = 1;
6812         goto do_curly;
6813     }
6814   nest_check:
6815     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6816         ckWARN3reg(RExC_parse,
6817                    "%.*s matches null string many times",
6818                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6819                    origparse);
6820     }
6821
6822     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6823         nextchar(pRExC_state);
6824         reginsert(pRExC_state, MINMOD, ret, depth+1);
6825         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6826     }
6827 #ifndef REG_ALLOW_MINMOD_SUSPEND
6828     else
6829 #endif
6830     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6831         regnode *ender;
6832         nextchar(pRExC_state);
6833         ender = reg_node(pRExC_state, SUCCEED);
6834         REGTAIL(pRExC_state, ret, ender);
6835         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6836         ret->flags = 0;
6837         ender = reg_node(pRExC_state, TAIL);
6838         REGTAIL(pRExC_state, ret, ender);
6839         /*ret= ender;*/
6840     }
6841
6842     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6843         RExC_parse++;
6844         vFAIL("Nested quantifiers");
6845     }
6846
6847     return(ret);
6848 }
6849
6850
6851 /* reg_namedseq(pRExC_state,UVp)
6852
6853    This is expected to be called by a parser routine that has
6854    recognized '\N' and needs to handle the rest. RExC_parse is
6855    expected to point at the first char following the N at the time
6856    of the call.
6857
6858    The \N may be inside (indicated by valuep not being NULL) or outside a
6859    character class.
6860
6861    \N may begin either a named sequence, or if outside a character class, mean
6862    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6863    attempted to decide which, and in the case of a named sequence converted it
6864    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6865    where c1... are the characters in the sequence.  For single-quoted regexes,
6866    the tokenizer passes the \N sequence through unchanged; this code will not
6867    attempt to determine this nor expand those.  The net effect is that if the
6868    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6869    signals that this \N occurrence means to match a non-newline.
6870
6871    Only the \N{U+...} form should occur in a character class, for the same
6872    reason that '.' inside a character class means to just match a period: it
6873    just doesn't make sense.
6874
6875    If valuep is non-null then it is assumed that we are parsing inside
6876    of a charclass definition and the first codepoint in the resolved
6877    string is returned via *valuep and the routine will return NULL.
6878    In this mode if a multichar string is returned from the charnames
6879    handler, a warning will be issued, and only the first char in the
6880    sequence will be examined. If the string returned is zero length
6881    then the value of *valuep is undefined and NON-NULL will
6882    be returned to indicate failure. (This will NOT be a valid pointer
6883    to a regnode.)
6884
6885    If valuep is null then it is assumed that we are parsing normal text and a
6886    new EXACT node is inserted into the program containing the resolved string,
6887    and a pointer to the new node is returned.  But if the string is zero length
6888    a NOTHING node is emitted instead.
6889
6890    On success RExC_parse is set to the char following the endbrace.
6891    Parsing failures will generate a fatal error via vFAIL(...)
6892  */
6893 STATIC regnode *
6894 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6895 {
6896     char * endbrace;    /* '}' following the name */
6897     regnode *ret = NULL;
6898 #ifdef DEBUGGING
6899     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6900 #endif
6901     char* p;
6902
6903     GET_RE_DEBUG_FLAGS_DECL;
6904
6905     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6906
6907     GET_RE_DEBUG_FLAGS;
6908
6909     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6910      * modifier.  The other meaning does not */
6911     p = (RExC_flags & RXf_PMf_EXTENDED)
6912         ? regwhite( pRExC_state, RExC_parse )
6913         : RExC_parse;
6914
6915     /* Disambiguate between \N meaning a named character versus \N meaning
6916      * [^\n].  The former is assumed when it can't be the latter. */
6917     if (*p != '{' || regcurly(p)) {
6918         RExC_parse = p;
6919         if (valuep) {
6920             /* no bare \N in a charclass */
6921             vFAIL("\\N in a character class must be a named character: \\N{...}");
6922         }
6923         nextchar(pRExC_state);
6924         ret = reg_node(pRExC_state, REG_ANY);
6925         *flagp |= HASWIDTH|SIMPLE;
6926         RExC_naughty++;
6927         RExC_parse--;
6928         Set_Node_Length(ret, 1); /* MJD */
6929         return ret;
6930     }
6931
6932     /* Here, we have decided it should be a named sequence */
6933
6934     /* The test above made sure that the next real character is a '{', but
6935      * under the /x modifier, it could be separated by space (or a comment and
6936      * \n) and this is not allowed (for consistency with \x{...} and the
6937      * tokenizer handling of \N{NAME}). */
6938     if (*RExC_parse != '{') {
6939         vFAIL("Missing braces on \\N{}");
6940     }
6941
6942     RExC_parse++;       /* Skip past the '{' */
6943
6944     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6945         || ! (endbrace == RExC_parse            /* nothing between the {} */
6946               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6947                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6948     {
6949         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6950         vFAIL("\\N{NAME} must be resolved by the lexer");
6951     }
6952
6953     if (endbrace == RExC_parse) {   /* empty: \N{} */
6954         if (! valuep) {
6955             RExC_parse = endbrace + 1;
6956             return reg_node(pRExC_state,NOTHING);
6957         }
6958
6959         if (SIZE_ONLY) {
6960             ckWARNreg(RExC_parse,
6961                     "Ignoring zero length \\N{} in character class"
6962             );
6963             RExC_parse = endbrace + 1;
6964         }
6965         *valuep = 0;
6966         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6967     }
6968
6969     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6970     RExC_parse += 2;    /* Skip past the 'U+' */
6971
6972     if (valuep) {   /* In a bracketed char class */
6973         /* We only pay attention to the first char of
6974         multichar strings being returned. I kinda wonder
6975         if this makes sense as it does change the behaviour
6976         from earlier versions, OTOH that behaviour was broken
6977         as well. XXX Solution is to recharacterize as
6978         [rest-of-class]|multi1|multi2... */
6979
6980         STRLEN length_of_hex;
6981         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6982             | PERL_SCAN_DISALLOW_PREFIX
6983             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6984
6985         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6986         if (endchar < endbrace) {
6987             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6988         }
6989
6990         length_of_hex = (STRLEN)(endchar - RExC_parse);
6991         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6992
6993         /* The tokenizer should have guaranteed validity, but it's possible to
6994          * bypass it by using single quoting, so check */
6995         if (length_of_hex == 0
6996             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6997         {
6998             RExC_parse += length_of_hex;        /* Includes all the valid */
6999             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7000                             ? UTF8SKIP(RExC_parse)
7001                             : 1;
7002             /* Guard against malformed utf8 */
7003             if (RExC_parse >= endchar) RExC_parse = endchar;
7004             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7005         }
7006
7007         RExC_parse = endbrace + 1;
7008         if (endchar == endbrace) return NULL;
7009
7010         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7011     }
7012     else {      /* Not a char class */
7013         char *s;            /* String to put in generated EXACT node */
7014         STRLEN len = 0;     /* Its current byte length */
7015         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7016                                stream */
7017
7018         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7019                                                    : (LOC)
7020                                                       ? EXACTFL
7021                                                       : UNI_SEMANTICS
7022                                                         ? EXACTFU
7023                                                         : EXACTF));
7024         s= STRING(ret);
7025
7026         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7027          * the input which is of the form now 'c1.c2.c3...}' until find the
7028          * ending brace or exceed length 255.  The characters that exceed this
7029          * limit are dropped.  The limit could be relaxed should it become
7030          * desirable by reparsing this as (?:\N{NAME}), so could generate
7031          * multiple EXACT nodes, as is done for just regular input.  But this
7032          * is primarily a named character, and not intended to be a huge long
7033          * string, so 255 bytes should be good enough */
7034         while (1) {
7035             STRLEN length_of_hex;
7036             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7037                             | PERL_SCAN_DISALLOW_PREFIX
7038                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7039             UV cp;  /* Ord of current character */
7040
7041             /* Code points are separated by dots.  If none, there is only one
7042              * code point, and is terminated by the brace */
7043             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7044
7045             /* The values are Unicode even on EBCDIC machines */
7046             length_of_hex = (STRLEN)(endchar - RExC_parse);
7047             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7048             if ( length_of_hex == 0
7049                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7050             {
7051                 RExC_parse += length_of_hex;        /* Includes all the valid */
7052                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7053                                 ? UTF8SKIP(RExC_parse)
7054                                 : 1;
7055                 /* Guard against malformed utf8 */
7056                 if (RExC_parse >= endchar) RExC_parse = endchar;
7057                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7058             }
7059
7060             if (! FOLD) {       /* Not folding, just append to the string */
7061                 STRLEN unilen;
7062
7063                 /* Quit before adding this character if would exceed limit */
7064                 if (len + UNISKIP(cp) > U8_MAX) break;
7065
7066                 unilen = reguni(pRExC_state, cp, s);
7067                 if (unilen > 0) {
7068                     s   += unilen;
7069                     len += unilen;
7070                 }
7071             } else {    /* Folding, output the folded equivalent */
7072                 STRLEN foldlen,numlen;
7073                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7074                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7075
7076                 /* Quit before exceeding size limit */
7077                 if (len + foldlen > U8_MAX) break;
7078
7079                 for (foldbuf = tmpbuf;
7080                     foldlen;
7081                     foldlen -= numlen)
7082                 {
7083                     cp = utf8_to_uvchr(foldbuf, &numlen);
7084                     if (numlen > 0) {
7085                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7086                         s       += unilen;
7087                         len     += unilen;
7088                         /* In EBCDIC the numlen and unilen can differ. */
7089                         foldbuf += numlen;
7090                         if (numlen >= foldlen)
7091                             break;
7092                     }
7093                     else
7094                         break; /* "Can't happen." */
7095                 }
7096             }
7097
7098             /* Point to the beginning of the next character in the sequence. */
7099             RExC_parse = endchar + 1;
7100
7101             /* Quit if no more characters */
7102             if (RExC_parse >= endbrace) break;
7103         }
7104
7105
7106         if (SIZE_ONLY) {
7107             if (RExC_parse < endbrace) {
7108                 ckWARNreg(RExC_parse - 1,
7109                           "Using just the first characters returned by \\N{}");
7110             }
7111
7112             RExC_size += STR_SZ(len);
7113         } else {
7114             STR_LEN(ret) = len;
7115             RExC_emit += STR_SZ(len);
7116         }
7117
7118         RExC_parse = endbrace + 1;
7119
7120         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7121                                with malformed in t/re/pat_advanced.t */
7122         RExC_parse --;
7123         Set_Node_Cur_Length(ret); /* MJD */
7124         nextchar(pRExC_state);
7125     }
7126
7127     return ret;
7128 }
7129
7130
7131 /*
7132  * reg_recode
7133  *
7134  * It returns the code point in utf8 for the value in *encp.
7135  *    value: a code value in the source encoding
7136  *    encp:  a pointer to an Encode object
7137  *
7138  * If the result from Encode is not a single character,
7139  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7140  */
7141 STATIC UV
7142 S_reg_recode(pTHX_ const char value, SV **encp)
7143 {
7144     STRLEN numlen = 1;
7145     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7146     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7147     const STRLEN newlen = SvCUR(sv);
7148     UV uv = UNICODE_REPLACEMENT;
7149
7150     PERL_ARGS_ASSERT_REG_RECODE;
7151
7152     if (newlen)
7153         uv = SvUTF8(sv)
7154              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7155              : *(U8*)s;
7156
7157     if (!newlen || numlen != newlen) {
7158         uv = UNICODE_REPLACEMENT;
7159         *encp = NULL;
7160     }
7161     return uv;
7162 }
7163
7164
7165 /*
7166  - regatom - the lowest level
7167
7168    Try to identify anything special at the start of the pattern. If there
7169    is, then handle it as required. This may involve generating a single regop,
7170    such as for an assertion; or it may involve recursing, such as to
7171    handle a () structure.
7172
7173    If the string doesn't start with something special then we gobble up
7174    as much literal text as we can.
7175
7176    Once we have been able to handle whatever type of thing started the
7177    sequence, we return.
7178
7179    Note: we have to be careful with escapes, as they can be both literal
7180    and special, and in the case of \10 and friends can either, depending
7181    on context. Specifically there are two separate switches for handling
7182    escape sequences, with the one for handling literal escapes requiring
7183    a dummy entry for all of the special escapes that are actually handled
7184    by the other.
7185 */
7186
7187 STATIC regnode *
7188 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7189 {
7190     dVAR;
7191     register regnode *ret = NULL;
7192     I32 flags;
7193     char *parse_start = RExC_parse;
7194     GET_RE_DEBUG_FLAGS_DECL;
7195     DEBUG_PARSE("atom");
7196     *flagp = WORST;             /* Tentatively. */
7197
7198     PERL_ARGS_ASSERT_REGATOM;
7199
7200 tryagain:
7201     switch ((U8)*RExC_parse) {
7202     case '^':
7203         RExC_seen_zerolen++;
7204         nextchar(pRExC_state);
7205         if (RExC_flags & RXf_PMf_MULTILINE)
7206             ret = reg_node(pRExC_state, MBOL);
7207         else if (RExC_flags & RXf_PMf_SINGLELINE)
7208             ret = reg_node(pRExC_state, SBOL);
7209         else
7210             ret = reg_node(pRExC_state, BOL);
7211         Set_Node_Length(ret, 1); /* MJD */
7212         break;
7213     case '$':
7214         nextchar(pRExC_state);
7215         if (*RExC_parse)
7216             RExC_seen_zerolen++;
7217         if (RExC_flags & RXf_PMf_MULTILINE)
7218             ret = reg_node(pRExC_state, MEOL);
7219         else if (RExC_flags & RXf_PMf_SINGLELINE)
7220             ret = reg_node(pRExC_state, SEOL);
7221         else
7222             ret = reg_node(pRExC_state, EOL);
7223         Set_Node_Length(ret, 1); /* MJD */
7224         break;
7225     case '.':
7226         nextchar(pRExC_state);
7227         if (RExC_flags & RXf_PMf_SINGLELINE)
7228             ret = reg_node(pRExC_state, SANY);
7229         else
7230             ret = reg_node(pRExC_state, REG_ANY);
7231         *flagp |= HASWIDTH|SIMPLE;
7232         RExC_naughty++;
7233         Set_Node_Length(ret, 1); /* MJD */
7234         break;
7235     case '[':
7236     {
7237         char * const oregcomp_parse = ++RExC_parse;
7238         ret = regclass(pRExC_state,depth+1);
7239         if (*RExC_parse != ']') {
7240             RExC_parse = oregcomp_parse;
7241             vFAIL("Unmatched [");
7242         }
7243         nextchar(pRExC_state);
7244         *flagp |= HASWIDTH|SIMPLE;
7245         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7246         break;
7247     }
7248     case '(':
7249         nextchar(pRExC_state);
7250         ret = reg(pRExC_state, 1, &flags,depth+1);
7251         if (ret == NULL) {
7252                 if (flags & TRYAGAIN) {
7253                     if (RExC_parse == RExC_end) {
7254                          /* Make parent create an empty node if needed. */
7255                         *flagp |= TRYAGAIN;
7256                         return(NULL);
7257                     }
7258                     goto tryagain;
7259                 }
7260                 return(NULL);
7261         }
7262         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7263         break;
7264     case '|':
7265     case ')':
7266         if (flags & TRYAGAIN) {
7267             *flagp |= TRYAGAIN;
7268             return NULL;
7269         }
7270         vFAIL("Internal urp");
7271                                 /* Supposed to be caught earlier. */
7272         break;
7273     case '{':
7274         if (!regcurly(RExC_parse)) {
7275             RExC_parse++;
7276             goto defchar;
7277         }
7278         /* FALL THROUGH */
7279     case '?':
7280     case '+':
7281     case '*':
7282         RExC_parse++;
7283         vFAIL("Quantifier follows nothing");
7284         break;
7285     case LATIN_SMALL_LETTER_SHARP_S:
7286     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7287     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7288 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7289 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7290     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7291 #endif
7292         do_foldchar:
7293         if (!LOC && FOLD) {
7294             U32 len,cp;
7295             len=0; /* silence a spurious compiler warning */
7296             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7297                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7298                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7299                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7300                 Set_Node_Length(ret, 1); /* MJD */
7301                 nextchar(pRExC_state); /* kill whitespace under /x */
7302                 return ret;
7303             }
7304         }
7305         goto outer_default;
7306     case '\\':
7307         /* Special Escapes
7308
7309            This switch handles escape sequences that resolve to some kind
7310            of special regop and not to literal text. Escape sequnces that
7311            resolve to literal text are handled below in the switch marked
7312            "Literal Escapes".
7313
7314            Every entry in this switch *must* have a corresponding entry
7315            in the literal escape switch. However, the opposite is not
7316            required, as the default for this switch is to jump to the
7317            literal text handling code.
7318         */
7319         switch ((U8)*++RExC_parse) {
7320         case LATIN_SMALL_LETTER_SHARP_S:
7321         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7322         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7323                    goto do_foldchar;
7324         /* Special Escapes */
7325         case 'A':
7326             RExC_seen_zerolen++;
7327             ret = reg_node(pRExC_state, SBOL);
7328             *flagp |= SIMPLE;
7329             goto finish_meta_pat;
7330         case 'G':
7331             ret = reg_node(pRExC_state, GPOS);
7332             RExC_seen |= REG_SEEN_GPOS;
7333             *flagp |= SIMPLE;
7334             goto finish_meta_pat;
7335         case 'K':
7336             RExC_seen_zerolen++;
7337             ret = reg_node(pRExC_state, KEEPS);
7338             *flagp |= SIMPLE;
7339             /* XXX:dmq : disabling in-place substitution seems to
7340              * be necessary here to avoid cases of memory corruption, as
7341              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7342              */
7343             RExC_seen |= REG_SEEN_LOOKBEHIND;
7344             goto finish_meta_pat;
7345         case 'Z':
7346             ret = reg_node(pRExC_state, SEOL);
7347             *flagp |= SIMPLE;
7348             RExC_seen_zerolen++;                /* Do not optimize RE away */
7349             goto finish_meta_pat;
7350         case 'z':
7351             ret = reg_node(pRExC_state, EOS);
7352             *flagp |= SIMPLE;
7353             RExC_seen_zerolen++;                /* Do not optimize RE away */
7354             goto finish_meta_pat;
7355         case 'C':
7356             ret = reg_node(pRExC_state, CANY);
7357             RExC_seen |= REG_SEEN_CANY;
7358             *flagp |= HASWIDTH|SIMPLE;
7359             goto finish_meta_pat;
7360         case 'X':
7361             ret = reg_node(pRExC_state, CLUMP);
7362             *flagp |= HASWIDTH;
7363             goto finish_meta_pat;
7364         case 'w':
7365             if (LOC) {
7366                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7367             } else {
7368                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7369             }
7370             FLAGS(ret) = get_regex_charset(RExC_flags);
7371             *flagp |= HASWIDTH|SIMPLE;
7372             goto finish_meta_pat;
7373         case 'W':
7374             if (LOC) {
7375                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7376             } else {
7377                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7378             }
7379             FLAGS(ret) = get_regex_charset(RExC_flags);
7380             *flagp |= HASWIDTH|SIMPLE;
7381             goto finish_meta_pat;
7382         case 'b':
7383             RExC_seen_zerolen++;
7384             RExC_seen |= REG_SEEN_LOOKBEHIND;
7385             if (LOC) {
7386                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7387             } else {
7388                 ret = reg_node(pRExC_state, (U8)(BOUND));
7389             }
7390             FLAGS(ret) = get_regex_charset(RExC_flags);
7391             *flagp |= SIMPLE;
7392             goto finish_meta_pat;
7393         case 'B':
7394             RExC_seen_zerolen++;
7395             RExC_seen |= REG_SEEN_LOOKBEHIND;
7396             if (LOC) {
7397                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7398             } else {
7399                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7400             }
7401             FLAGS(ret) = get_regex_charset(RExC_flags);
7402             *flagp |= SIMPLE;
7403             goto finish_meta_pat;
7404         case 's':
7405             if (LOC) {
7406                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7407             } else {
7408                 ret = reg_node(pRExC_state, (U8)(SPACE));
7409             }
7410             FLAGS(ret) = get_regex_charset(RExC_flags);
7411             *flagp |= HASWIDTH|SIMPLE;
7412             goto finish_meta_pat;
7413         case 'S':
7414             if (LOC) {
7415                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7416             } else {
7417                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7418             }
7419             FLAGS(ret) = get_regex_charset(RExC_flags);
7420             *flagp |= HASWIDTH|SIMPLE;
7421             goto finish_meta_pat;
7422         case 'd':
7423             if (LOC) {
7424                 ret = reg_node(pRExC_state, (U8)(DIGITL));
7425             } else {
7426                 ret = reg_node(pRExC_state, (U8)(DIGIT));
7427             }
7428             *flagp |= HASWIDTH|SIMPLE;
7429             goto finish_meta_pat;
7430         case 'D':
7431             if (LOC) {
7432                 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7433             } else {
7434                 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7435             }
7436             *flagp |= HASWIDTH|SIMPLE;
7437             goto finish_meta_pat;
7438         case 'R':
7439             ret = reg_node(pRExC_state, LNBREAK);
7440             *flagp |= HASWIDTH|SIMPLE;
7441             goto finish_meta_pat;
7442         case 'h':
7443             ret = reg_node(pRExC_state, HORIZWS);
7444             *flagp |= HASWIDTH|SIMPLE;
7445             goto finish_meta_pat;
7446         case 'H':
7447             ret = reg_node(pRExC_state, NHORIZWS);
7448             *flagp |= HASWIDTH|SIMPLE;
7449             goto finish_meta_pat;
7450         case 'v':
7451             ret = reg_node(pRExC_state, VERTWS);
7452             *flagp |= HASWIDTH|SIMPLE;
7453             goto finish_meta_pat;
7454         case 'V':
7455             ret = reg_node(pRExC_state, NVERTWS);
7456             *flagp |= HASWIDTH|SIMPLE;
7457          finish_meta_pat:
7458             nextchar(pRExC_state);
7459             Set_Node_Length(ret, 2); /* MJD */
7460             break;
7461         case 'p':
7462         case 'P':
7463             {
7464                 char* const oldregxend = RExC_end;
7465 #ifdef DEBUGGING
7466                 char* parse_start = RExC_parse - 2;
7467 #endif
7468
7469                 if (RExC_parse[1] == '{') {
7470                   /* a lovely hack--pretend we saw [\pX] instead */
7471                     RExC_end = strchr(RExC_parse, '}');
7472                     if (!RExC_end) {
7473                         const U8 c = (U8)*RExC_parse;
7474                         RExC_parse += 2;
7475                         RExC_end = oldregxend;
7476                         vFAIL2("Missing right brace on \\%c{}", c);
7477                     }
7478                     RExC_end++;
7479                 }
7480                 else {
7481                     RExC_end = RExC_parse + 2;
7482                     if (RExC_end > oldregxend)
7483                         RExC_end = oldregxend;
7484                 }
7485                 RExC_parse--;
7486
7487                 ret = regclass(pRExC_state,depth+1);
7488
7489                 RExC_end = oldregxend;
7490                 RExC_parse--;
7491
7492                 Set_Node_Offset(ret, parse_start + 2);
7493                 Set_Node_Cur_Length(ret);
7494                 nextchar(pRExC_state);
7495                 *flagp |= HASWIDTH|SIMPLE;
7496             }
7497             break;
7498         case 'N':
7499             /* Handle \N and \N{NAME} here and not below because it can be
7500             multicharacter. join_exact() will join them up later on.
7501             Also this makes sure that things like /\N{BLAH}+/ and
7502             \N{BLAH} being multi char Just Happen. dmq*/
7503             ++RExC_parse;
7504             ret= reg_namedseq(pRExC_state, NULL, flagp);
7505             break;
7506         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7507         parse_named_seq:
7508         {
7509             char ch= RExC_parse[1];
7510             if (ch != '<' && ch != '\'' && ch != '{') {
7511                 RExC_parse++;
7512                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7513             } else {
7514                 /* this pretty much dupes the code for (?P=...) in reg(), if
7515                    you change this make sure you change that */
7516                 char* name_start = (RExC_parse += 2);
7517                 U32 num = 0;
7518                 SV *sv_dat = reg_scan_name(pRExC_state,
7519                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7520                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7521                 if (RExC_parse == name_start || *RExC_parse != ch)
7522                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7523
7524                 if (!SIZE_ONLY) {
7525                     num = add_data( pRExC_state, 1, "S" );
7526                     RExC_rxi->data->data[num]=(void*)sv_dat;
7527                     SvREFCNT_inc_simple_void(sv_dat);
7528                 }
7529
7530                 RExC_sawback = 1;
7531                 ret = reganode(pRExC_state,
7532                                ((! FOLD)
7533                                  ? NREF
7534                                  : (UNI_SEMANTICS)
7535                                    ? NREFFU
7536                                    : (LOC)
7537                                      ? NREFFL
7538                                      : NREFF),
7539                                 num);
7540                 *flagp |= HASWIDTH;
7541
7542                 /* override incorrect value set in reganode MJD */
7543                 Set_Node_Offset(ret, parse_start+1);
7544                 Set_Node_Cur_Length(ret); /* MJD */
7545                 nextchar(pRExC_state);
7546
7547             }
7548             break;
7549         }
7550         case 'g':
7551         case '1': case '2': case '3': case '4':
7552         case '5': case '6': case '7': case '8': case '9':
7553             {
7554                 I32 num;
7555                 bool isg = *RExC_parse == 'g';
7556                 bool isrel = 0;
7557                 bool hasbrace = 0;
7558                 if (isg) {
7559                     RExC_parse++;
7560                     if (*RExC_parse == '{') {
7561                         RExC_parse++;
7562                         hasbrace = 1;
7563                     }
7564                     if (*RExC_parse == '-') {
7565                         RExC_parse++;
7566                         isrel = 1;
7567                     }
7568                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7569                         if (isrel) RExC_parse--;
7570                         RExC_parse -= 2;
7571                         goto parse_named_seq;
7572                 }   }
7573                 num = atoi(RExC_parse);
7574                 if (isg && num == 0)
7575                     vFAIL("Reference to invalid group 0");
7576                 if (isrel) {
7577                     num = RExC_npar - num;
7578                     if (num < 1)
7579                         vFAIL("Reference to nonexistent or unclosed group");
7580                 }
7581                 if (!isg && num > 9 && num >= RExC_npar)
7582                     goto defchar;
7583                 else {
7584                     char * const parse_start = RExC_parse - 1; /* MJD */
7585                     while (isDIGIT(*RExC_parse))
7586                         RExC_parse++;
7587                     if (parse_start == RExC_parse - 1)
7588                         vFAIL("Unterminated \\g... pattern");
7589                     if (hasbrace) {
7590                         if (*RExC_parse != '}')
7591                             vFAIL("Unterminated \\g{...} pattern");
7592                         RExC_parse++;
7593                     }
7594                     if (!SIZE_ONLY) {
7595                         if (num > (I32)RExC_rx->nparens)
7596                             vFAIL("Reference to nonexistent group");
7597                     }
7598                     RExC_sawback = 1;
7599                     ret = reganode(pRExC_state,
7600                                    ((! FOLD)
7601                                      ? REF
7602                                      : (UNI_SEMANTICS)
7603                                        ? REFFU
7604                                        : (LOC)
7605                                          ? REFFL
7606                                          : REFF),
7607                                     num);
7608                     *flagp |= HASWIDTH;
7609
7610                     /* override incorrect value set in reganode MJD */
7611                     Set_Node_Offset(ret, parse_start+1);
7612                     Set_Node_Cur_Length(ret); /* MJD */
7613                     RExC_parse--;
7614                     nextchar(pRExC_state);
7615                 }
7616             }
7617             break;
7618         case '\0':
7619             if (RExC_parse >= RExC_end)
7620                 FAIL("Trailing \\");
7621             /* FALL THROUGH */
7622         default:
7623             /* Do not generate "unrecognized" warnings here, we fall
7624                back into the quick-grab loop below */
7625             parse_start--;
7626             goto defchar;
7627         }
7628         break;
7629
7630     case '#':
7631         if (RExC_flags & RXf_PMf_EXTENDED) {
7632             if ( reg_skipcomment( pRExC_state ) )
7633                 goto tryagain;
7634         }
7635         /* FALL THROUGH */
7636
7637     default:
7638         outer_default:{
7639             register STRLEN len;
7640             register UV ender;
7641             register char *p;
7642             char *s;
7643             STRLEN foldlen;
7644             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7645
7646             parse_start = RExC_parse - 1;
7647
7648             RExC_parse++;
7649
7650         defchar:
7651             ender = 0;
7652             ret = reg_node(pRExC_state,
7653                            (U8) ((! FOLD) ? EXACT
7654                                           : (LOC)
7655                                              ? EXACTFL
7656                                              : (UNI_SEMANTICS)
7657                                                ? EXACTFU
7658                                                : EXACTF)
7659                     );
7660             s = STRING(ret);
7661             for (len = 0, p = RExC_parse - 1;
7662               len < 127 && p < RExC_end;
7663               len++)
7664             {
7665                 char * const oldp = p;
7666
7667                 if (RExC_flags & RXf_PMf_EXTENDED)
7668                     p = regwhite( pRExC_state, p );
7669                 switch ((U8)*p) {
7670                 case LATIN_SMALL_LETTER_SHARP_S:
7671                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7672                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7673                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7674                                 goto normal_default;
7675                 case '^':
7676                 case '$':
7677                 case '.':
7678                 case '[':
7679                 case '(':
7680                 case ')':
7681                 case '|':
7682                     goto loopdone;
7683                 case '\\':
7684                     /* Literal Escapes Switch
7685
7686                        This switch is meant to handle escape sequences that
7687                        resolve to a literal character.
7688
7689                        Every escape sequence that represents something
7690                        else, like an assertion or a char class, is handled
7691                        in the switch marked 'Special Escapes' above in this
7692                        routine, but also has an entry here as anything that
7693                        isn't explicitly mentioned here will be treated as
7694                        an unescaped equivalent literal.
7695                     */
7696
7697                     switch ((U8)*++p) {
7698                     /* These are all the special escapes. */
7699                     case LATIN_SMALL_LETTER_SHARP_S:
7700                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7701                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7702                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7703                                 goto normal_default;
7704                     case 'A':             /* Start assertion */
7705                     case 'b': case 'B':   /* Word-boundary assertion*/
7706                     case 'C':             /* Single char !DANGEROUS! */
7707                     case 'd': case 'D':   /* digit class */
7708                     case 'g': case 'G':   /* generic-backref, pos assertion */
7709                     case 'h': case 'H':   /* HORIZWS */
7710                     case 'k': case 'K':   /* named backref, keep marker */
7711                     case 'N':             /* named char sequence */
7712                     case 'p': case 'P':   /* Unicode property */
7713                               case 'R':   /* LNBREAK */
7714                     case 's': case 'S':   /* space class */
7715                     case 'v': case 'V':   /* VERTWS */
7716                     case 'w': case 'W':   /* word class */
7717                     case 'X':             /* eXtended Unicode "combining character sequence" */
7718                     case 'z': case 'Z':   /* End of line/string assertion */
7719                         --p;
7720                         goto loopdone;
7721
7722                     /* Anything after here is an escape that resolves to a
7723                        literal. (Except digits, which may or may not)
7724                      */
7725                     case 'n':
7726                         ender = '\n';
7727                         p++;
7728                         break;
7729                     case 'r':
7730                         ender = '\r';
7731                         p++;
7732                         break;
7733                     case 't':
7734                         ender = '\t';
7735                         p++;
7736                         break;
7737                     case 'f':
7738                         ender = '\f';
7739                         p++;
7740                         break;
7741                     case 'e':
7742                           ender = ASCII_TO_NATIVE('\033');
7743                         p++;
7744                         break;
7745                     case 'a':
7746                           ender = ASCII_TO_NATIVE('\007');
7747                         p++;
7748                         break;
7749                     case 'o':
7750                         {
7751                             STRLEN brace_len = len;
7752                             UV result;
7753                             const char* error_msg;
7754
7755                             bool valid = grok_bslash_o(p,
7756                                                        &result,
7757                                                        &brace_len,
7758                                                        &error_msg,
7759                                                        1);
7760                             p += brace_len;
7761                             if (! valid) {
7762                                 RExC_parse = p; /* going to die anyway; point
7763                                                    to exact spot of failure */
7764                                 vFAIL(error_msg);
7765                             }
7766                             else
7767                             {
7768                                 ender = result;
7769                             }
7770                             if (PL_encoding && ender < 0x100) {
7771                                 goto recode_encoding;
7772                             }
7773                             if (ender > 0xff) {
7774                                 REQUIRE_UTF8;
7775                             }
7776                             break;
7777                         }
7778                     case 'x':
7779                         if (*++p == '{') {
7780                             char* const e = strchr(p, '}');
7781
7782                             if (!e) {
7783                                 RExC_parse = p + 1;
7784                                 vFAIL("Missing right brace on \\x{}");
7785                             }
7786                             else {
7787                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7788                                     | PERL_SCAN_DISALLOW_PREFIX;
7789                                 STRLEN numlen = e - p - 1;
7790                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7791                                 if (ender > 0xff)
7792                                     REQUIRE_UTF8;
7793                                 p = e + 1;
7794                             }
7795                         }
7796                         else {
7797                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7798                             STRLEN numlen = 2;
7799                             ender = grok_hex(p, &numlen, &flags, NULL);
7800                             p += numlen;
7801                         }
7802                         if (PL_encoding && ender < 0x100)
7803                             goto recode_encoding;
7804                         break;
7805                     case 'c':
7806                         p++;
7807                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7808                         break;
7809                     case '0': case '1': case '2': case '3':case '4':
7810                     case '5': case '6': case '7': case '8':case '9':
7811                         if (*p == '0' ||
7812                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7813                         {
7814                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7815                             STRLEN numlen = 3;
7816                             ender = grok_oct(p, &numlen, &flags, NULL);
7817                             if (ender > 0xff) {
7818                                 REQUIRE_UTF8;
7819                             }
7820                             p += numlen;
7821                         }
7822                         else {
7823                             --p;
7824                             goto loopdone;
7825                         }
7826                         if (PL_encoding && ender < 0x100)
7827                             goto recode_encoding;
7828                         break;
7829                     recode_encoding:
7830                         {
7831                             SV* enc = PL_encoding;
7832                             ender = reg_recode((const char)(U8)ender, &enc);
7833                             if (!enc && SIZE_ONLY)
7834                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7835                             REQUIRE_UTF8;
7836                         }
7837                         break;
7838                     case '\0':
7839                         if (p >= RExC_end)
7840                             FAIL("Trailing \\");
7841                         /* FALL THROUGH */
7842                     default:
7843                         if (!SIZE_ONLY&& isALPHA(*p))
7844                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7845                         goto normal_default;
7846                     }
7847                     break;
7848                 default:
7849                   normal_default:
7850                     if (UTF8_IS_START(*p) && UTF) {
7851                         STRLEN numlen;
7852                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7853                                                &numlen, UTF8_ALLOW_DEFAULT);
7854                         p += numlen;
7855                     }
7856                     else
7857                         ender = *p++;
7858                     break;
7859                 }
7860                 if ( RExC_flags & RXf_PMf_EXTENDED)
7861                     p = regwhite( pRExC_state, p );
7862                 if (UTF && FOLD) {
7863                     /* Prime the casefolded buffer. */
7864                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7865                 }
7866                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7867                     if (len)
7868                         p = oldp;
7869                     else if (UTF) {
7870                          if (FOLD) {
7871                               /* Emit all the Unicode characters. */
7872                               STRLEN numlen;
7873                               for (foldbuf = tmpbuf;
7874                                    foldlen;
7875                                    foldlen -= numlen) {
7876                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7877                                    if (numlen > 0) {
7878                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7879                                         s       += unilen;
7880                                         len     += unilen;
7881                                         /* In EBCDIC the numlen
7882                                          * and unilen can differ. */
7883                                         foldbuf += numlen;
7884                                         if (numlen >= foldlen)
7885                                              break;
7886                                    }
7887                                    else
7888                                         break; /* "Can't happen." */
7889                               }
7890                          }
7891                          else {
7892                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7893                               if (unilen > 0) {
7894                                    s   += unilen;
7895                                    len += unilen;
7896                               }
7897                          }
7898                     }
7899                     else {
7900                         len++;
7901                         REGC((char)ender, s++);
7902                     }
7903                     break;
7904                 }
7905                 if (UTF) {
7906                      if (FOLD) {
7907                           /* Emit all the Unicode characters. */
7908                           STRLEN numlen;
7909                           for (foldbuf = tmpbuf;
7910                                foldlen;
7911                                foldlen -= numlen) {
7912                                ender = utf8_to_uvchr(foldbuf, &numlen);
7913                                if (numlen > 0) {
7914                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7915                                     len     += unilen;
7916                                     s       += unilen;
7917                                     /* In EBCDIC the numlen
7918                                      * and unilen can differ. */
7919                                     foldbuf += numlen;
7920                                     if (numlen >= foldlen)
7921                                          break;
7922                                }
7923                                else
7924                                     break;
7925                           }
7926                      }
7927                      else {
7928                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7929                           if (unilen > 0) {
7930                                s   += unilen;
7931                                len += unilen;
7932                           }
7933                      }
7934                      len--;
7935                 }
7936                 else
7937                     REGC((char)ender, s++);
7938             }
7939         loopdone:
7940             RExC_parse = p - 1;
7941             Set_Node_Cur_Length(ret); /* MJD */
7942             nextchar(pRExC_state);
7943             {
7944                 /* len is STRLEN which is unsigned, need to copy to signed */
7945                 IV iv = len;
7946                 if (iv < 0)
7947                     vFAIL("Internal disaster");
7948             }
7949             if (len > 0)
7950                 *flagp |= HASWIDTH;
7951             if (len == 1 && UNI_IS_INVARIANT(ender))
7952                 *flagp |= SIMPLE;
7953
7954             if (SIZE_ONLY)
7955                 RExC_size += STR_SZ(len);
7956             else {
7957                 STR_LEN(ret) = len;
7958                 RExC_emit += STR_SZ(len);
7959             }
7960         }
7961         break;
7962     }
7963
7964     return(ret);
7965 }
7966
7967 STATIC char *
7968 S_regwhite( RExC_state_t *pRExC_state, char *p )
7969 {
7970     const char *e = RExC_end;
7971
7972     PERL_ARGS_ASSERT_REGWHITE;
7973
7974     while (p < e) {
7975         if (isSPACE(*p))
7976             ++p;
7977         else if (*p == '#') {
7978             bool ended = 0;
7979             do {
7980                 if (*p++ == '\n') {
7981                     ended = 1;
7982                     break;
7983                 }
7984             } while (p < e);
7985             if (!ended)
7986                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7987         }
7988         else
7989             break;
7990     }
7991     return p;
7992 }
7993
7994 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7995    Character classes ([:foo:]) can also be negated ([:^foo:]).
7996    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7997    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7998    but trigger failures because they are currently unimplemented. */
7999
8000 #define POSIXCC_DONE(c)   ((c) == ':')
8001 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8002 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8003
8004 STATIC I32
8005 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8006 {
8007     dVAR;
8008     I32 namedclass = OOB_NAMEDCLASS;
8009
8010     PERL_ARGS_ASSERT_REGPPOSIXCC;
8011
8012     if (value == '[' && RExC_parse + 1 < RExC_end &&
8013         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8014         POSIXCC(UCHARAT(RExC_parse))) {
8015         const char c = UCHARAT(RExC_parse);
8016         char* const s = RExC_parse++;
8017
8018         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8019             RExC_parse++;
8020         if (RExC_parse == RExC_end)
8021             /* Grandfather lone [:, [=, [. */
8022             RExC_parse = s;
8023         else {
8024             const char* const t = RExC_parse++; /* skip over the c */
8025             assert(*t == c);
8026
8027             if (UCHARAT(RExC_parse) == ']') {
8028                 const char *posixcc = s + 1;
8029                 RExC_parse++; /* skip over the ending ] */
8030
8031                 if (*s == ':') {
8032                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8033                     const I32 skip = t - posixcc;
8034
8035                     /* Initially switch on the length of the name.  */
8036                     switch (skip) {
8037                     case 4:
8038                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8039                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8040                         break;
8041                     case 5:
8042                         /* Names all of length 5.  */
8043                         /* alnum alpha ascii blank cntrl digit graph lower
8044                            print punct space upper  */
8045                         /* Offset 4 gives the best switch position.  */
8046                         switch (posixcc[4]) {
8047                         case 'a':
8048                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8049                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8050                             break;
8051                         case 'e':
8052                             if (memEQ(posixcc, "spac", 4)) /* space */
8053                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8054                             break;
8055                         case 'h':
8056                             if (memEQ(posixcc, "grap", 4)) /* graph */
8057                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8058                             break;
8059                         case 'i':
8060                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8061                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8062                             break;
8063                         case 'k':
8064                             if (memEQ(posixcc, "blan", 4)) /* blank */
8065                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8066                             break;
8067                         case 'l':
8068                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8069                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8070                             break;
8071                         case 'm':
8072                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8073                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8074                             break;
8075                         case 'r':
8076                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8077                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8078                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8079                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8080                             break;
8081                         case 't':
8082                             if (memEQ(posixcc, "digi", 4)) /* digit */
8083                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8084                             else if (memEQ(posixcc, "prin", 4)) /* print */
8085                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8086                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8087                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8088                             break;
8089                         }
8090                         break;
8091                     case 6:
8092                         if (memEQ(posixcc, "xdigit", 6))
8093                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8094                         break;
8095                     }
8096
8097                     if (namedclass == OOB_NAMEDCLASS)
8098                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8099                                       t - s - 1, s + 1);
8100                     assert (posixcc[skip] == ':');
8101                     assert (posixcc[skip+1] == ']');
8102                 } else if (!SIZE_ONLY) {
8103                     /* [[=foo=]] and [[.foo.]] are still future. */
8104
8105                     /* adjust RExC_parse so the warning shows after
8106                        the class closes */
8107                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8108                         RExC_parse++;
8109                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8110                 }
8111             } else {
8112                 /* Maternal grandfather:
8113                  * "[:" ending in ":" but not in ":]" */
8114                 RExC_parse = s;
8115             }
8116         }
8117     }
8118
8119     return namedclass;
8120 }
8121
8122 STATIC void
8123 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8124 {
8125     dVAR;
8126
8127     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8128
8129     if (POSIXCC(UCHARAT(RExC_parse))) {
8130         const char *s = RExC_parse;
8131         const char  c = *s++;
8132
8133         while (isALNUM(*s))
8134             s++;
8135         if (*s && c == *s && s[1] == ']') {
8136             ckWARN3reg(s+2,
8137                        "POSIX syntax [%c %c] belongs inside character classes",
8138                        c, c);
8139
8140             /* [[=foo=]] and [[.foo.]] are still future. */
8141             if (POSIXCC_NOTYET(c)) {
8142                 /* adjust RExC_parse so the error shows after
8143                    the class closes */
8144                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8145                     NOOP;
8146                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8147             }
8148         }
8149     }
8150 }
8151
8152 /* No locale test, and always Unicode semantics */
8153 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
8154 ANYOF_##NAME:                                                                  \
8155         for (value = 0; value < 256; value++)                                  \
8156             if (TEST)                                                          \
8157             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8158     yesno = '+';                                                               \
8159     what = WORD;                                                               \
8160     break;                                                                     \
8161 case ANYOF_N##NAME:                                                            \
8162         for (value = 0; value < 256; value++)                                  \
8163             if (!TEST)                                                         \
8164             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8165     yesno = '!';                                                               \
8166     what = WORD;                                                               \
8167     break
8168
8169 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8170  * there are two tests passed in, to use depending on that. There aren't any
8171  * cases where the label is different from the name, so no need for that
8172  * parameter */
8173 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)                                       \
8174 ANYOF_##NAME:                                                                  \
8175     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
8176     else if (UNI_SEMANTICS) {                                                  \
8177         for (value = 0; value < 256; value++) {                                \
8178             if (TEST_8) stored +=                                              \
8179                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8180         }                                                                      \
8181     }                                                                          \
8182     else {                                                                     \
8183         for (value = 0; value < 128; value++) {                                \
8184             if (TEST_7) stored +=                                              \
8185                 S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
8186                                    (U8) UNI_TO_NATIVE(value));                 \
8187         }                                                                      \
8188     }                                                                          \
8189     yesno = '+';                                                               \
8190     what = WORD;                                                               \
8191     break;                                                                     \
8192 case ANYOF_N##NAME:                                                            \
8193     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
8194     else if (UNI_SEMANTICS) {                                                  \
8195         for (value = 0; value < 256; value++) {                                \
8196             if (! TEST_8) stored +=                                            \
8197                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8198         }                                                                      \
8199     }                                                                          \
8200     else {                                                                     \
8201         for (value = 0; value < 128; value++) {                                \
8202             if (! TEST_7) stored +=                                            \
8203                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8204         }                                                                      \
8205         /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
8206          * Latin1 code points match the complement of any of the classes.  But \
8207          * in utf8, they have their Unicode semantics, so can't just set them  \
8208          * in the bitmap, or else regexec.c will think they matched when they  \
8209          * shouldn't. */                                                       \
8210         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;              \
8211     }                                                                          \
8212     yesno = '!';                                                               \
8213     what = WORD;                                                               \
8214     break
8215
8216 /*
8217    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8218    so that it is possible to override the option here without having to
8219    rebuild the entire core. as we are required to do if we change regcomp.h
8220    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8221 */
8222 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8223 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8224 #endif
8225
8226 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8227 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8228 #else
8229 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8230 #endif
8231
8232 STATIC U8
8233 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8234 {
8235
8236     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8237      * Locale folding is done at run-time, so this function should not be
8238      * called for nodes that are for locales.
8239      *
8240      * This function simply sets the bit corresponding to the fold of the input
8241      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8242      * 'F' is 'f'.
8243      *
8244      * It also sets any necessary flags, and returns the number of bits that
8245      * actually changed from 0 to 1 */
8246
8247     U8 stored = 0;
8248     U8 fold;
8249
8250     fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8251                            : PL_fold[value];
8252
8253     /* It assumes the bit for 'value' has already been set */
8254     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8255         ANYOF_BITMAP_SET(node, fold);
8256         stored++;
8257     }
8258     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8259         || (! UNI_SEMANTICS
8260             && ! isASCII(value)
8261             && PL_fold_latin1[value] != value))
8262     {   /* A character that has a fold outside of Latin1 matches outside the
8263            bitmap, but only when the target string is utf8.  Similarly when we
8264            don't have unicode semantics for the above ASCII Latin-1 characters,
8265            and they have a fold, they should match if the target is utf8, and
8266            not otherwise */
8267         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8268     }
8269
8270     return stored;
8271 }
8272
8273
8274 PERL_STATIC_INLINE U8
8275 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8276 {
8277     /* This inline function sets a bit in the bitmap if not already set, and if
8278      * appropriate, its fold, returning the number of bits that actually
8279      * changed from 0 to 1 */
8280
8281     U8 stored;
8282
8283     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8284         return 0;
8285     }
8286
8287     ANYOF_BITMAP_SET(node, value);
8288     stored = 1;
8289
8290     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8291         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8292     }
8293
8294     return stored;
8295 }
8296
8297 /*
8298    parse a class specification and produce either an ANYOF node that
8299    matches the pattern or if the pattern matches a single char only and
8300    that char is < 256 and we are case insensitive then we produce an
8301    EXACT node instead.
8302 */
8303
8304 STATIC regnode *
8305 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8306 {
8307     dVAR;
8308     register UV nextvalue;
8309     register IV prevvalue = OOB_UNICODE;
8310     register IV range = 0;
8311     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8312     register regnode *ret;
8313     STRLEN numlen;
8314     IV namedclass;
8315     char *rangebegin = NULL;
8316     bool need_class = 0;
8317     SV *listsv = NULL;
8318     UV n;
8319     AV* unicode_alternate  = NULL;
8320 #ifdef EBCDIC
8321     UV literal_endpoint = 0;
8322 #endif
8323     UV stored = 0;  /* how many chars stored in the bitmap */
8324
8325     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8326         case we need to change the emitted regop to an EXACT. */
8327     const char * orig_parse = RExC_parse;
8328     GET_RE_DEBUG_FLAGS_DECL;
8329
8330     PERL_ARGS_ASSERT_REGCLASS;
8331 #ifndef DEBUGGING
8332     PERL_UNUSED_ARG(depth);
8333 #endif
8334
8335     DEBUG_PARSE("clas");
8336
8337     /* Assume we are going to generate an ANYOF node. */
8338     ret = reganode(pRExC_state, ANYOF, 0);
8339
8340     if (!SIZE_ONLY)
8341         ANYOF_FLAGS(ret) = 0;
8342
8343     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8344         RExC_naughty++;
8345         RExC_parse++;
8346         if (!SIZE_ONLY)
8347             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8348     }
8349
8350     if (SIZE_ONLY) {
8351         RExC_size += ANYOF_SKIP;
8352 #ifdef ANYOF_ADD_LOC_SKIP
8353         if (LOC) {
8354             RExC_size += ANYOF_ADD_LOC_SKIP;
8355         }
8356 #endif
8357         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8358     }
8359     else {
8360         RExC_emit += ANYOF_SKIP;
8361         if (LOC) {
8362             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8363 #ifdef ANYOF_ADD_LOC_SKIP
8364             RExC_emit += ANYOF_ADD_LOC_SKIP;
8365 #endif
8366         }
8367         ANYOF_BITMAP_ZERO(ret);
8368         listsv = newSVpvs("# comment\n");
8369     }
8370
8371     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8372
8373     if (!SIZE_ONLY && POSIXCC(nextvalue))
8374         checkposixcc(pRExC_state);
8375
8376     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8377     if (UCHARAT(RExC_parse) == ']')
8378         goto charclassloop;
8379
8380 parseit:
8381     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8382
8383     charclassloop:
8384
8385         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8386
8387         if (!range)
8388             rangebegin = RExC_parse;
8389         if (UTF) {
8390             value = utf8n_to_uvchr((U8*)RExC_parse,
8391                                    RExC_end - RExC_parse,
8392                                    &numlen, UTF8_ALLOW_DEFAULT);
8393             RExC_parse += numlen;
8394         }
8395         else
8396             value = UCHARAT(RExC_parse++);
8397
8398         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8399         if (value == '[' && POSIXCC(nextvalue))
8400             namedclass = regpposixcc(pRExC_state, value);
8401         else if (value == '\\') {
8402             if (UTF) {
8403                 value = utf8n_to_uvchr((U8*)RExC_parse,
8404                                    RExC_end - RExC_parse,
8405                                    &numlen, UTF8_ALLOW_DEFAULT);
8406                 RExC_parse += numlen;
8407             }
8408             else
8409                 value = UCHARAT(RExC_parse++);
8410             /* Some compilers cannot handle switching on 64-bit integer
8411              * values, therefore value cannot be an UV.  Yes, this will
8412              * be a problem later if we want switch on Unicode.
8413              * A similar issue a little bit later when switching on
8414              * namedclass. --jhi */
8415             switch ((I32)value) {
8416             case 'w':   namedclass = ANYOF_ALNUM;       break;
8417             case 'W':   namedclass = ANYOF_NALNUM;      break;
8418             case 's':   namedclass = ANYOF_SPACE;       break;
8419             case 'S':   namedclass = ANYOF_NSPACE;      break;
8420             case 'd':   namedclass = ANYOF_DIGIT;       break;
8421             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8422             case 'v':   namedclass = ANYOF_VERTWS;      break;
8423             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8424             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8425             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8426             case 'N':  /* Handle \N{NAME} in class */
8427                 {
8428                     /* We only pay attention to the first char of
8429                     multichar strings being returned. I kinda wonder
8430                     if this makes sense as it does change the behaviour
8431                     from earlier versions, OTOH that behaviour was broken
8432                     as well. */
8433                     UV v; /* value is register so we cant & it /grrr */
8434                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8435                         goto parseit;
8436                     }
8437                     value= v;
8438                 }
8439                 break;
8440             case 'p':
8441             case 'P':
8442                 {
8443                 char *e;
8444                 if (RExC_parse >= RExC_end)
8445                     vFAIL2("Empty \\%c{}", (U8)value);
8446                 if (*RExC_parse == '{') {
8447                     const U8 c = (U8)value;
8448                     e = strchr(RExC_parse++, '}');
8449                     if (!e)
8450                         vFAIL2("Missing right brace on \\%c{}", c);
8451                     while (isSPACE(UCHARAT(RExC_parse)))
8452                         RExC_parse++;
8453                     if (e == RExC_parse)
8454                         vFAIL2("Empty \\%c{}", c);
8455                     n = e - RExC_parse;
8456                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8457                         n--;
8458                 }
8459                 else {
8460                     e = RExC_parse;
8461                     n = 1;
8462                 }
8463                 if (!SIZE_ONLY) {
8464                     if (UCHARAT(RExC_parse) == '^') {
8465                          RExC_parse++;
8466                          n--;
8467                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8468                          while (isSPACE(UCHARAT(RExC_parse))) {
8469                               RExC_parse++;
8470                               n--;
8471                          }
8472                     }
8473                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8474                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8475                 }
8476                 RExC_parse = e + 1;
8477
8478                 /* The \p could match something in the Latin1 range, hence
8479                  * something that isn't utf8 */
8480                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8481                 if (FOLD) { /* And one of these could have a multi-char fold */
8482                     OP(ret) = ANYOFV;
8483                 }
8484                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8485                 }
8486                 break;
8487             case 'n':   value = '\n';                   break;
8488             case 'r':   value = '\r';                   break;
8489             case 't':   value = '\t';                   break;
8490             case 'f':   value = '\f';                   break;
8491             case 'b':   value = '\b';                   break;
8492             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8493             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8494             case 'o':
8495                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8496                 {
8497                     const char* error_msg;
8498                     bool valid = grok_bslash_o(RExC_parse,
8499                                                &value,
8500                                                &numlen,
8501                                                &error_msg,
8502                                                SIZE_ONLY);
8503                     RExC_parse += numlen;
8504                     if (! valid) {
8505                         vFAIL(error_msg);
8506                     }
8507                 }
8508                 if (PL_encoding && value < 0x100) {
8509                     goto recode_encoding;
8510                 }
8511                 break;
8512             case 'x':
8513                 if (*RExC_parse == '{') {
8514                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8515                         | PERL_SCAN_DISALLOW_PREFIX;
8516                     char * const e = strchr(RExC_parse++, '}');
8517                     if (!e)
8518                         vFAIL("Missing right brace on \\x{}");
8519
8520                     numlen = e - RExC_parse;
8521                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8522                     RExC_parse = e + 1;
8523                 }
8524                 else {
8525                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8526                     numlen = 2;
8527                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8528                     RExC_parse += numlen;
8529                 }
8530                 if (PL_encoding && value < 0x100)
8531                     goto recode_encoding;
8532                 break;
8533             case 'c':
8534                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8535                 break;
8536             case '0': case '1': case '2': case '3': case '4':
8537             case '5': case '6': case '7':
8538                 {
8539                     /* Take 1-3 octal digits */
8540                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8541                     numlen = 3;
8542                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8543                     RExC_parse += numlen;
8544                     if (PL_encoding && value < 0x100)
8545                         goto recode_encoding;
8546                     break;
8547                 }
8548             recode_encoding:
8549                 {
8550                     SV* enc = PL_encoding;
8551                     value = reg_recode((const char)(U8)value, &enc);
8552                     if (!enc && SIZE_ONLY)
8553                         ckWARNreg(RExC_parse,
8554                                   "Invalid escape in the specified encoding");
8555                     break;
8556                 }
8557             default:
8558                 /* Allow \_ to not give an error */
8559                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8560                     ckWARN2reg(RExC_parse,
8561                                "Unrecognized escape \\%c in character class passed through",
8562                                (int)value);
8563                 }
8564                 break;
8565             }
8566         } /* end of \blah */
8567 #ifdef EBCDIC
8568         else
8569             literal_endpoint++;
8570 #endif
8571
8572         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8573
8574             /* What matches in a locale is not known until runtime, so need to
8575              * (one time per class) allocate extra space to pass to regexec.
8576              * The space will contain a bit for each named class that is to be
8577              * matched against.  This isn't needed for \p{} and pseudo-classes,
8578              * as they are not affected by locale, and hence are dealt with
8579              * separately */
8580             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8581                 need_class = 1;
8582                 if (SIZE_ONLY) {
8583 #ifdef ANYOF_CLASS_ADD_SKIP
8584                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8585 #endif
8586                 }
8587                 else {
8588 #ifdef ANYOF_CLASS_ADD_SKIP
8589                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8590 #endif
8591                     ANYOF_CLASS_ZERO(ret);
8592                 }
8593                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8594             }
8595
8596             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
8597              * literal */
8598             if (range) {
8599                 if (!SIZE_ONLY) {
8600                     const int w =
8601                         RExC_parse >= rangebegin ?
8602                         RExC_parse - rangebegin : 0;
8603                     ckWARN4reg(RExC_parse,
8604                                "False [] range \"%*.*s\"",
8605                                w, w, rangebegin);
8606
8607                     if (prevvalue < 256) {
8608                         stored +=
8609                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8610                         stored +=
8611                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8612                     }
8613                     else {
8614                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8615                         Perl_sv_catpvf(aTHX_ listsv,
8616                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8617                     }
8618                 }
8619
8620                 range = 0; /* this was not a true range */
8621             }
8622
8623
8624
8625             if (!SIZE_ONLY) {
8626                 const char *what = NULL;
8627                 char yesno = 0;
8628
8629                 /* Possible truncation here but in some 64-bit environments
8630                  * the compiler gets heartburn about switch on 64-bit values.
8631                  * A similar issue a little earlier when switching on value.
8632                  * --jhi */
8633                 switch ((I32)namedclass) {
8634
8635                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8636                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8637                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8638                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8639                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8640                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8641                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8642                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8643                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8644                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8645 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8646                 /* \s, \w match all unicode if utf8. */
8647                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8648                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8649 #else
8650                 /* \s, \w match ascii and locale only */
8651                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8652                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8653 #endif
8654                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8655                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8656                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8657                 case ANYOF_ASCII:
8658                     if (LOC)
8659                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8660                     else {
8661                         for (value = 0; value < 128; value++)
8662                             stored +=
8663                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8664                     }
8665                     yesno = '+';
8666                     what = NULL;        /* Doesn't match outside ascii, so
8667                                            don't want to add +utf8:: */
8668                     break;
8669                 case ANYOF_NASCII:
8670                     if (LOC)
8671                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8672                     else {
8673                         for (value = 128; value < 256; value++)
8674                             stored +=
8675                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8676                     }
8677                     yesno = '!';
8678                     what = "ASCII";
8679                     break;
8680                 case ANYOF_DIGIT:
8681                     if (LOC)
8682                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8683                     else {
8684                         /* consecutive digits assumed */
8685                         for (value = '0'; value <= '9'; value++)
8686                             stored +=
8687                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8688                     }
8689                     yesno = '+';
8690                     what = POSIX_CC_UNI_NAME("Digit");
8691                     break;
8692                 case ANYOF_NDIGIT:
8693                     if (LOC)
8694                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8695                     else {
8696                         /* consecutive digits assumed */
8697                         for (value = 0; value < '0'; value++)
8698                             stored +=
8699                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8700                         for (value = '9' + 1; value < 256; value++)
8701                             stored +=
8702                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8703                     }
8704                     yesno = '!';
8705                     what = POSIX_CC_UNI_NAME("Digit");
8706                     break;
8707                 case ANYOF_MAX:
8708                     /* this is to handle \p and \P */
8709                     break;
8710                 default:
8711                     vFAIL("Invalid [::] class");
8712                     break;
8713                 }
8714                 if (what) {
8715                     /* Strings such as "+utf8::isWord\n" */
8716                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8717                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8718                 }
8719
8720                 continue;
8721             }
8722         } /* end of namedclass \blah */
8723
8724         if (range) {
8725             if (prevvalue > (IV)value) /* b-a */ {
8726                 const int w = RExC_parse - rangebegin;
8727                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8728                 range = 0; /* not a valid range */
8729             }
8730         }
8731         else {
8732             prevvalue = value; /* save the beginning of the range */
8733             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8734                 RExC_parse[1] != ']') {
8735                 RExC_parse++;
8736
8737                 /* a bad range like \w-, [:word:]- ? */
8738                 if (namedclass > OOB_NAMEDCLASS) {
8739                     if (ckWARN(WARN_REGEXP)) {
8740                         const int w =
8741                             RExC_parse >= rangebegin ?
8742                             RExC_parse - rangebegin : 0;
8743                         vWARN4(RExC_parse,
8744                                "False [] range \"%*.*s\"",
8745                                w, w, rangebegin);
8746                     }
8747                     if (!SIZE_ONLY)
8748                         stored +=
8749                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8750                 } else
8751                     range = 1;  /* yeah, it's a range! */
8752                 continue;       /* but do it the next time */
8753             }
8754         }
8755
8756         /* now is the next time */
8757         if (!SIZE_ONLY) {
8758             if (prevvalue < 256) {
8759                 const IV ceilvalue = value < 256 ? value : 255;
8760                 IV i;
8761 #ifdef EBCDIC
8762                 /* In EBCDIC [\x89-\x91] should include
8763                  * the \x8e but [i-j] should not. */
8764                 if (literal_endpoint == 2 &&
8765                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8766                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8767                 {
8768                     if (isLOWER(prevvalue)) {
8769                         for (i = prevvalue; i <= ceilvalue; i++)
8770                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8771                                 stored +=
8772                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8773                             }
8774                     } else {
8775                         for (i = prevvalue; i <= ceilvalue; i++)
8776                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8777                                 stored +=
8778                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8779                             }
8780                     }
8781                 }
8782                 else
8783 #endif
8784                       for (i = prevvalue; i <= ceilvalue; i++) {
8785                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8786                       }
8787           }
8788           if (value > 255 || UTF) {
8789                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8790                 const UV natvalue      = NATIVE_TO_UNI(value);
8791
8792                 /* If the code point requires utf8 to represent, and we are not
8793                  * folding, it can't match unless the target is in utf8.  Only
8794                  * a few code points above 255 fold to below it, so XXX an
8795                  * optimization would be to know which ones and set the flag
8796                  * appropriately. */
8797                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8798                                     ? ANYOF_NONBITMAP
8799                                     : ANYOF_UTF8;
8800                 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
8801
8802                     /* The \t sets the whole range */
8803                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8804                                    prevnatvalue, natvalue);
8805
8806                     /* Currently, we don't look at every value in the range.
8807                      * Therefore we have to assume the worst case: that if
8808                      * folding, it will match more than one character */
8809                     if (FOLD) {
8810                       OP(ret) = ANYOFV;
8811                     }
8812                 }
8813                 else if (prevnatvalue == natvalue) {
8814                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8815                     if (FOLD) {
8816                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8817                          STRLEN foldlen;
8818                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8819
8820 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8821                          if (RExC_precomp[0] == ':' &&
8822                              RExC_precomp[1] == '[' &&
8823                              (f == 0xDF || f == 0x92)) {
8824                              f = NATIVE_TO_UNI(f);
8825                         }
8826 #endif
8827                          /* If folding and foldable and a single
8828                           * character, insert also the folded version
8829                           * to the charclass. */
8830                          if (f != value) {
8831 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8832                              if ((RExC_precomp[0] == ':' &&
8833                                   RExC_precomp[1] == '[' &&
8834                                   (f == 0xA2 &&
8835                                    (value == 0xFB05 || value == 0xFB06))) ?
8836                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8837                                  foldlen == (STRLEN)UNISKIP(f) )
8838 #else
8839                               if (foldlen == (STRLEN)UNISKIP(f))
8840 #endif
8841                                   Perl_sv_catpvf(aTHX_ listsv,
8842                                                  "%04"UVxf"\n", f);
8843                               else {
8844                                   /* Any multicharacter foldings
8845                                    * require the following transform:
8846                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8847                                    * where E folds into "pq" and F folds
8848                                    * into "rst", all other characters
8849                                    * fold to single characters.  We save
8850                                    * away these multicharacter foldings,
8851                                    * to be later saved as part of the
8852                                    * additional "s" data. */
8853                                   SV *sv;
8854
8855                                   if (!unicode_alternate)
8856                                       unicode_alternate = newAV();
8857                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8858                                                      TRUE);
8859                                   av_push(unicode_alternate, sv);
8860                                   OP(ret) = ANYOFV;
8861                               }
8862                          }
8863
8864                          /* If folding and the value is one of the Greek
8865                           * sigmas insert a few more sigmas to make the
8866                           * folding rules of the sigmas to work right.
8867                           * Note that not all the possible combinations
8868                           * are handled here: some of them are handled
8869                           * by the standard folding rules, and some of
8870                           * them (literal or EXACTF cases) are handled
8871                           * during runtime in regexec.c:S_find_byclass(). */
8872                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8873                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8874                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8875                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8876                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8877                          }
8878                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8879                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8880                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8881                     }
8882                 }
8883             }
8884 #ifdef EBCDIC
8885             literal_endpoint = 0;
8886 #endif
8887         }
8888
8889         range = 0; /* this range (if it was one) is done now */
8890     }
8891
8892
8893
8894     if (SIZE_ONLY)
8895         return ret;
8896     /****** !SIZE_ONLY AFTER HERE *********/
8897
8898     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
8899      * set the FOLD flag yet, so this this does optimize those.  It doesn't
8900      * optimize locale.  Doing so perhaps could be done as long as there is
8901      * nothing like \w in it; some thought also would have to be given to the
8902      * interaction with above 0x100 chars */
8903     if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8904         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8905             ANYOF_BITMAP(ret)[value] ^= 0xFF;
8906         stored = 256 - stored;
8907
8908         /* The inversion means that everything above 255 is matched; and at the
8909          * same time we clear the invert flag */
8910         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8911     }
8912
8913     if (FOLD) {
8914         SV *sv;
8915
8916         /* This is the one character in the bitmap that needs special handling
8917          * under non-locale folding, as it folds to two characters 'ss'.  This
8918          * happens if it is set and not inverting, or isn't set and are
8919          * inverting */
8920         if (! LOC
8921             && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
8922                 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
8923         {
8924             OP(ret) = ANYOFV;   /* Can match more than a single char */
8925
8926             /* Under Unicode semantics), it can do this when the target string
8927              * isn't in utf8 */
8928             if (UNI_SEMANTICS) {
8929                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
8930             }
8931
8932             if (!unicode_alternate) {
8933                 unicode_alternate = newAV();
8934             }
8935             sv = newSVpvn_utf8("ss", 2, TRUE);
8936             av_push(unicode_alternate, sv);
8937         }
8938
8939         /* Folding in the bitmap is taken care of above, but not for locale
8940          * (for which we have to wait to see what folding is in effect at
8941          * runtime), and for things not in the bitmap.  Set run-time fold flag
8942          * for these */
8943         if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
8944             ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
8945         }
8946     }
8947
8948     /* A single character class can be "optimized" into an EXACTish node.
8949      * Note that since we don't currently count how many characters there are
8950      * outside the bitmap, we are XXX missing optimization possibilities for
8951      * them.  This optimization can't happen unless this is a truly single
8952      * character class, which means that it can't be an inversion into a
8953      * many-character class, and there must be no possibility of there being
8954      * things outside the bitmap.  'stored' (only) for locales doesn't include
8955      * \w, etc, so have to make a special test that they aren't present
8956      *
8957      * Similarly A 2-character class of the very special form like [bB] can be
8958      * optimized into an EXACTFish node, but only for non-locales, and for
8959      * characters which only have the two folds; so things like 'fF' and 'Ii'
8960      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
8961      * FI'. */
8962     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8963         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8964                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
8965             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8966                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
8967                                  /* If the latest code point has a fold whose
8968                                   * bit is set, it must be the only other one */
8969                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
8970                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
8971     {
8972         /* Note that the information needed to decide to do this optimization
8973          * is not currently available until the 2nd pass, and that the actually
8974          * used EXACTish node takes less space than the calculated ANYOF node,
8975          * and hence the amount of space calculated in the first pass is larger
8976          * than actually used, so this optimization doesn't gain us any space.
8977          * But an EXACT node is faster than an ANYOF node, and can be combined
8978          * with any adjacent EXACT nodes later by the optimizer for further
8979          * gains.  The speed of executing an EXACTF is similar to an ANYOF
8980          * node, so the optimization advantage comes from the ability to join
8981          * it to adjacent EXACT nodes */
8982
8983         const char * cur_parse= RExC_parse;
8984         U8 op;
8985         RExC_emit = (regnode *)orig_emit;
8986         RExC_parse = (char *)orig_parse;
8987
8988         if (stored == 1) {
8989
8990             /* A locale node with one point can be folded; all the other cases
8991              * with folding will have two points, since we calculate them above
8992              */
8993             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
8994                  op = EXACTFL;
8995             }
8996             else {
8997                 op = EXACT;
8998             }
8999         }   /* else 2 chars in the bit map: the folds of each other */
9000         else if (UNI_SEMANTICS || !isASCII(value)) {
9001
9002             /* To join adjacent nodes, they must be the exact EXACTish type.
9003              * Try to use the most likely type, by using EXACTFU if the regex
9004              * calls for them, or is required because the character is
9005              * non-ASCII */
9006             op = EXACTFU;
9007         }
9008         else {    /* Otherwise, more likely to be EXACTF type */
9009             op = EXACTF;
9010         }
9011
9012         ret = reg_node(pRExC_state, op);
9013         RExC_parse = (char *)cur_parse;
9014         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9015             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9016             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9017             STR_LEN(ret)= 2;
9018             RExC_emit += STR_SZ(2);
9019         }
9020         else {
9021             *STRING(ret)= (char)value;
9022             STR_LEN(ret)= 1;
9023             RExC_emit += STR_SZ(1);
9024         }
9025         SvREFCNT_dec(listsv);
9026         return ret;
9027     }
9028
9029     {
9030         AV * const av = newAV();
9031         SV *rv;
9032         /* The 0th element stores the character class description
9033          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9034          * to initialize the appropriate swash (which gets stored in
9035          * the 1st element), and also useful for dumping the regnode.
9036          * The 2nd element stores the multicharacter foldings,
9037          * used later (regexec.c:S_reginclass()). */
9038         av_store(av, 0, listsv);
9039         av_store(av, 1, NULL);
9040         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9041         rv = newRV_noinc(MUTABLE_SV(av));
9042         n = add_data(pRExC_state, 1, "s");
9043         RExC_rxi->data->data[n] = (void*)rv;
9044         ARG_SET(ret, n);
9045     }
9046     return ret;
9047 }
9048 #undef _C_C_T_
9049
9050
9051 /* reg_skipcomment()
9052
9053    Absorbs an /x style # comments from the input stream.
9054    Returns true if there is more text remaining in the stream.
9055    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9056    terminates the pattern without including a newline.
9057
9058    Note its the callers responsibility to ensure that we are
9059    actually in /x mode
9060
9061 */
9062
9063 STATIC bool
9064 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9065 {
9066     bool ended = 0;
9067
9068     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9069
9070     while (RExC_parse < RExC_end)
9071         if (*RExC_parse++ == '\n') {
9072             ended = 1;
9073             break;
9074         }
9075     if (!ended) {
9076         /* we ran off the end of the pattern without ending
9077            the comment, so we have to add an \n when wrapping */
9078         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9079         return 0;
9080     } else
9081         return 1;
9082 }
9083
9084 /* nextchar()
9085
9086    Advances the parse position, and optionally absorbs
9087    "whitespace" from the inputstream.
9088
9089    Without /x "whitespace" means (?#...) style comments only,
9090    with /x this means (?#...) and # comments and whitespace proper.
9091
9092    Returns the RExC_parse point from BEFORE the scan occurs.
9093
9094    This is the /x friendly way of saying RExC_parse++.
9095 */
9096
9097 STATIC char*
9098 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9099 {
9100     char* const retval = RExC_parse++;
9101
9102     PERL_ARGS_ASSERT_NEXTCHAR;
9103
9104     for (;;) {
9105         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9106                 RExC_parse[2] == '#') {
9107             while (*RExC_parse != ')') {
9108                 if (RExC_parse == RExC_end)
9109                     FAIL("Sequence (?#... not terminated");
9110                 RExC_parse++;
9111             }
9112             RExC_parse++;
9113             continue;
9114         }
9115         if (RExC_flags & RXf_PMf_EXTENDED) {
9116             if (isSPACE(*RExC_parse)) {
9117                 RExC_parse++;
9118                 continue;
9119             }
9120             else if (*RExC_parse == '#') {
9121                 if ( reg_skipcomment( pRExC_state ) )
9122                     continue;
9123             }
9124         }
9125         return retval;
9126     }
9127 }
9128
9129 /*
9130 - reg_node - emit a node
9131 */
9132 STATIC regnode *                        /* Location. */
9133 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9134 {
9135     dVAR;
9136     register regnode *ptr;
9137     regnode * const ret = RExC_emit;
9138     GET_RE_DEBUG_FLAGS_DECL;
9139
9140     PERL_ARGS_ASSERT_REG_NODE;
9141
9142     if (SIZE_ONLY) {
9143         SIZE_ALIGN(RExC_size);
9144         RExC_size += 1;
9145         return(ret);
9146     }
9147     if (RExC_emit >= RExC_emit_bound)
9148         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9149
9150     NODE_ALIGN_FILL(ret);
9151     ptr = ret;
9152     FILL_ADVANCE_NODE(ptr, op);
9153 #ifdef RE_TRACK_PATTERN_OFFSETS
9154     if (RExC_offsets) {         /* MJD */
9155         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9156               "reg_node", __LINE__,
9157               PL_reg_name[op],
9158               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9159                 ? "Overwriting end of array!\n" : "OK",
9160               (UV)(RExC_emit - RExC_emit_start),
9161               (UV)(RExC_parse - RExC_start),
9162               (UV)RExC_offsets[0]));
9163         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9164     }
9165 #endif
9166     RExC_emit = ptr;
9167     return(ret);
9168 }
9169
9170 /*
9171 - reganode - emit a node with an argument
9172 */
9173 STATIC regnode *                        /* Location. */
9174 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9175 {
9176     dVAR;
9177     register regnode *ptr;
9178     regnode * const ret = RExC_emit;
9179     GET_RE_DEBUG_FLAGS_DECL;
9180
9181     PERL_ARGS_ASSERT_REGANODE;
9182
9183     if (SIZE_ONLY) {
9184         SIZE_ALIGN(RExC_size);
9185         RExC_size += 2;
9186         /*
9187            We can't do this:
9188
9189            assert(2==regarglen[op]+1);
9190
9191            Anything larger than this has to allocate the extra amount.
9192            If we changed this to be:
9193
9194            RExC_size += (1 + regarglen[op]);
9195
9196            then it wouldn't matter. Its not clear what side effect
9197            might come from that so its not done so far.
9198            -- dmq
9199         */
9200         return(ret);
9201     }
9202     if (RExC_emit >= RExC_emit_bound)
9203         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9204
9205     NODE_ALIGN_FILL(ret);
9206     ptr = ret;
9207     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9208 #ifdef RE_TRACK_PATTERN_OFFSETS
9209     if (RExC_offsets) {         /* MJD */
9210         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9211               "reganode",
9212               __LINE__,
9213               PL_reg_name[op],
9214               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9215               "Overwriting end of array!\n" : "OK",
9216               (UV)(RExC_emit - RExC_emit_start),
9217               (UV)(RExC_parse - RExC_start),
9218               (UV)RExC_offsets[0]));
9219         Set_Cur_Node_Offset;
9220     }
9221 #endif
9222     RExC_emit = ptr;
9223     return(ret);
9224 }
9225
9226 /*
9227 - reguni - emit (if appropriate) a Unicode character
9228 */
9229 STATIC STRLEN
9230 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9231 {
9232     dVAR;
9233
9234     PERL_ARGS_ASSERT_REGUNI;
9235
9236     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9237 }
9238
9239 /*
9240 - reginsert - insert an operator in front of already-emitted operand
9241 *
9242 * Means relocating the operand.
9243 */
9244 STATIC void
9245 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9246 {
9247     dVAR;
9248     register regnode *src;
9249     register regnode *dst;
9250     register regnode *place;
9251     const int offset = regarglen[(U8)op];
9252     const int size = NODE_STEP_REGNODE + offset;
9253     GET_RE_DEBUG_FLAGS_DECL;
9254
9255     PERL_ARGS_ASSERT_REGINSERT;
9256     PERL_UNUSED_ARG(depth);
9257 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9258     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9259     if (SIZE_ONLY) {
9260         RExC_size += size;
9261         return;
9262     }
9263
9264     src = RExC_emit;
9265     RExC_emit += size;
9266     dst = RExC_emit;
9267     if (RExC_open_parens) {
9268         int paren;
9269         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9270         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9271             if ( RExC_open_parens[paren] >= opnd ) {
9272                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9273                 RExC_open_parens[paren] += size;
9274             } else {
9275                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9276             }
9277             if ( RExC_close_parens[paren] >= opnd ) {
9278                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9279                 RExC_close_parens[paren] += size;
9280             } else {
9281                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9282             }
9283         }
9284     }
9285
9286     while (src > opnd) {
9287         StructCopy(--src, --dst, regnode);
9288 #ifdef RE_TRACK_PATTERN_OFFSETS
9289         if (RExC_offsets) {     /* MJD 20010112 */
9290             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9291                   "reg_insert",
9292                   __LINE__,
9293                   PL_reg_name[op],
9294                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9295                     ? "Overwriting end of array!\n" : "OK",
9296                   (UV)(src - RExC_emit_start),
9297                   (UV)(dst - RExC_emit_start),
9298                   (UV)RExC_offsets[0]));
9299             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9300             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9301         }
9302 #endif
9303     }
9304
9305
9306     place = opnd;               /* Op node, where operand used to be. */
9307 #ifdef RE_TRACK_PATTERN_OFFSETS
9308     if (RExC_offsets) {         /* MJD */
9309         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9310               "reginsert",
9311               __LINE__,
9312               PL_reg_name[op],
9313               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9314               ? "Overwriting end of array!\n" : "OK",
9315               (UV)(place - RExC_emit_start),
9316               (UV)(RExC_parse - RExC_start),
9317               (UV)RExC_offsets[0]));
9318         Set_Node_Offset(place, RExC_parse);
9319         Set_Node_Length(place, 1);
9320     }
9321 #endif
9322     src = NEXTOPER(place);
9323     FILL_ADVANCE_NODE(place, op);
9324     Zero(src, offset, regnode);
9325 }
9326
9327 /*
9328 - regtail - set the next-pointer at the end of a node chain of p to val.
9329 - SEE ALSO: regtail_study
9330 */
9331 /* TODO: All three parms should be const */
9332 STATIC void
9333 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9334 {
9335     dVAR;
9336     register regnode *scan;
9337     GET_RE_DEBUG_FLAGS_DECL;
9338
9339     PERL_ARGS_ASSERT_REGTAIL;
9340 #ifndef DEBUGGING
9341     PERL_UNUSED_ARG(depth);
9342 #endif
9343
9344     if (SIZE_ONLY)
9345         return;
9346
9347     /* Find last node. */
9348     scan = p;
9349     for (;;) {
9350         regnode * const temp = regnext(scan);
9351         DEBUG_PARSE_r({
9352             SV * const mysv=sv_newmortal();
9353             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9354             regprop(RExC_rx, mysv, scan);
9355             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9356                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9357                     (temp == NULL ? "->" : ""),
9358                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9359             );
9360         });
9361         if (temp == NULL)
9362             break;
9363         scan = temp;
9364     }
9365
9366     if (reg_off_by_arg[OP(scan)]) {
9367         ARG_SET(scan, val - scan);
9368     }
9369     else {
9370         NEXT_OFF(scan) = val - scan;
9371     }
9372 }
9373
9374 #ifdef DEBUGGING
9375 /*
9376 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9377 - Look for optimizable sequences at the same time.
9378 - currently only looks for EXACT chains.
9379
9380 This is experimental code. The idea is to use this routine to perform
9381 in place optimizations on branches and groups as they are constructed,
9382 with the long term intention of removing optimization from study_chunk so
9383 that it is purely analytical.
9384
9385 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9386 to control which is which.
9387
9388 */
9389 /* TODO: All four parms should be const */
9390
9391 STATIC U8
9392 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9393 {
9394     dVAR;
9395     register regnode *scan;
9396     U8 exact = PSEUDO;
9397 #ifdef EXPERIMENTAL_INPLACESCAN
9398     I32 min = 0;
9399 #endif
9400     GET_RE_DEBUG_FLAGS_DECL;
9401
9402     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9403
9404
9405     if (SIZE_ONLY)
9406         return exact;
9407
9408     /* Find last node. */
9409
9410     scan = p;
9411     for (;;) {
9412         regnode * const temp = regnext(scan);
9413 #ifdef EXPERIMENTAL_INPLACESCAN
9414         if (PL_regkind[OP(scan)] == EXACT)
9415             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9416                 return EXACT;
9417 #endif
9418         if ( exact ) {
9419             switch (OP(scan)) {
9420                 case EXACT:
9421                 case EXACTF:
9422                 case EXACTFU:
9423                 case EXACTFL:
9424                         if( exact == PSEUDO )
9425                             exact= OP(scan);
9426                         else if ( exact != OP(scan) )
9427                             exact= 0;
9428                 case NOTHING:
9429                     break;
9430                 default:
9431                     exact= 0;
9432             }
9433         }
9434         DEBUG_PARSE_r({
9435             SV * const mysv=sv_newmortal();
9436             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9437             regprop(RExC_rx, mysv, scan);
9438             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9439                 SvPV_nolen_const(mysv),
9440                 REG_NODE_NUM(scan),
9441                 PL_reg_name[exact]);
9442         });
9443         if (temp == NULL)
9444             break;
9445         scan = temp;
9446     }
9447     DEBUG_PARSE_r({
9448         SV * const mysv_val=sv_newmortal();
9449         DEBUG_PARSE_MSG("");
9450         regprop(RExC_rx, mysv_val, val);
9451         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9452                       SvPV_nolen_const(mysv_val),
9453                       (IV)REG_NODE_NUM(val),
9454                       (IV)(val - scan)
9455         );
9456     });
9457     if (reg_off_by_arg[OP(scan)]) {
9458         ARG_SET(scan, val - scan);
9459     }
9460     else {
9461         NEXT_OFF(scan) = val - scan;
9462     }
9463
9464     return exact;
9465 }
9466 #endif
9467
9468 /*
9469  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9470  */
9471 #ifdef DEBUGGING
9472 static void
9473 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9474 {
9475     int bit;
9476     int set=0;
9477     regex_charset cs;
9478
9479     for (bit=0; bit<32; bit++) {
9480         if (flags & (1<<bit)) {
9481             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
9482                 continue;
9483             }
9484             if (!set++ && lead)
9485                 PerlIO_printf(Perl_debug_log, "%s",lead);
9486             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9487         }
9488     }
9489     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
9490             if (!set++ && lead) {
9491                 PerlIO_printf(Perl_debug_log, "%s",lead);
9492             }
9493             switch (cs) {
9494                 case REGEX_UNICODE_CHARSET:
9495                     PerlIO_printf(Perl_debug_log, "UNICODE");
9496                     break;
9497                 case REGEX_LOCALE_CHARSET:
9498                     PerlIO_printf(Perl_debug_log, "LOCALE");
9499                     break;
9500                 default:
9501                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
9502                     break;
9503             }
9504     }
9505     if (lead)  {
9506         if (set)
9507             PerlIO_printf(Perl_debug_log, "\n");
9508         else
9509             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9510     }
9511 }
9512 #endif
9513
9514 void
9515 Perl_regdump(pTHX_ const regexp *r)
9516 {
9517 #ifdef DEBUGGING
9518     dVAR;
9519     SV * const sv = sv_newmortal();
9520     SV *dsv= sv_newmortal();
9521     RXi_GET_DECL(r,ri);
9522     GET_RE_DEBUG_FLAGS_DECL;
9523
9524     PERL_ARGS_ASSERT_REGDUMP;
9525
9526     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9527
9528     /* Header fields of interest. */
9529     if (r->anchored_substr) {
9530         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9531             RE_SV_DUMPLEN(r->anchored_substr), 30);
9532         PerlIO_printf(Perl_debug_log,
9533                       "anchored %s%s at %"IVdf" ",
9534                       s, RE_SV_TAIL(r->anchored_substr),
9535                       (IV)r->anchored_offset);
9536     } else if (r->anchored_utf8) {
9537         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9538             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9539         PerlIO_printf(Perl_debug_log,
9540                       "anchored utf8 %s%s at %"IVdf" ",
9541                       s, RE_SV_TAIL(r->anchored_utf8),
9542                       (IV)r->anchored_offset);
9543     }
9544     if (r->float_substr) {
9545         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9546             RE_SV_DUMPLEN(r->float_substr), 30);
9547         PerlIO_printf(Perl_debug_log,
9548                       "floating %s%s at %"IVdf"..%"UVuf" ",
9549                       s, RE_SV_TAIL(r->float_substr),
9550                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9551     } else if (r->float_utf8) {
9552         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9553             RE_SV_DUMPLEN(r->float_utf8), 30);
9554         PerlIO_printf(Perl_debug_log,
9555                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9556                       s, RE_SV_TAIL(r->float_utf8),
9557                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9558     }
9559     if (r->check_substr || r->check_utf8)
9560         PerlIO_printf(Perl_debug_log,
9561                       (const char *)
9562                       (r->check_substr == r->float_substr
9563                        && r->check_utf8 == r->float_utf8
9564                        ? "(checking floating" : "(checking anchored"));
9565     if (r->extflags & RXf_NOSCAN)
9566         PerlIO_printf(Perl_debug_log, " noscan");
9567     if (r->extflags & RXf_CHECK_ALL)
9568         PerlIO_printf(Perl_debug_log, " isall");
9569     if (r->check_substr || r->check_utf8)
9570         PerlIO_printf(Perl_debug_log, ") ");
9571
9572     if (ri->regstclass) {
9573         regprop(r, sv, ri->regstclass);
9574         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9575     }
9576     if (r->extflags & RXf_ANCH) {
9577         PerlIO_printf(Perl_debug_log, "anchored");
9578         if (r->extflags & RXf_ANCH_BOL)
9579             PerlIO_printf(Perl_debug_log, "(BOL)");
9580         if (r->extflags & RXf_ANCH_MBOL)
9581             PerlIO_printf(Perl_debug_log, "(MBOL)");
9582         if (r->extflags & RXf_ANCH_SBOL)
9583             PerlIO_printf(Perl_debug_log, "(SBOL)");
9584         if (r->extflags & RXf_ANCH_GPOS)
9585             PerlIO_printf(Perl_debug_log, "(GPOS)");
9586         PerlIO_putc(Perl_debug_log, ' ');
9587     }
9588     if (r->extflags & RXf_GPOS_SEEN)
9589         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9590     if (r->intflags & PREGf_SKIP)
9591         PerlIO_printf(Perl_debug_log, "plus ");
9592     if (r->intflags & PREGf_IMPLICIT)
9593         PerlIO_printf(Perl_debug_log, "implicit ");
9594     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9595     if (r->extflags & RXf_EVAL_SEEN)
9596         PerlIO_printf(Perl_debug_log, "with eval ");
9597     PerlIO_printf(Perl_debug_log, "\n");
9598     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9599 #else
9600     PERL_ARGS_ASSERT_REGDUMP;
9601     PERL_UNUSED_CONTEXT;
9602     PERL_UNUSED_ARG(r);
9603 #endif  /* DEBUGGING */
9604 }
9605
9606 /*
9607 - regprop - printable representation of opcode
9608 */
9609 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9610 STMT_START { \
9611         if (do_sep) {                           \
9612             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9613             if (flags & ANYOF_INVERT)           \
9614                 /*make sure the invert info is in each */ \
9615                 sv_catpvs(sv, "^");             \
9616             do_sep = 0;                         \
9617         }                                       \
9618 } STMT_END
9619
9620 void
9621 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9622 {
9623 #ifdef DEBUGGING
9624     dVAR;
9625     register int k;
9626     RXi_GET_DECL(prog,progi);
9627     GET_RE_DEBUG_FLAGS_DECL;
9628
9629     PERL_ARGS_ASSERT_REGPROP;
9630
9631     sv_setpvs(sv, "");
9632
9633     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9634         /* It would be nice to FAIL() here, but this may be called from
9635            regexec.c, and it would be hard to supply pRExC_state. */
9636         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9637     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9638
9639     k = PL_regkind[OP(o)];
9640
9641     if (k == EXACT) {
9642         sv_catpvs(sv, " ");
9643         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9644          * is a crude hack but it may be the best for now since
9645          * we have no flag "this EXACTish node was UTF-8"
9646          * --jhi */
9647         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9648                   PERL_PV_ESCAPE_UNI_DETECT |
9649                   PERL_PV_ESCAPE_NONASCII   |
9650                   PERL_PV_PRETTY_ELLIPSES   |
9651                   PERL_PV_PRETTY_LTGT       |
9652                   PERL_PV_PRETTY_NOCLEAR
9653                   );
9654     } else if (k == TRIE) {
9655         /* print the details of the trie in dumpuntil instead, as
9656          * progi->data isn't available here */
9657         const char op = OP(o);
9658         const U32 n = ARG(o);
9659         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9660                (reg_ac_data *)progi->data->data[n] :
9661                NULL;
9662         const reg_trie_data * const trie
9663             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9664
9665         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9666         DEBUG_TRIE_COMPILE_r(
9667             Perl_sv_catpvf(aTHX_ sv,
9668                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9669                 (UV)trie->startstate,
9670                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9671                 (UV)trie->wordcount,
9672                 (UV)trie->minlen,
9673                 (UV)trie->maxlen,
9674                 (UV)TRIE_CHARCOUNT(trie),
9675                 (UV)trie->uniquecharcount
9676             )
9677         );
9678         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9679             int i;
9680             int rangestart = -1;
9681             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9682             sv_catpvs(sv, "[");
9683             for (i = 0; i <= 256; i++) {
9684                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9685                     if (rangestart == -1)
9686                         rangestart = i;
9687                 } else if (rangestart != -1) {
9688                     if (i <= rangestart + 3)
9689                         for (; rangestart < i; rangestart++)
9690                             put_byte(sv, rangestart);
9691                     else {
9692                         put_byte(sv, rangestart);
9693                         sv_catpvs(sv, "-");
9694                         put_byte(sv, i - 1);
9695                     }
9696                     rangestart = -1;
9697                 }
9698             }
9699             sv_catpvs(sv, "]");
9700         }
9701
9702     } else if (k == CURLY) {
9703         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9704             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9705         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9706     }
9707     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9708         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9709     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9710         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9711         if ( RXp_PAREN_NAMES(prog) ) {
9712             if ( k != REF || (OP(o) < NREF)) {
9713                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9714                 SV **name= av_fetch(list, ARG(o), 0 );
9715                 if (name)
9716                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9717             }
9718             else {
9719                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9720                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9721                 I32 *nums=(I32*)SvPVX(sv_dat);
9722                 SV **name= av_fetch(list, nums[0], 0 );
9723                 I32 n;
9724                 if (name) {
9725                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9726                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9727                                     (n ? "," : ""), (IV)nums[n]);
9728                     }
9729                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9730                 }
9731             }
9732         }
9733     } else if (k == GOSUB)
9734         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9735     else if (k == VERB) {
9736         if (!o->flags)
9737             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9738                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9739     } else if (k == LOGICAL)
9740         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9741     else if (k == FOLDCHAR)
9742         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9743     else if (k == ANYOF) {
9744         int i, rangestart = -1;
9745         const U8 flags = ANYOF_FLAGS(o);
9746         int do_sep = 0;
9747
9748         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9749         static const char * const anyofs[] = {
9750             "\\w",
9751             "\\W",
9752             "\\s",
9753             "\\S",
9754             "\\d",
9755             "\\D",
9756             "[:alnum:]",
9757             "[:^alnum:]",
9758             "[:alpha:]",
9759             "[:^alpha:]",
9760             "[:ascii:]",
9761             "[:^ascii:]",
9762             "[:cntrl:]",
9763             "[:^cntrl:]",
9764             "[:graph:]",
9765             "[:^graph:]",
9766             "[:lower:]",
9767             "[:^lower:]",
9768             "[:print:]",
9769             "[:^print:]",
9770             "[:punct:]",
9771             "[:^punct:]",
9772             "[:upper:]",
9773             "[:^upper:]",
9774             "[:xdigit:]",
9775             "[:^xdigit:]",
9776             "[:space:]",
9777             "[:^space:]",
9778             "[:blank:]",
9779             "[:^blank:]"
9780         };
9781
9782         if (flags & ANYOF_LOCALE)
9783             sv_catpvs(sv, "{loc}");
9784         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
9785             sv_catpvs(sv, "{i}");
9786         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9787         if (flags & ANYOF_INVERT)
9788             sv_catpvs(sv, "^");
9789
9790         /* output what the standard cp 0-255 bitmap matches */
9791         for (i = 0; i <= 256; i++) {
9792             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9793                 if (rangestart == -1)
9794                     rangestart = i;
9795             } else if (rangestart != -1) {
9796                 if (i <= rangestart + 3)
9797                     for (; rangestart < i; rangestart++)
9798                         put_byte(sv, rangestart);
9799                 else {
9800                     put_byte(sv, rangestart);
9801                     sv_catpvs(sv, "-");
9802                     put_byte(sv, i - 1);
9803                 }
9804                 do_sep = 1;
9805                 rangestart = -1;
9806             }
9807         }
9808
9809         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9810         /* output any special charclass tests (used entirely under use locale) */
9811         if (ANYOF_CLASS_TEST_ANY_SET(o))
9812             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9813                 if (ANYOF_CLASS_TEST(o,i)) {
9814                     sv_catpv(sv, anyofs[i]);
9815                     do_sep = 1;
9816                 }
9817
9818         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9819
9820         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
9821             sv_catpvs(sv, "{non-utf8-latin1-all}");
9822         }
9823
9824         /* output information about the unicode matching */
9825         if (flags & ANYOF_UNICODE_ALL)
9826             sv_catpvs(sv, "{unicode_all}");
9827         else if (flags & ANYOF_UTF8)
9828             sv_catpvs(sv, "{unicode}");
9829         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9830             sv_catpvs(sv, "{outside bitmap}");
9831
9832         {
9833             SV *lv;
9834             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9835
9836             if (lv) {
9837                 if (sw) {
9838                     U8 s[UTF8_MAXBYTES_CASE+1];
9839
9840                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9841                         uvchr_to_utf8(s, i);
9842
9843                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9844                             if (rangestart == -1)
9845                                 rangestart = i;
9846                         } else if (rangestart != -1) {
9847                             if (i <= rangestart + 3)
9848                                 for (; rangestart < i; rangestart++) {
9849                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9850                                     U8 *p;
9851                                     for(p = s; p < e; p++)
9852                                         put_byte(sv, *p);
9853                                 }
9854                             else {
9855                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9856                                 U8 *p;
9857                                 for (p = s; p < e; p++)
9858                                     put_byte(sv, *p);
9859                                 sv_catpvs(sv, "-");
9860                                 e = uvchr_to_utf8(s, i-1);
9861                                 for (p = s; p < e; p++)
9862                                     put_byte(sv, *p);
9863                                 }
9864                                 rangestart = -1;
9865                             }
9866                         }
9867
9868                     sv_catpvs(sv, "..."); /* et cetera */
9869                 }
9870
9871                 {
9872                     char *s = savesvpv(lv);
9873                     char * const origs = s;
9874
9875                     while (*s && *s != '\n')
9876                         s++;
9877
9878                     if (*s == '\n') {
9879                         const char * const t = ++s;
9880
9881                         while (*s) {
9882                             if (*s == '\n')
9883                                 *s = ' ';
9884                             s++;
9885                         }
9886                         if (s[-1] == ' ')
9887                             s[-1] = 0;
9888
9889                         sv_catpv(sv, t);
9890                     }
9891
9892                     Safefree(origs);
9893                 }
9894             }
9895         }
9896
9897         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9898     }
9899     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9900         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9901 #else
9902     PERL_UNUSED_CONTEXT;
9903     PERL_UNUSED_ARG(sv);
9904     PERL_UNUSED_ARG(o);
9905     PERL_UNUSED_ARG(prog);
9906 #endif  /* DEBUGGING */
9907 }
9908
9909 SV *
9910 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9911 {                               /* Assume that RE_INTUIT is set */
9912     dVAR;
9913     struct regexp *const prog = (struct regexp *)SvANY(r);
9914     GET_RE_DEBUG_FLAGS_DECL;
9915
9916     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9917     PERL_UNUSED_CONTEXT;
9918
9919     DEBUG_COMPILE_r(
9920         {
9921             const char * const s = SvPV_nolen_const(prog->check_substr
9922                       ? prog->check_substr : prog->check_utf8);
9923
9924             if (!PL_colorset) reginitcolors();
9925             PerlIO_printf(Perl_debug_log,
9926                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9927                       PL_colors[4],
9928                       prog->check_substr ? "" : "utf8 ",
9929                       PL_colors[5],PL_colors[0],
9930                       s,
9931                       PL_colors[1],
9932                       (strlen(s) > 60 ? "..." : ""));
9933         } );
9934
9935     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9936 }
9937
9938 /*
9939    pregfree()
9940
9941    handles refcounting and freeing the perl core regexp structure. When
9942    it is necessary to actually free the structure the first thing it
9943    does is call the 'free' method of the regexp_engine associated to
9944    the regexp, allowing the handling of the void *pprivate; member
9945    first. (This routine is not overridable by extensions, which is why
9946    the extensions free is called first.)
9947
9948    See regdupe and regdupe_internal if you change anything here.
9949 */
9950 #ifndef PERL_IN_XSUB_RE
9951 void
9952 Perl_pregfree(pTHX_ REGEXP *r)
9953 {
9954     SvREFCNT_dec(r);
9955 }
9956
9957 void
9958 Perl_pregfree2(pTHX_ REGEXP *rx)
9959 {
9960     dVAR;
9961     struct regexp *const r = (struct regexp *)SvANY(rx);
9962     GET_RE_DEBUG_FLAGS_DECL;
9963
9964     PERL_ARGS_ASSERT_PREGFREE2;
9965
9966     if (r->mother_re) {
9967         ReREFCNT_dec(r->mother_re);
9968     } else {
9969         CALLREGFREE_PVT(rx); /* free the private data */
9970         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9971     }
9972     if (r->substrs) {
9973         SvREFCNT_dec(r->anchored_substr);
9974         SvREFCNT_dec(r->anchored_utf8);
9975         SvREFCNT_dec(r->float_substr);
9976         SvREFCNT_dec(r->float_utf8);
9977         Safefree(r->substrs);
9978     }
9979     RX_MATCH_COPY_FREE(rx);
9980 #ifdef PERL_OLD_COPY_ON_WRITE
9981     SvREFCNT_dec(r->saved_copy);
9982 #endif
9983     Safefree(r->offs);
9984 }
9985
9986 /*  reg_temp_copy()
9987
9988     This is a hacky workaround to the structural issue of match results
9989     being stored in the regexp structure which is in turn stored in
9990     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9991     could be PL_curpm in multiple contexts, and could require multiple
9992     result sets being associated with the pattern simultaneously, such
9993     as when doing a recursive match with (??{$qr})
9994
9995     The solution is to make a lightweight copy of the regexp structure
9996     when a qr// is returned from the code executed by (??{$qr}) this
9997     lightweight copy doesn't actually own any of its data except for
9998     the starp/end and the actual regexp structure itself.
9999
10000 */
10001
10002
10003 REGEXP *
10004 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
10005 {
10006     struct regexp *ret;
10007     struct regexp *const r = (struct regexp *)SvANY(rx);
10008     register const I32 npar = r->nparens+1;
10009
10010     PERL_ARGS_ASSERT_REG_TEMP_COPY;
10011
10012     if (!ret_x)
10013         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10014     ret = (struct regexp *)SvANY(ret_x);
10015
10016     (void)ReREFCNT_inc(rx);
10017     /* We can take advantage of the existing "copied buffer" mechanism in SVs
10018        by pointing directly at the buffer, but flagging that the allocated
10019        space in the copy is zero. As we've just done a struct copy, it's now
10020        a case of zero-ing that, rather than copying the current length.  */
10021     SvPV_set(ret_x, RX_WRAPPED(rx));
10022     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
10023     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10024            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
10025     SvLEN_set(ret_x, 0);
10026     SvSTASH_set(ret_x, NULL);
10027     SvMAGIC_set(ret_x, NULL);
10028     Newx(ret->offs, npar, regexp_paren_pair);
10029     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10030     if (r->substrs) {
10031         Newx(ret->substrs, 1, struct reg_substr_data);
10032         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10033
10034         SvREFCNT_inc_void(ret->anchored_substr);
10035         SvREFCNT_inc_void(ret->anchored_utf8);
10036         SvREFCNT_inc_void(ret->float_substr);
10037         SvREFCNT_inc_void(ret->float_utf8);
10038
10039         /* check_substr and check_utf8, if non-NULL, point to either their
10040            anchored or float namesakes, and don't hold a second reference.  */
10041     }
10042     RX_MATCH_COPIED_off(ret_x);
10043 #ifdef PERL_OLD_COPY_ON_WRITE
10044     ret->saved_copy = NULL;
10045 #endif
10046     ret->mother_re = rx;
10047
10048     return ret_x;
10049 }
10050 #endif
10051
10052 /* regfree_internal()
10053
10054    Free the private data in a regexp. This is overloadable by
10055    extensions. Perl takes care of the regexp structure in pregfree(),
10056    this covers the *pprivate pointer which technically perl doesn't
10057    know about, however of course we have to handle the
10058    regexp_internal structure when no extension is in use.
10059
10060    Note this is called before freeing anything in the regexp
10061    structure.
10062  */
10063
10064 void
10065 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10066 {
10067     dVAR;
10068     struct regexp *const r = (struct regexp *)SvANY(rx);
10069     RXi_GET_DECL(r,ri);
10070     GET_RE_DEBUG_FLAGS_DECL;
10071
10072     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10073
10074     DEBUG_COMPILE_r({
10075         if (!PL_colorset)
10076             reginitcolors();
10077         {
10078             SV *dsv= sv_newmortal();
10079             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10080                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10081             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10082                 PL_colors[4],PL_colors[5],s);
10083         }
10084     });
10085 #ifdef RE_TRACK_PATTERN_OFFSETS
10086     if (ri->u.offsets)
10087         Safefree(ri->u.offsets);             /* 20010421 MJD */
10088 #endif
10089     if (ri->data) {
10090         int n = ri->data->count;
10091         PAD* new_comppad = NULL;
10092         PAD* old_comppad;
10093         PADOFFSET refcnt;
10094
10095         while (--n >= 0) {
10096           /* If you add a ->what type here, update the comment in regcomp.h */
10097             switch (ri->data->what[n]) {
10098             case 'a':
10099             case 's':
10100             case 'S':
10101             case 'u':
10102                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10103                 break;
10104             case 'f':
10105                 Safefree(ri->data->data[n]);
10106                 break;
10107             case 'p':
10108                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10109                 break;
10110             case 'o':
10111                 if (new_comppad == NULL)
10112                     Perl_croak(aTHX_ "panic: pregfree comppad");
10113                 PAD_SAVE_LOCAL(old_comppad,
10114                     /* Watch out for global destruction's random ordering. */
10115                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10116                 );
10117                 OP_REFCNT_LOCK;
10118                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10119                 OP_REFCNT_UNLOCK;
10120                 if (!refcnt)
10121                     op_free((OP_4tree*)ri->data->data[n]);
10122
10123                 PAD_RESTORE_LOCAL(old_comppad);
10124                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10125                 new_comppad = NULL;
10126                 break;
10127             case 'n':
10128                 break;
10129             case 'T':
10130                 { /* Aho Corasick add-on structure for a trie node.
10131                      Used in stclass optimization only */
10132                     U32 refcount;
10133                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10134                     OP_REFCNT_LOCK;
10135                     refcount = --aho->refcount;
10136                     OP_REFCNT_UNLOCK;
10137                     if ( !refcount ) {
10138                         PerlMemShared_free(aho->states);
10139                         PerlMemShared_free(aho->fail);
10140                          /* do this last!!!! */
10141                         PerlMemShared_free(ri->data->data[n]);
10142                         PerlMemShared_free(ri->regstclass);
10143                     }
10144                 }
10145                 break;
10146             case 't':
10147                 {
10148                     /* trie structure. */
10149                     U32 refcount;
10150                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10151                     OP_REFCNT_LOCK;
10152                     refcount = --trie->refcount;
10153                     OP_REFCNT_UNLOCK;
10154                     if ( !refcount ) {
10155                         PerlMemShared_free(trie->charmap);
10156                         PerlMemShared_free(trie->states);
10157                         PerlMemShared_free(trie->trans);
10158                         if (trie->bitmap)
10159                             PerlMemShared_free(trie->bitmap);
10160                         if (trie->jump)
10161                             PerlMemShared_free(trie->jump);
10162                         PerlMemShared_free(trie->wordinfo);
10163                         /* do this last!!!! */
10164                         PerlMemShared_free(ri->data->data[n]);
10165                     }
10166                 }
10167                 break;
10168             default:
10169                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10170             }
10171         }
10172         Safefree(ri->data->what);
10173         Safefree(ri->data);
10174     }
10175
10176     Safefree(ri);
10177 }
10178
10179 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10180 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10181 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10182
10183 /*
10184    re_dup - duplicate a regexp.
10185
10186    This routine is expected to clone a given regexp structure. It is only
10187    compiled under USE_ITHREADS.
10188
10189    After all of the core data stored in struct regexp is duplicated
10190    the regexp_engine.dupe method is used to copy any private data
10191    stored in the *pprivate pointer. This allows extensions to handle
10192    any duplication it needs to do.
10193
10194    See pregfree() and regfree_internal() if you change anything here.
10195 */
10196 #if defined(USE_ITHREADS)
10197 #ifndef PERL_IN_XSUB_RE
10198 void
10199 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10200 {
10201     dVAR;
10202     I32 npar;
10203     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10204     struct regexp *ret = (struct regexp *)SvANY(dstr);
10205
10206     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10207
10208     npar = r->nparens+1;
10209     Newx(ret->offs, npar, regexp_paren_pair);
10210     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10211     if(ret->swap) {
10212         /* no need to copy these */
10213         Newx(ret->swap, npar, regexp_paren_pair);
10214     }
10215
10216     if (ret->substrs) {
10217         /* Do it this way to avoid reading from *r after the StructCopy().
10218            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10219            cache, it doesn't matter.  */
10220         const bool anchored = r->check_substr
10221             ? r->check_substr == r->anchored_substr
10222             : r->check_utf8 == r->anchored_utf8;
10223         Newx(ret->substrs, 1, struct reg_substr_data);
10224         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10225
10226         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10227         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10228         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10229         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10230
10231         /* check_substr and check_utf8, if non-NULL, point to either their
10232            anchored or float namesakes, and don't hold a second reference.  */
10233
10234         if (ret->check_substr) {
10235             if (anchored) {
10236                 assert(r->check_utf8 == r->anchored_utf8);
10237                 ret->check_substr = ret->anchored_substr;
10238                 ret->check_utf8 = ret->anchored_utf8;
10239             } else {
10240                 assert(r->check_substr == r->float_substr);
10241                 assert(r->check_utf8 == r->float_utf8);
10242                 ret->check_substr = ret->float_substr;
10243                 ret->check_utf8 = ret->float_utf8;
10244             }
10245         } else if (ret->check_utf8) {
10246             if (anchored) {
10247                 ret->check_utf8 = ret->anchored_utf8;
10248             } else {
10249                 ret->check_utf8 = ret->float_utf8;
10250             }
10251         }
10252     }
10253
10254     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10255
10256     if (ret->pprivate)
10257         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10258
10259     if (RX_MATCH_COPIED(dstr))
10260         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10261     else
10262         ret->subbeg = NULL;
10263 #ifdef PERL_OLD_COPY_ON_WRITE
10264     ret->saved_copy = NULL;
10265 #endif
10266
10267     if (ret->mother_re) {
10268         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10269             /* Our storage points directly to our mother regexp, but that's
10270                1: a buffer in a different thread
10271                2: something we no longer hold a reference on
10272                so we need to copy it locally.  */
10273             /* Note we need to sue SvCUR() on our mother_re, because it, in
10274                turn, may well be pointing to its own mother_re.  */
10275             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10276                                    SvCUR(ret->mother_re)+1));
10277             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10278         }
10279         ret->mother_re      = NULL;
10280     }
10281     ret->gofs = 0;
10282 }
10283 #endif /* PERL_IN_XSUB_RE */
10284
10285 /*
10286    regdupe_internal()
10287
10288    This is the internal complement to regdupe() which is used to copy
10289    the structure pointed to by the *pprivate pointer in the regexp.
10290    This is the core version of the extension overridable cloning hook.
10291    The regexp structure being duplicated will be copied by perl prior
10292    to this and will be provided as the regexp *r argument, however
10293    with the /old/ structures pprivate pointer value. Thus this routine
10294    may override any copying normally done by perl.
10295
10296    It returns a pointer to the new regexp_internal structure.
10297 */
10298
10299 void *
10300 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10301 {
10302     dVAR;
10303     struct regexp *const r = (struct regexp *)SvANY(rx);
10304     regexp_internal *reti;
10305     int len, npar;
10306     RXi_GET_DECL(r,ri);
10307
10308     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10309
10310     npar = r->nparens+1;
10311     len = ProgLen(ri);
10312
10313     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10314     Copy(ri->program, reti->program, len+1, regnode);
10315
10316
10317     reti->regstclass = NULL;
10318
10319     if (ri->data) {
10320         struct reg_data *d;
10321         const int count = ri->data->count;
10322         int i;
10323
10324         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10325                 char, struct reg_data);
10326         Newx(d->what, count, U8);
10327
10328         d->count = count;
10329         for (i = 0; i < count; i++) {
10330             d->what[i] = ri->data->what[i];
10331             switch (d->what[i]) {
10332                 /* legal options are one of: sSfpontTua
10333                    see also regcomp.h and pregfree() */
10334             case 'a': /* actually an AV, but the dup function is identical.  */
10335             case 's':
10336             case 'S':
10337             case 'p': /* actually an AV, but the dup function is identical.  */
10338             case 'u': /* actually an HV, but the dup function is identical.  */
10339                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10340                 break;
10341             case 'f':
10342                 /* This is cheating. */
10343                 Newx(d->data[i], 1, struct regnode_charclass_class);
10344                 StructCopy(ri->data->data[i], d->data[i],
10345                             struct regnode_charclass_class);
10346                 reti->regstclass = (regnode*)d->data[i];
10347                 break;
10348             case 'o':
10349                 /* Compiled op trees are readonly and in shared memory,
10350                    and can thus be shared without duplication. */
10351                 OP_REFCNT_LOCK;
10352                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10353                 OP_REFCNT_UNLOCK;
10354                 break;
10355             case 'T':
10356                 /* Trie stclasses are readonly and can thus be shared
10357                  * without duplication. We free the stclass in pregfree
10358                  * when the corresponding reg_ac_data struct is freed.
10359                  */
10360                 reti->regstclass= ri->regstclass;
10361                 /* Fall through */
10362             case 't':
10363                 OP_REFCNT_LOCK;
10364                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10365                 OP_REFCNT_UNLOCK;
10366                 /* Fall through */
10367             case 'n':
10368                 d->data[i] = ri->data->data[i];
10369                 break;
10370             default:
10371                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10372             }
10373         }
10374
10375         reti->data = d;
10376     }
10377     else
10378         reti->data = NULL;
10379
10380     reti->name_list_idx = ri->name_list_idx;
10381
10382 #ifdef RE_TRACK_PATTERN_OFFSETS
10383     if (ri->u.offsets) {
10384         Newx(reti->u.offsets, 2*len+1, U32);
10385         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10386     }
10387 #else
10388     SetProgLen(reti,len);
10389 #endif
10390
10391     return (void*)reti;
10392 }
10393
10394 #endif    /* USE_ITHREADS */
10395
10396 #ifndef PERL_IN_XSUB_RE
10397
10398 /*
10399  - regnext - dig the "next" pointer out of a node
10400  */
10401 regnode *
10402 Perl_regnext(pTHX_ register regnode *p)
10403 {
10404     dVAR;
10405     register I32 offset;
10406
10407     if (!p)
10408         return(NULL);
10409
10410     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10411         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10412     }
10413
10414     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10415     if (offset == 0)
10416         return(NULL);
10417
10418     return(p+offset);
10419 }
10420 #endif
10421
10422 STATIC void
10423 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10424 {
10425     va_list args;
10426     STRLEN l1 = strlen(pat1);
10427     STRLEN l2 = strlen(pat2);
10428     char buf[512];
10429     SV *msv;
10430     const char *message;
10431
10432     PERL_ARGS_ASSERT_RE_CROAK2;
10433
10434     if (l1 > 510)
10435         l1 = 510;
10436     if (l1 + l2 > 510)
10437         l2 = 510 - l1;
10438     Copy(pat1, buf, l1 , char);
10439     Copy(pat2, buf + l1, l2 , char);
10440     buf[l1 + l2] = '\n';
10441     buf[l1 + l2 + 1] = '\0';
10442 #ifdef I_STDARG
10443     /* ANSI variant takes additional second argument */
10444     va_start(args, pat2);
10445 #else
10446     va_start(args);
10447 #endif
10448     msv = vmess(buf, &args);
10449     va_end(args);
10450     message = SvPV_const(msv,l1);
10451     if (l1 > 512)
10452         l1 = 512;
10453     Copy(message, buf, l1 , char);
10454     buf[l1-1] = '\0';                   /* Overwrite \n */
10455     Perl_croak(aTHX_ "%s", buf);
10456 }
10457
10458 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10459
10460 #ifndef PERL_IN_XSUB_RE
10461 void
10462 Perl_save_re_context(pTHX)
10463 {
10464     dVAR;
10465
10466     struct re_save_state *state;
10467
10468     SAVEVPTR(PL_curcop);
10469     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10470
10471     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10472     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10473     SSPUSHUV(SAVEt_RE_STATE);
10474
10475     Copy(&PL_reg_state, state, 1, struct re_save_state);
10476
10477     PL_reg_start_tmp = 0;
10478     PL_reg_start_tmpl = 0;
10479     PL_reg_oldsaved = NULL;
10480     PL_reg_oldsavedlen = 0;
10481     PL_reg_maxiter = 0;
10482     PL_reg_leftiter = 0;
10483     PL_reg_poscache = NULL;
10484     PL_reg_poscache_size = 0;
10485 #ifdef PERL_OLD_COPY_ON_WRITE
10486     PL_nrs = NULL;
10487 #endif
10488
10489     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10490     if (PL_curpm) {
10491         const REGEXP * const rx = PM_GETRE(PL_curpm);
10492         if (rx) {
10493             U32 i;
10494             for (i = 1; i <= RX_NPARENS(rx); i++) {
10495                 char digits[TYPE_CHARS(long)];
10496                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10497                 GV *const *const gvp
10498                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10499
10500                 if (gvp) {
10501                     GV * const gv = *gvp;
10502                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10503                         save_scalar(gv);
10504                 }
10505             }
10506         }
10507     }
10508 }
10509 #endif
10510
10511 static void
10512 clear_re(pTHX_ void *r)
10513 {
10514     dVAR;
10515     ReREFCNT_dec((REGEXP *)r);
10516 }
10517
10518 #ifdef DEBUGGING
10519
10520 STATIC void
10521 S_put_byte(pTHX_ SV *sv, int c)
10522 {
10523     PERL_ARGS_ASSERT_PUT_BYTE;
10524
10525     /* Our definition of isPRINT() ignores locales, so only bytes that are
10526        not part of UTF-8 are considered printable. I assume that the same
10527        holds for UTF-EBCDIC.
10528        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10529        which Wikipedia says:
10530
10531        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10532        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10533        identical, to the ASCII delete (DEL) or rubout control character.
10534        ) So the old condition can be simplified to !isPRINT(c)  */
10535     if (!isPRINT(c)) {
10536         if (c < 256) {
10537             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10538         }
10539         else {
10540             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10541         }
10542     }
10543     else {
10544         const char string = c;
10545         if (c == '-' || c == ']' || c == '\\' || c == '^')
10546             sv_catpvs(sv, "\\");
10547         sv_catpvn(sv, &string, 1);
10548     }
10549 }
10550
10551
10552 #define CLEAR_OPTSTART \
10553     if (optstart) STMT_START { \
10554             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10555             optstart=NULL; \
10556     } STMT_END
10557
10558 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10559
10560 STATIC const regnode *
10561 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10562             const regnode *last, const regnode *plast,
10563             SV* sv, I32 indent, U32 depth)
10564 {
10565     dVAR;
10566     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10567     register const regnode *next;
10568     const regnode *optstart= NULL;
10569
10570     RXi_GET_DECL(r,ri);
10571     GET_RE_DEBUG_FLAGS_DECL;
10572
10573     PERL_ARGS_ASSERT_DUMPUNTIL;
10574
10575 #ifdef DEBUG_DUMPUNTIL
10576     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10577         last ? last-start : 0,plast ? plast-start : 0);
10578 #endif
10579
10580     if (plast && plast < last)
10581         last= plast;
10582
10583     while (PL_regkind[op] != END && (!last || node < last)) {
10584         /* While that wasn't END last time... */
10585         NODE_ALIGN(node);
10586         op = OP(node);
10587         if (op == CLOSE || op == WHILEM)
10588             indent--;
10589         next = regnext((regnode *)node);
10590
10591         /* Where, what. */
10592         if (OP(node) == OPTIMIZED) {
10593             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10594                 optstart = node;
10595             else
10596                 goto after_print;
10597         } else
10598             CLEAR_OPTSTART;
10599
10600         regprop(r, sv, node);
10601         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10602                       (int)(2*indent + 1), "", SvPVX_const(sv));
10603
10604         if (OP(node) != OPTIMIZED) {
10605             if (next == NULL)           /* Next ptr. */
10606                 PerlIO_printf(Perl_debug_log, " (0)");
10607             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10608                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10609             else
10610                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10611             (void)PerlIO_putc(Perl_debug_log, '\n');
10612         }
10613
10614       after_print:
10615         if (PL_regkind[(U8)op] == BRANCHJ) {
10616             assert(next);
10617             {
10618                 register const regnode *nnode = (OP(next) == LONGJMP
10619                                              ? regnext((regnode *)next)
10620                                              : next);
10621                 if (last && nnode > last)
10622                     nnode = last;
10623                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10624             }
10625         }
10626         else if (PL_regkind[(U8)op] == BRANCH) {
10627             assert(next);
10628             DUMPUNTIL(NEXTOPER(node), next);
10629         }
10630         else if ( PL_regkind[(U8)op]  == TRIE ) {
10631             const regnode *this_trie = node;
10632             const char op = OP(node);
10633             const U32 n = ARG(node);
10634             const reg_ac_data * const ac = op>=AHOCORASICK ?
10635                (reg_ac_data *)ri->data->data[n] :
10636                NULL;
10637             const reg_trie_data * const trie =
10638                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10639 #ifdef DEBUGGING
10640             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10641 #endif
10642             const regnode *nextbranch= NULL;
10643             I32 word_idx;
10644             sv_setpvs(sv, "");
10645             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10646                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10647
10648                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10649                    (int)(2*(indent+3)), "",
10650                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10651                             PL_colors[0], PL_colors[1],
10652                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10653                             PERL_PV_PRETTY_ELLIPSES    |
10654                             PERL_PV_PRETTY_LTGT
10655                             )
10656                             : "???"
10657                 );
10658                 if (trie->jump) {
10659                     U16 dist= trie->jump[word_idx+1];
10660                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10661                                   (UV)((dist ? this_trie + dist : next) - start));
10662                     if (dist) {
10663                         if (!nextbranch)
10664                             nextbranch= this_trie + trie->jump[0];
10665                         DUMPUNTIL(this_trie + dist, nextbranch);
10666                     }
10667                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10668                         nextbranch= regnext((regnode *)nextbranch);
10669                 } else {
10670                     PerlIO_printf(Perl_debug_log, "\n");
10671                 }
10672             }
10673             if (last && next > last)
10674                 node= last;
10675             else
10676                 node= next;
10677         }
10678         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10679             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10680                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10681         }
10682         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10683             assert(next);
10684             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10685         }
10686         else if ( op == PLUS || op == STAR) {
10687             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10688         }
10689         else if (PL_regkind[(U8)op] == ANYOF) {
10690             /* arglen 1 + class block */
10691             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10692                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10693             node = NEXTOPER(node);
10694         }
10695         else if (PL_regkind[(U8)op] == EXACT) {
10696             /* Literal string, where present. */
10697             node += NODE_SZ_STR(node) - 1;
10698             node = NEXTOPER(node);
10699         }
10700         else {
10701             node = NEXTOPER(node);
10702             node += regarglen[(U8)op];
10703         }
10704         if (op == CURLYX || op == OPEN)
10705             indent++;
10706     }
10707     CLEAR_OPTSTART;
10708 #ifdef DEBUG_DUMPUNTIL
10709     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10710 #endif
10711     return node;
10712 }
10713
10714 #endif  /* DEBUGGING */
10715
10716 /*
10717  * Local variables:
10718  * c-indentation-style: bsd
10719  * c-basic-offset: 4
10720  * indent-tabs-mode: t
10721  * End:
10722  *
10723  * ex: set ts=8 sts=4 sw=4 noet:
10724  */