regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766     if (!(and_with->flags & ANYOF_CLASS)
 767         && !(cl->flags & ANYOF_CLASS)
 768         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 769         && !(and_with->flags & ANYOF_FOLD)
 770         && !(cl->flags & ANYOF_FOLD)) {
 771         int i;
 772
 773         if (and_with->flags & ANYOF_INVERT)
 774             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 775                 cl->bitmap[i] &= ~and_with->bitmap[i];
 776         else
 777             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 778                 cl->bitmap[i] &= and_with->bitmap[i];
 779     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 780     if (!(and_with->flags & ANYOF_EOS))
 781         cl->flags &= ~ANYOF_EOS;
 782
 783     if (!(and_with->flags & ANYOF_FOLD))
 784         cl->flags &= ~ANYOF_FOLD;
 785
 786     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 787         !(and_with->flags & ANYOF_INVERT)) {
 788         cl->flags &= ~ANYOF_UNICODE_ALL;
 789         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 790                                                            only the one(s)
 791                                                            actually set */
 792         ARG_SET(cl, ARG(and_with));
 793     }
 794     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 795         !(and_with->flags & ANYOF_INVERT))
 796         cl->flags &= ~ANYOF_UNICODE_ALL;
 797     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 798         !(and_with->flags & ANYOF_INVERT))
 799         cl->flags &= ~ANYOF_NONBITMAP;
 800 }
 801
 802 /* 'OR' a given class with another one.  Can create false positives */
 803 /* We assume that cl is not inverted */
 804 STATIC void
 805 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 806 {
 807     PERL_ARGS_ASSERT_CL_OR;
 808
 809     if (or_with->flags & ANYOF_INVERT) {
 810         /* We do not use
 811          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 812          *   <= (B1 | !B2) | (CL1 | !CL2)
 813          * which is wasteful if CL2 is small, but we ignore CL2:
 814          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 815          * XXXX Can we handle case-fold?  Unclear:
 816          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 817          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 818          */
 819         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 820              && !(or_with->flags & ANYOF_FOLD)
 821              && !(cl->flags & ANYOF_FOLD) ) {
 822             int i;
 823
 824             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 825                 cl->bitmap[i] |= ~or_with->bitmap[i];
 826         } /* XXXX: logic is complicated otherwise */
 827         else {
 828             cl_anything(pRExC_state, cl);
 829         }
 830     } else {
 831         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 832         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 833              && (!(or_with->flags & ANYOF_FOLD)
 834                  || (cl->flags & ANYOF_FOLD)) ) {
 835             int i;
 836
 837             /* OR char bitmap and class bitmap separately */
 838             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 839                 cl->bitmap[i] |= or_with->bitmap[i];
 840             if (or_with->flags & ANYOF_CLASS) {
 841                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 842                     cl->classflags[i] |= or_with->classflags[i];
 843                 cl->flags |= ANYOF_CLASS;
 844             }
 845         }
 846         else { /* XXXX: logic is complicated, leave it along for a moment. */
 847             cl_anything(pRExC_state, cl);
 848         }
 849     }
 850     if (or_with->flags & ANYOF_EOS)
 851         cl->flags |= ANYOF_EOS;
 852
 853     if (or_with->flags & ANYOF_FOLD)
 854         cl->flags |= ANYOF_FOLD;
 855
 856     /* If both nodes match something outside the bitmap, but what they match
 857      * outside is not the same pointer, and hence not easily compared, give up
 858      * and allow the start class to match everything outside the bitmap */
 859     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 860         ARG(cl) != ARG(or_with)) {
 861         cl->flags |= ANYOF_UNICODE_ALL;
 862     }
 863
 864     if (or_with->flags & ANYOF_UNICODE_ALL) {
 865         cl->flags |= ANYOF_UNICODE_ALL;
 866     }
 867 }
 868
 869 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 870 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 871 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 872 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 873
 874
 875 #ifdef DEBUGGING
 876 /*
 877    dump_trie(trie,widecharmap,revcharmap)
 878    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 879    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 880
 881    These routines dump out a trie in a somewhat readable format.
 882    The _interim_ variants are used for debugging the interim
 883    tables that are used to generate the final compressed
 884    representation which is what dump_trie expects.
 885
 886    Part of the reason for their existance is to provide a form
 887    of documentation as to how the different representations function.
 888
 889 */
 890
 891 /*
 892   Dumps the final compressed table form of the trie to Perl_debug_log.
 893   Used for debugging make_trie().
 894 */
 895
 896 STATIC void
 897 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 898             AV *revcharmap, U32 depth)
 899 {
 900     U32 state;
 901     SV *sv=sv_newmortal();
 902     int colwidth= widecharmap ? 6 : 4;
 903     U16 word;
 904     GET_RE_DEBUG_FLAGS_DECL;
 905
 906     PERL_ARGS_ASSERT_DUMP_TRIE;
 907
 908     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 909         (int)depth * 2 + 2,"",
 910         "Match","Base","Ofs" );
 911
 912     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 913         SV ** const tmp = av_fetch( revcharmap, state, 0);
 914         if ( tmp ) {
 915             PerlIO_printf( Perl_debug_log, "%*s",
 916                 colwidth,
 917                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 918                             PL_colors[0], PL_colors[1],
 919                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 920                             PERL_PV_ESCAPE_FIRSTCHAR
 921                 )
 922             );
 923         }
 924     }
 925     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 926         (int)depth * 2 + 2,"");
 927
 928     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 929         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 930     PerlIO_printf( Perl_debug_log, "\n");
 931
 932     for( state = 1 ; state < trie->statecount ; state++ ) {
 933         const U32 base = trie->states[ state ].trans.base;
 934
 935         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 936
 937         if ( trie->states[ state ].wordnum ) {
 938             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 939         } else {
 940             PerlIO_printf( Perl_debug_log, "%6s", "" );
 941         }
 942
 943         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 944
 945         if ( base ) {
 946             U32 ofs = 0;
 947
 948             while( ( base + ofs  < trie->uniquecharcount ) ||
 949                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 950                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 951                     ofs++;
 952
 953             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 954
 955             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 956                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 957                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 958                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 959                 {
 960                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 961                     colwidth,
 962                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 963                 } else {
 964                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 965                 }
 966             }
 967
 968             PerlIO_printf( Perl_debug_log, "]");
 969
 970         }
 971         PerlIO_printf( Perl_debug_log, "\n" );
 972     }
 973     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 974     for (word=1; word <= trie->wordcount; word++) {
 975         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 976             (int)word, (int)(trie->wordinfo[word].prev),
 977             (int)(trie->wordinfo[word].len));
 978     }
 979     PerlIO_printf(Perl_debug_log, "\n" );
 980 }
 981 /*
 982   Dumps a fully constructed but uncompressed trie in list form.
 983   List tries normally only are used for construction when the number of
 984   possible chars (trie->uniquecharcount) is very high.
 985   Used for debugging make_trie().
 986 */
 987 STATIC void
 988 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 989                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 990                          U32 depth)
 991 {
 992     U32 state;
 993     SV *sv=sv_newmortal();
 994     int colwidth= widecharmap ? 6 : 4;
 995     GET_RE_DEBUG_FLAGS_DECL;
 996
 997     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 998
 999     /* print out the table precompression.  */
1000     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002         "------:-----+-----------------\n" );
1003
1004     for( state=1 ; state < next_alloc ; state ++ ) {
1005         U16 charid;
1006
1007         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1008             (int)depth * 2 + 2,"", (UV)state  );
1009         if ( ! trie->states[ state ].wordnum ) {
1010             PerlIO_printf( Perl_debug_log, "%5s| ","");
1011         } else {
1012             PerlIO_printf( Perl_debug_log, "W%4x| ",
1013                 trie->states[ state ].wordnum
1014             );
1015         }
1016         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1017             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1018             if ( tmp ) {
1019                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1020                     colwidth,
1021                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1022                             PL_colors[0], PL_colors[1],
1023                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024                             PERL_PV_ESCAPE_FIRSTCHAR
1025                     ) ,
1026                     TRIE_LIST_ITEM(state,charid).forid,
1027                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1028                 );
1029                 if (!(charid % 10))
1030                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031                         (int)((depth * 2) + 14), "");
1032             }
1033         }
1034         PerlIO_printf( Perl_debug_log, "\n");
1035     }
1036 }
1037
1038 /*
1039   Dumps a fully constructed but uncompressed trie in table form.
1040   This is the normal DFA style state transition table, with a few
1041   twists to facilitate compression later.
1042   Used for debugging make_trie().
1043 */
1044 STATIC void
1045 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1046                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1047                           U32 depth)
1048 {
1049     U32 state;
1050     U16 charid;
1051     SV *sv=sv_newmortal();
1052     int colwidth= widecharmap ? 6 : 4;
1053     GET_RE_DEBUG_FLAGS_DECL;
1054
1055     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1056
1057     /*
1058        print out the table precompression so that we can do a visual check
1059        that they are identical.
1060      */
1061
1062     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1063
1064     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1066         if ( tmp ) {
1067             PerlIO_printf( Perl_debug_log, "%*s",
1068                 colwidth,
1069                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1070                             PL_colors[0], PL_colors[1],
1071                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072                             PERL_PV_ESCAPE_FIRSTCHAR
1073                 )
1074             );
1075         }
1076     }
1077
1078     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1079
1080     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1081         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1082     }
1083
1084     PerlIO_printf( Perl_debug_log, "\n" );
1085
1086     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1087
1088         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089             (int)depth * 2 + 2,"",
1090             (UV)TRIE_NODENUM( state ) );
1091
1092         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1093             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1094             if (v)
1095                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1096             else
1097                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1098         }
1099         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1101         } else {
1102             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1104         }
1105     }
1106 }
1107
1108 #endif
1109
1110
1111 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112   startbranch: the first branch in the whole branch sequence
1113   first      : start branch of sequence of branch-exact nodes.
1114                May be the same as startbranch
1115   last       : Thing following the last branch.
1116                May be the same as tail.
1117   tail       : item following the branch sequence
1118   count      : words in the sequence
1119   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120   depth      : indent depth
1121
1122 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1123
1124 A trie is an N'ary tree where the branches are determined by digital
1125 decomposition of the key. IE, at the root node you look up the 1st character and
1126 follow that branch repeat until you find the end of the branches. Nodes can be
1127 marked as "accepting" meaning they represent a complete word. Eg:
1128
1129   /he|she|his|hers/
1130
1131 would convert into the following structure. Numbers represent states, letters
1132 following numbers represent valid transitions on the letter from that state, if
1133 the number is in square brackets it represents an accepting state, otherwise it
1134 will be in parenthesis.
1135
1136       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1137       |    |
1138       |   (2)
1139       |    |
1140      (1)   +-i->(6)-+-s->[7]
1141       |
1142       +-s->(3)-+-h->(4)-+-e->[5]
1143
1144       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1145
1146 This shows that when matching against the string 'hers' we will begin at state 1
1147 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150 single traverse. We store a mapping from accepting to state to which word was
1151 matched, and then when we have multiple possibilities we try to complete the
1152 rest of the regex in the order in which they occured in the alternation.
1153
1154 The only prior NFA like behaviour that would be changed by the TRIE support is
1155 the silent ignoring of duplicate alternations which are of the form:
1156
1157  / (DUPE|DUPE) X? (?{ ... }) Y /x
1158
1159 Thus EVAL blocks following a trie may be called a different number of times with
1160 and without the optimisation. With the optimisations dupes will be silently
1161 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162 the following demonstrates:
1163
1164  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1165
1166 which prints out 'word' three times, but
1167
1168  'words'=~/(word|word|word)(?{ print $1 })S/
1169
1170 which doesnt print it out at all. This is due to other optimisations kicking in.
1171
1172 Example of what happens on a structural level:
1173
1174 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1175
1176    1: CURLYM[1] {1,32767}(18)
1177    5:   BRANCH(8)
1178    6:     EXACT <ac>(16)
1179    8:   BRANCH(11)
1180    9:     EXACT <ad>(16)
1181   11:   BRANCH(14)
1182   12:     EXACT <ab>(16)
1183   16:   SUCCEED(0)
1184   17:   NOTHING(18)
1185   18: END(0)
1186
1187 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188 and should turn into:
1189
1190    1: CURLYM[1] {1,32767}(18)
1191    5:   TRIE(16)
1192         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1193           <ac>
1194           <ad>
1195           <ab>
1196   16:   SUCCEED(0)
1197   17:   NOTHING(18)
1198   18: END(0)
1199
1200 Cases where tail != last would be like /(?foo|bar)baz/:
1201
1202    1: BRANCH(4)
1203    2:   EXACT <foo>(8)
1204    4: BRANCH(7)
1205    5:   EXACT <bar>(8)
1206    7: TAIL(8)
1207    8: EXACT <baz>(10)
1208   10: END(0)
1209
1210 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211 and would end up looking like:
1212
1213     1: TRIE(8)
1214       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1215         <foo>
1216         <bar>
1217    7: TAIL(8)
1218    8: EXACT <baz>(10)
1219   10: END(0)
1220
1221     d = uvuni_to_utf8_flags(d, uv, 0);
1222
1223 is the recommended Unicode-aware way of saying
1224
1225     *(d++) = uv;
1226 */
1227
1228 #define TRIE_STORE_REVCHAR                                                 \
1229     STMT_START {                                                           \
1230         if (UTF) {                                                         \
1231             SV *zlopp = newSV(2);                                          \
1232             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1233             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1234             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1235             SvPOK_on(zlopp);                                               \
1236             SvUTF8_on(zlopp);                                              \
1237             av_push(revcharmap, zlopp);                                    \
1238         } else {                                                           \
1239             char ooooff = (char)uvc;                                               \
1240             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1241         }                                                                  \
1242         } STMT_END
1243
1244 #define TRIE_READ_CHAR STMT_START {                                           \
1245     wordlen++;                                                                \
1246     if ( UTF ) {                                                              \
1247         if ( folder ) {                                                       \
1248             if ( foldlen > 0 ) {                                              \
1249                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1250                foldlen -= len;                                                \
1251                scan += len;                                                   \
1252                len = 0;                                                       \
1253             } else {                                                          \
1254                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1256                 foldlen -= UNISKIP( uvc );                                    \
1257                 scan = foldbuf + UNISKIP( uvc );                              \
1258             }                                                                 \
1259         } else {                                                              \
1260             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1261         }                                                                     \
1262     } else {                                                                  \
1263         uvc = (U32)*uc;                                                       \
1264         len = 1;                                                              \
1265     }                                                                         \
1266 } STMT_END
1267
1268
1269
1270 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1271     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1272         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1273         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1274     }                                                           \
1275     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1276     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1277     TRIE_LIST_CUR( state )++;                                   \
1278 } STMT_END
1279
1280 #define TRIE_LIST_NEW(state) STMT_START {                       \
1281     Newxz( trie->states[ state ].trans.list,               \
1282         4, reg_trie_trans_le );                                 \
1283      TRIE_LIST_CUR( state ) = 1;                                \
1284      TRIE_LIST_LEN( state ) = 4;                                \
1285 } STMT_END
1286
1287 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1288     U16 dupe= trie->states[ state ].wordnum;                    \
1289     regnode * const noper_next = regnext( noper );              \
1290                                                                 \
1291     DEBUG_r({                                                   \
1292         /* store the word for dumping */                        \
1293         SV* tmp;                                                \
1294         if (OP(noper) != NOTHING)                               \
1295             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1296         else                                                    \
1297             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1298         av_push( trie_words, tmp );                             \
1299     });                                                         \
1300                                                                 \
1301     curword++;                                                  \
1302     trie->wordinfo[curword].prev   = 0;                         \
1303     trie->wordinfo[curword].len    = wordlen;                   \
1304     trie->wordinfo[curword].accept = state;                     \
1305                                                                 \
1306     if ( noper_next < tail ) {                                  \
1307         if (!trie->jump)                                        \
1308             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1309         trie->jump[curword] = (U16)(noper_next - convert);      \
1310         if (!jumper)                                            \
1311             jumper = noper_next;                                \
1312         if (!nextbranch)                                        \
1313             nextbranch= regnext(cur);                           \
1314     }                                                           \
1315                                                                 \
1316     if ( dupe ) {                                               \
1317         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1318         /* chain, so that when the bits of chain are later    */\
1319         /* linked together, the dups appear in the chain      */\
1320         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321         trie->wordinfo[dupe].prev = curword;                    \
1322     } else {                                                    \
1323         /* we haven't inserted this word yet.                */ \
1324         trie->states[ state ].wordnum = curword;                \
1325     }                                                           \
1326 } STMT_END
1327
1328
1329 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1330      ( ( base + charid >=  ucharcount                                   \
1331          && base + charid < ubound                                      \
1332          && state == trie->trans[ base - ucharcount + charid ].check    \
1333          && trie->trans[ base - ucharcount + charid ].next )            \
1334            ? trie->trans[ base - ucharcount + charid ].next             \
1335            : ( state==1 ? special : 0 )                                 \
1336       )
1337
1338 #define MADE_TRIE       1
1339 #define MADE_JUMP_TRIE  2
1340 #define MADE_EXACT_TRIE 4
1341
1342 STATIC I32
1343 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1344 {
1345     dVAR;
1346     /* first pass, loop through and scan words */
1347     reg_trie_data *trie;
1348     HV *widecharmap = NULL;
1349     AV *revcharmap = newAV();
1350     regnode *cur;
1351     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1352     STRLEN len = 0;
1353     UV uvc = 0;
1354     U16 curword = 0;
1355     U32 next_alloc = 0;
1356     regnode *jumper = NULL;
1357     regnode *nextbranch = NULL;
1358     regnode *convert = NULL;
1359     U32 *prev_states; /* temp array mapping each state to previous one */
1360     /* we just use folder as a flag in utf8 */
1361     const U8 * folder = NULL;
1362
1363 #ifdef DEBUGGING
1364     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1365     AV *trie_words = NULL;
1366     /* along with revcharmap, this only used during construction but both are
1367      * useful during debugging so we store them in the struct when debugging.
1368      */
1369 #else
1370     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1371     STRLEN trie_charcount=0;
1372 #endif
1373     SV *re_trie_maxbuff;
1374     GET_RE_DEBUG_FLAGS_DECL;
1375
1376     PERL_ARGS_ASSERT_MAKE_TRIE;
1377 #ifndef DEBUGGING
1378     PERL_UNUSED_ARG(depth);
1379 #endif
1380
1381     switch (flags) {
1382         case EXACTFU: folder = PL_fold_latin1; break;
1383         case EXACTF:  folder = PL_fold; break;
1384         case EXACTFL: folder = PL_fold_locale; break;
1385     }
1386
1387     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1388     trie->refcount = 1;
1389     trie->startstate = 1;
1390     trie->wordcount = word_count;
1391     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1392     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1393     if (!(UTF && folder))
1394         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1395     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1397
1398     DEBUG_r({
1399         trie_words = newAV();
1400     });
1401
1402     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1403     if (!SvIOK(re_trie_maxbuff)) {
1404         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1405     }
1406     DEBUG_OPTIMISE_r({
1407                 PerlIO_printf( Perl_debug_log,
1408                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1409                   (int)depth * 2 + 2, "",
1410                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1411                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1412                   (int)depth);
1413     });
1414
1415    /* Find the node we are going to overwrite */
1416     if ( first == startbranch && OP( last ) != BRANCH ) {
1417         /* whole branch chain */
1418         convert = first;
1419     } else {
1420         /* branch sub-chain */
1421         convert = NEXTOPER( first );
1422     }
1423
1424     /*  -- First loop and Setup --
1425
1426        We first traverse the branches and scan each word to determine if it
1427        contains widechars, and how many unique chars there are, this is
1428        important as we have to build a table with at least as many columns as we
1429        have unique chars.
1430
1431        We use an array of integers to represent the character codes 0..255
1432        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1433        native representation of the character value as the key and IV's for the
1434        coded index.
1435
1436        *TODO* If we keep track of how many times each character is used we can
1437        remap the columns so that the table compression later on is more
1438        efficient in terms of memory by ensuring the most common value is in the
1439        middle and the least common are on the outside.  IMO this would be better
1440        than a most to least common mapping as theres a decent chance the most
1441        common letter will share a node with the least common, meaning the node
1442        will not be compressable. With a middle is most common approach the worst
1443        case is when we have the least common nodes twice.
1444
1445      */
1446
1447     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1448         regnode * const noper = NEXTOPER( cur );
1449         const U8 *uc = (U8*)STRING( noper );
1450         const U8 * const e  = uc + STR_LEN( noper );
1451         STRLEN foldlen = 0;
1452         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453         const U8 *scan = (U8*)NULL;
1454         U32 wordlen      = 0;         /* required init */
1455         STRLEN chars = 0;
1456         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1457
1458         if (OP(noper) == NOTHING) {
1459             trie->minlen= 0;
1460             continue;
1461         }
1462         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464                                           regardless of encoding */
1465
1466         for ( ; uc < e ; uc += len ) {
1467             TRIE_CHARCOUNT(trie)++;
1468             TRIE_READ_CHAR;
1469             chars++;
1470             if ( uvc < 256 ) {
1471                 if ( !trie->charmap[ uvc ] ) {
1472                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1473                     if ( folder )
1474                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1475                     TRIE_STORE_REVCHAR;
1476                 }
1477                 if ( set_bit ) {
1478                     /* store the codepoint in the bitmap, and its folded
1479                      * equivalent. */
1480                     TRIE_BITMAP_SET(trie,uvc);
1481
1482                     /* store the folded codepoint */
1483                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1484
1485                     if ( !UTF ) {
1486                         /* store first byte of utf8 representation of
1487                            variant codepoints */
1488                         if (! UNI_IS_INVARIANT(uvc)) {
1489                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1490                         }
1491                     }
1492                     set_bit = 0; /* We've done our bit :-) */
1493                 }
1494             } else {
1495                 SV** svpp;
1496                 if ( !widecharmap )
1497                     widecharmap = newHV();
1498
1499                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1500
1501                 if ( !svpp )
1502                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1503
1504                 if ( !SvTRUE( *svpp ) ) {
1505                     sv_setiv( *svpp, ++trie->uniquecharcount );
1506                     TRIE_STORE_REVCHAR;
1507                 }
1508             }
1509         }
1510         if( cur == first ) {
1511             trie->minlen=chars;
1512             trie->maxlen=chars;
1513         } else if (chars < trie->minlen) {
1514             trie->minlen=chars;
1515         } else if (chars > trie->maxlen) {
1516             trie->maxlen=chars;
1517         }
1518
1519     } /* end first pass */
1520     DEBUG_TRIE_COMPILE_r(
1521         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1522                 (int)depth * 2 + 2,"",
1523                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1524                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1525                 (int)trie->minlen, (int)trie->maxlen )
1526     );
1527
1528     /*
1529         We now know what we are dealing with in terms of unique chars and
1530         string sizes so we can calculate how much memory a naive
1531         representation using a flat table  will take. If it's over a reasonable
1532         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1533         conservative but potentially much slower representation using an array
1534         of lists.
1535
1536         At the end we convert both representations into the same compressed
1537         form that will be used in regexec.c for matching with. The latter
1538         is a form that cannot be used to construct with but has memory
1539         properties similar to the list form and access properties similar
1540         to the table form making it both suitable for fast searches and
1541         small enough that its feasable to store for the duration of a program.
1542
1543         See the comment in the code where the compressed table is produced
1544         inplace from the flat tabe representation for an explanation of how
1545         the compression works.
1546
1547     */
1548
1549
1550     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1551     prev_states[1] = 0;
1552
1553     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1554         /*
1555             Second Pass -- Array Of Lists Representation
1556
1557             Each state will be represented by a list of charid:state records
1558             (reg_trie_trans_le) the first such element holds the CUR and LEN
1559             points of the allocated array. (See defines above).
1560
1561             We build the initial structure using the lists, and then convert
1562             it into the compressed table form which allows faster lookups
1563             (but cant be modified once converted).
1564         */
1565
1566         STRLEN transcount = 1;
1567
1568         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1569             "%*sCompiling trie using list compiler\n",
1570             (int)depth * 2 + 2, ""));
1571
1572         trie->states = (reg_trie_state *)
1573             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1574                                   sizeof(reg_trie_state) );
1575         TRIE_LIST_NEW(1);
1576         next_alloc = 2;
1577
1578         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1579
1580             regnode * const noper = NEXTOPER( cur );
1581             U8 *uc           = (U8*)STRING( noper );
1582             const U8 * const e = uc + STR_LEN( noper );
1583             U32 state        = 1;         /* required init */
1584             U16 charid       = 0;         /* sanity init */
1585             U8 *scan         = (U8*)NULL; /* sanity init */
1586             STRLEN foldlen   = 0;         /* required init */
1587             U32 wordlen      = 0;         /* required init */
1588             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1589
1590             if (OP(noper) != NOTHING) {
1591                 for ( ; uc < e ; uc += len ) {
1592
1593                     TRIE_READ_CHAR;
1594
1595                     if ( uvc < 256 ) {
1596                         charid = trie->charmap[ uvc ];
1597                     } else {
1598                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1599                         if ( !svpp ) {
1600                             charid = 0;
1601                         } else {
1602                             charid=(U16)SvIV( *svpp );
1603                         }
1604                     }
1605                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1606                     if ( charid ) {
1607
1608                         U16 check;
1609                         U32 newstate = 0;
1610
1611                         charid--;
1612                         if ( !trie->states[ state ].trans.list ) {
1613                             TRIE_LIST_NEW( state );
1614                         }
1615                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1616                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1617                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1618                                 break;
1619                             }
1620                         }
1621                         if ( ! newstate ) {
1622                             newstate = next_alloc++;
1623                             prev_states[newstate] = state;
1624                             TRIE_LIST_PUSH( state, charid, newstate );
1625                             transcount++;
1626                         }
1627                         state = newstate;
1628                     } else {
1629                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1630                     }
1631                 }
1632             }
1633             TRIE_HANDLE_WORD(state);
1634
1635         } /* end second pass */
1636
1637         /* next alloc is the NEXT state to be allocated */
1638         trie->statecount = next_alloc;
1639         trie->states = (reg_trie_state *)
1640             PerlMemShared_realloc( trie->states,
1641                                    next_alloc
1642                                    * sizeof(reg_trie_state) );
1643
1644         /* and now dump it out before we compress it */
1645         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1646                                                          revcharmap, next_alloc,
1647                                                          depth+1)
1648         );
1649
1650         trie->trans = (reg_trie_trans *)
1651             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1652         {
1653             U32 state;
1654             U32 tp = 0;
1655             U32 zp = 0;
1656
1657
1658             for( state=1 ; state < next_alloc ; state ++ ) {
1659                 U32 base=0;
1660
1661                 /*
1662                 DEBUG_TRIE_COMPILE_MORE_r(
1663                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1664                 );
1665                 */
1666
1667                 if (trie->states[state].trans.list) {
1668                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1669                     U16 maxid=minid;
1670                     U16 idx;
1671
1672                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1673                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1674                         if ( forid < minid ) {
1675                             minid=forid;
1676                         } else if ( forid > maxid ) {
1677                             maxid=forid;
1678                         }
1679                     }
1680                     if ( transcount < tp + maxid - minid + 1) {
1681                         transcount *= 2;
1682                         trie->trans = (reg_trie_trans *)
1683                             PerlMemShared_realloc( trie->trans,
1684                                                      transcount
1685                                                      * sizeof(reg_trie_trans) );
1686                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1687                     }
1688                     base = trie->uniquecharcount + tp - minid;
1689                     if ( maxid == minid ) {
1690                         U32 set = 0;
1691                         for ( ; zp < tp ; zp++ ) {
1692                             if ( ! trie->trans[ zp ].next ) {
1693                                 base = trie->uniquecharcount + zp - minid;
1694                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1695                                 trie->trans[ zp ].check = state;
1696                                 set = 1;
1697                                 break;
1698                             }
1699                         }
1700                         if ( !set ) {
1701                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1702                             trie->trans[ tp ].check = state;
1703                             tp++;
1704                             zp = tp;
1705                         }
1706                     } else {
1707                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1708                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1709                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1710                             trie->trans[ tid ].check = state;
1711                         }
1712                         tp += ( maxid - minid + 1 );
1713                     }
1714                     Safefree(trie->states[ state ].trans.list);
1715                 }
1716                 /*
1717                 DEBUG_TRIE_COMPILE_MORE_r(
1718                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1719                 );
1720                 */
1721                 trie->states[ state ].trans.base=base;
1722             }
1723             trie->lasttrans = tp + 1;
1724         }
1725     } else {
1726         /*
1727            Second Pass -- Flat Table Representation.
1728
1729            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1730            We know that we will need Charcount+1 trans at most to store the data
1731            (one row per char at worst case) So we preallocate both structures
1732            assuming worst case.
1733
1734            We then construct the trie using only the .next slots of the entry
1735            structs.
1736
1737            We use the .check field of the first entry of the node temporarily to
1738            make compression both faster and easier by keeping track of how many non
1739            zero fields are in the node.
1740
1741            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1742            transition.
1743
1744            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1745            number representing the first entry of the node, and state as a
1746            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1747            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1748            are 2 entrys per node. eg:
1749
1750              A B       A B
1751           1. 2 4    1. 3 7
1752           2. 0 3    3. 0 5
1753           3. 0 0    5. 0 0
1754           4. 0 0    7. 0 0
1755
1756            The table is internally in the right hand, idx form. However as we also
1757            have to deal with the states array which is indexed by nodenum we have to
1758            use TRIE_NODENUM() to convert.
1759
1760         */
1761         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1762             "%*sCompiling trie using table compiler\n",
1763             (int)depth * 2 + 2, ""));
1764
1765         trie->trans = (reg_trie_trans *)
1766             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1767                                   * trie->uniquecharcount + 1,
1768                                   sizeof(reg_trie_trans) );
1769         trie->states = (reg_trie_state *)
1770             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1771                                   sizeof(reg_trie_state) );
1772         next_alloc = trie->uniquecharcount + 1;
1773
1774
1775         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1776
1777             regnode * const noper   = NEXTOPER( cur );
1778             const U8 *uc     = (U8*)STRING( noper );
1779             const U8 * const e = uc + STR_LEN( noper );
1780
1781             U32 state        = 1;         /* required init */
1782
1783             U16 charid       = 0;         /* sanity init */
1784             U32 accept_state = 0;         /* sanity init */
1785             U8 *scan         = (U8*)NULL; /* sanity init */
1786
1787             STRLEN foldlen   = 0;         /* required init */
1788             U32 wordlen      = 0;         /* required init */
1789             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1790
1791             if ( OP(noper) != NOTHING ) {
1792                 for ( ; uc < e ; uc += len ) {
1793
1794                     TRIE_READ_CHAR;
1795
1796                     if ( uvc < 256 ) {
1797                         charid = trie->charmap[ uvc ];
1798                     } else {
1799                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1800                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1801                     }
1802                     if ( charid ) {
1803                         charid--;
1804                         if ( !trie->trans[ state + charid ].next ) {
1805                             trie->trans[ state + charid ].next = next_alloc;
1806                             trie->trans[ state ].check++;
1807                             prev_states[TRIE_NODENUM(next_alloc)]
1808                                     = TRIE_NODENUM(state);
1809                             next_alloc += trie->uniquecharcount;
1810                         }
1811                         state = trie->trans[ state + charid ].next;
1812                     } else {
1813                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1814                     }
1815                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1816                 }
1817             }
1818             accept_state = TRIE_NODENUM( state );
1819             TRIE_HANDLE_WORD(accept_state);
1820
1821         } /* end second pass */
1822
1823         /* and now dump it out before we compress it */
1824         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1825                                                           revcharmap,
1826                                                           next_alloc, depth+1));
1827
1828         {
1829         /*
1830            * Inplace compress the table.*
1831
1832            For sparse data sets the table constructed by the trie algorithm will
1833            be mostly 0/FAIL transitions or to put it another way mostly empty.
1834            (Note that leaf nodes will not contain any transitions.)
1835
1836            This algorithm compresses the tables by eliminating most such
1837            transitions, at the cost of a modest bit of extra work during lookup:
1838
1839            - Each states[] entry contains a .base field which indicates the
1840            index in the state[] array wheres its transition data is stored.
1841
1842            - If .base is 0 there are no valid transitions from that node.
1843
1844            - If .base is nonzero then charid is added to it to find an entry in
1845            the trans array.
1846
1847            -If trans[states[state].base+charid].check!=state then the
1848            transition is taken to be a 0/Fail transition. Thus if there are fail
1849            transitions at the front of the node then the .base offset will point
1850            somewhere inside the previous nodes data (or maybe even into a node
1851            even earlier), but the .check field determines if the transition is
1852            valid.
1853
1854            XXX - wrong maybe?
1855            The following process inplace converts the table to the compressed
1856            table: We first do not compress the root node 1,and mark all its
1857            .check pointers as 1 and set its .base pointer as 1 as well. This
1858            allows us to do a DFA construction from the compressed table later,
1859            and ensures that any .base pointers we calculate later are greater
1860            than 0.
1861
1862            - We set 'pos' to indicate the first entry of the second node.
1863
1864            - We then iterate over the columns of the node, finding the first and
1865            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1866            and set the .check pointers accordingly, and advance pos
1867            appropriately and repreat for the next node. Note that when we copy
1868            the next pointers we have to convert them from the original
1869            NODEIDX form to NODENUM form as the former is not valid post
1870            compression.
1871
1872            - If a node has no transitions used we mark its base as 0 and do not
1873            advance the pos pointer.
1874
1875            - If a node only has one transition we use a second pointer into the
1876            structure to fill in allocated fail transitions from other states.
1877            This pointer is independent of the main pointer and scans forward
1878            looking for null transitions that are allocated to a state. When it
1879            finds one it writes the single transition into the "hole".  If the
1880            pointer doesnt find one the single transition is appended as normal.
1881
1882            - Once compressed we can Renew/realloc the structures to release the
1883            excess space.
1884
1885            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1886            specifically Fig 3.47 and the associated pseudocode.
1887
1888            demq
1889         */
1890         const U32 laststate = TRIE_NODENUM( next_alloc );
1891         U32 state, charid;
1892         U32 pos = 0, zp=0;
1893         trie->statecount = laststate;
1894
1895         for ( state = 1 ; state < laststate ; state++ ) {
1896             U8 flag = 0;
1897             const U32 stateidx = TRIE_NODEIDX( state );
1898             const U32 o_used = trie->trans[ stateidx ].check;
1899             U32 used = trie->trans[ stateidx ].check;
1900             trie->trans[ stateidx ].check = 0;
1901
1902             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1903                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1904                     if ( trie->trans[ stateidx + charid ].next ) {
1905                         if (o_used == 1) {
1906                             for ( ; zp < pos ; zp++ ) {
1907                                 if ( ! trie->trans[ zp ].next ) {
1908                                     break;
1909                                 }
1910                             }
1911                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1912                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913                             trie->trans[ zp ].check = state;
1914                             if ( ++zp > pos ) pos = zp;
1915                             break;
1916                         }
1917                         used--;
1918                     }
1919                     if ( !flag ) {
1920                         flag = 1;
1921                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1922                     }
1923                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1924                     trie->trans[ pos ].check = state;
1925                     pos++;
1926                 }
1927             }
1928         }
1929         trie->lasttrans = pos + 1;
1930         trie->states = (reg_trie_state *)
1931             PerlMemShared_realloc( trie->states, laststate
1932                                    * sizeof(reg_trie_state) );
1933         DEBUG_TRIE_COMPILE_MORE_r(
1934                 PerlIO_printf( Perl_debug_log,
1935                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1936                     (int)depth * 2 + 2,"",
1937                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1938                     (IV)next_alloc,
1939                     (IV)pos,
1940                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1941             );
1942
1943         } /* end table compress */
1944     }
1945     DEBUG_TRIE_COMPILE_MORE_r(
1946             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1947                 (int)depth * 2 + 2, "",
1948                 (UV)trie->statecount,
1949                 (UV)trie->lasttrans)
1950     );
1951     /* resize the trans array to remove unused space */
1952     trie->trans = (reg_trie_trans *)
1953         PerlMemShared_realloc( trie->trans, trie->lasttrans
1954                                * sizeof(reg_trie_trans) );
1955
1956     {   /* Modify the program and insert the new TRIE node */
1957         U8 nodetype =(U8)(flags & 0xFF);
1958         char *str=NULL;
1959
1960 #ifdef DEBUGGING
1961         regnode *optimize = NULL;
1962 #ifdef RE_TRACK_PATTERN_OFFSETS
1963
1964         U32 mjd_offset = 0;
1965         U32 mjd_nodelen = 0;
1966 #endif /* RE_TRACK_PATTERN_OFFSETS */
1967 #endif /* DEBUGGING */
1968         /*
1969            This means we convert either the first branch or the first Exact,
1970            depending on whether the thing following (in 'last') is a branch
1971            or not and whther first is the startbranch (ie is it a sub part of
1972            the alternation or is it the whole thing.)
1973            Assuming its a sub part we convert the EXACT otherwise we convert
1974            the whole branch sequence, including the first.
1975          */
1976         /* Find the node we are going to overwrite */
1977         if ( first != startbranch || OP( last ) == BRANCH ) {
1978             /* branch sub-chain */
1979             NEXT_OFF( first ) = (U16)(last - first);
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1981             DEBUG_r({
1982                 mjd_offset= Node_Offset((convert));
1983                 mjd_nodelen= Node_Length((convert));
1984             });
1985 #endif
1986             /* whole branch chain */
1987         }
1988 #ifdef RE_TRACK_PATTERN_OFFSETS
1989         else {
1990             DEBUG_r({
1991                 const  regnode *nop = NEXTOPER( convert );
1992                 mjd_offset= Node_Offset((nop));
1993                 mjd_nodelen= Node_Length((nop));
1994             });
1995         }
1996         DEBUG_OPTIMISE_r(
1997             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1998                 (int)depth * 2 + 2, "",
1999                 (UV)mjd_offset, (UV)mjd_nodelen)
2000         );
2001 #endif
2002         /* But first we check to see if there is a common prefix we can
2003            split out as an EXACT and put in front of the TRIE node.  */
2004         trie->startstate= 1;
2005         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2006             U32 state;
2007             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2008                 U32 ofs = 0;
2009                 I32 idx = -1;
2010                 U32 count = 0;
2011                 const U32 base = trie->states[ state ].trans.base;
2012
2013                 if ( trie->states[state].wordnum )
2014                         count = 1;
2015
2016                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2017                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2018                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2019                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2020                     {
2021                         if ( ++count > 1 ) {
2022                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2023                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2024                             if ( state == 1 ) break;
2025                             if ( count == 2 ) {
2026                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2027                                 DEBUG_OPTIMISE_r(
2028                                     PerlIO_printf(Perl_debug_log,
2029                                         "%*sNew Start State=%"UVuf" Class: [",
2030                                         (int)depth * 2 + 2, "",
2031                                         (UV)state));
2032                                 if (idx >= 0) {
2033                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2034                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2035
2036                                     TRIE_BITMAP_SET(trie,*ch);
2037                                     if ( folder )
2038                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2039                                     DEBUG_OPTIMISE_r(
2040                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2041                                     );
2042                                 }
2043                             }
2044                             TRIE_BITMAP_SET(trie,*ch);
2045                             if ( folder )
2046                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2047                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2048                         }
2049                         idx = ofs;
2050                     }
2051                 }
2052                 if ( count == 1 ) {
2053                     SV **tmp = av_fetch( revcharmap, idx, 0);
2054                     STRLEN len;
2055                     char *ch = SvPV( *tmp, len );
2056                     DEBUG_OPTIMISE_r({
2057                         SV *sv=sv_newmortal();
2058                         PerlIO_printf( Perl_debug_log,
2059                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2060                             (int)depth * 2 + 2, "",
2061                             (UV)state, (UV)idx,
2062                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2063                                 PL_colors[0], PL_colors[1],
2064                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2065                                 PERL_PV_ESCAPE_FIRSTCHAR
2066                             )
2067                         );
2068                     });
2069                     if ( state==1 ) {
2070                         OP( convert ) = nodetype;
2071                         str=STRING(convert);
2072                         STR_LEN(convert)=0;
2073                     }
2074                     STR_LEN(convert) += len;
2075                     while (len--)
2076                         *str++ = *ch++;
2077                 } else {
2078 #ifdef DEBUGGING
2079                     if (state>1)
2080                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2081 #endif
2082                     break;
2083                 }
2084             }
2085             trie->prefixlen = (state-1);
2086             if (str) {
2087                 regnode *n = convert+NODE_SZ_STR(convert);
2088                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2089                 trie->startstate = state;
2090                 trie->minlen -= (state - 1);
2091                 trie->maxlen -= (state - 1);
2092 #ifdef DEBUGGING
2093                /* At least the UNICOS C compiler choked on this
2094                 * being argument to DEBUG_r(), so let's just have
2095                 * it right here. */
2096                if (
2097 #ifdef PERL_EXT_RE_BUILD
2098                    1
2099 #else
2100                    DEBUG_r_TEST
2101 #endif
2102                    ) {
2103                    regnode *fix = convert;
2104                    U32 word = trie->wordcount;
2105                    mjd_nodelen++;
2106                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2107                    while( ++fix < n ) {
2108                        Set_Node_Offset_Length(fix, 0, 0);
2109                    }
2110                    while (word--) {
2111                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2112                        if (tmp) {
2113                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2114                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2115                            else
2116                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2117                        }
2118                    }
2119                }
2120 #endif
2121                 if (trie->maxlen) {
2122                     convert = n;
2123                 } else {
2124                     NEXT_OFF(convert) = (U16)(tail - convert);
2125                     DEBUG_r(optimize= n);
2126                 }
2127             }
2128         }
2129         if (!jumper)
2130             jumper = last;
2131         if ( trie->maxlen ) {
2132             NEXT_OFF( convert ) = (U16)(tail - convert);
2133             ARG_SET( convert, data_slot );
2134             /* Store the offset to the first unabsorbed branch in
2135                jump[0], which is otherwise unused by the jump logic.
2136                We use this when dumping a trie and during optimisation. */
2137             if (trie->jump)
2138                 trie->jump[0] = (U16)(nextbranch - convert);
2139
2140             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2141              *   and there is a bitmap
2142              *   and the first "jump target" node we found leaves enough room
2143              * then convert the TRIE node into a TRIEC node, with the bitmap
2144              * embedded inline in the opcode - this is hypothetically faster.
2145              */
2146             if ( !trie->states[trie->startstate].wordnum
2147                  && trie->bitmap
2148                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2149             {
2150                 OP( convert ) = TRIEC;
2151                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2152                 PerlMemShared_free(trie->bitmap);
2153                 trie->bitmap= NULL;
2154             } else
2155                 OP( convert ) = TRIE;
2156
2157             /* store the type in the flags */
2158             convert->flags = nodetype;
2159             DEBUG_r({
2160             optimize = convert
2161                       + NODE_STEP_REGNODE
2162                       + regarglen[ OP( convert ) ];
2163             });
2164             /* XXX We really should free up the resource in trie now,
2165                    as we won't use them - (which resources?) dmq */
2166         }
2167         /* needed for dumping*/
2168         DEBUG_r(if (optimize) {
2169             regnode *opt = convert;
2170
2171             while ( ++opt < optimize) {
2172                 Set_Node_Offset_Length(opt,0,0);
2173             }
2174             /*
2175                 Try to clean up some of the debris left after the
2176                 optimisation.
2177              */
2178             while( optimize < jumper ) {
2179                 mjd_nodelen += Node_Length((optimize));
2180                 OP( optimize ) = OPTIMIZED;
2181                 Set_Node_Offset_Length(optimize,0,0);
2182                 optimize++;
2183             }
2184             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2185         });
2186     } /* end node insert */
2187
2188     /*  Finish populating the prev field of the wordinfo array.  Walk back
2189      *  from each accept state until we find another accept state, and if
2190      *  so, point the first word's .prev field at the second word. If the
2191      *  second already has a .prev field set, stop now. This will be the
2192      *  case either if we've already processed that word's accept state,
2193      *  or that state had multiple words, and the overspill words were
2194      *  already linked up earlier.
2195      */
2196     {
2197         U16 word;
2198         U32 state;
2199         U16 prev;
2200
2201         for (word=1; word <= trie->wordcount; word++) {
2202             prev = 0;
2203             if (trie->wordinfo[word].prev)
2204                 continue;
2205             state = trie->wordinfo[word].accept;
2206             while (state) {
2207                 state = prev_states[state];
2208                 if (!state)
2209                     break;
2210                 prev = trie->states[state].wordnum;
2211                 if (prev)
2212                     break;
2213             }
2214             trie->wordinfo[word].prev = prev;
2215         }
2216         Safefree(prev_states);
2217     }
2218
2219
2220     /* and now dump out the compressed format */
2221     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2222
2223     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2224 #ifdef DEBUGGING
2225     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2226     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2227 #else
2228     SvREFCNT_dec(revcharmap);
2229 #endif
2230     return trie->jump
2231            ? MADE_JUMP_TRIE
2232            : trie->startstate>1
2233              ? MADE_EXACT_TRIE
2234              : MADE_TRIE;
2235 }
2236
2237 STATIC void
2238 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2239 {
2240 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2241
2242    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2243    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2244    ISBN 0-201-10088-6
2245
2246    We find the fail state for each state in the trie, this state is the longest proper
2247    suffix of the current state's 'word' that is also a proper prefix of another word in our
2248    trie. State 1 represents the word '' and is thus the default fail state. This allows
2249    the DFA not to have to restart after its tried and failed a word at a given point, it
2250    simply continues as though it had been matching the other word in the first place.
2251    Consider
2252       'abcdgu'=~/abcdefg|cdgu/
2253    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2254    fail, which would bring us to the state representing 'd' in the second word where we would
2255    try 'g' and succeed, proceeding to match 'cdgu'.
2256  */
2257  /* add a fail transition */
2258     const U32 trie_offset = ARG(source);
2259     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2260     U32 *q;
2261     const U32 ucharcount = trie->uniquecharcount;
2262     const U32 numstates = trie->statecount;
2263     const U32 ubound = trie->lasttrans + ucharcount;
2264     U32 q_read = 0;
2265     U32 q_write = 0;
2266     U32 charid;
2267     U32 base = trie->states[ 1 ].trans.base;
2268     U32 *fail;
2269     reg_ac_data *aho;
2270     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2271     GET_RE_DEBUG_FLAGS_DECL;
2272
2273     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2274 #ifndef DEBUGGING
2275     PERL_UNUSED_ARG(depth);
2276 #endif
2277
2278
2279     ARG_SET( stclass, data_slot );
2280     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2281     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2282     aho->trie=trie_offset;
2283     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2284     Copy( trie->states, aho->states, numstates, reg_trie_state );
2285     Newxz( q, numstates, U32);
2286     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2287     aho->refcount = 1;
2288     fail = aho->fail;
2289     /* initialize fail[0..1] to be 1 so that we always have
2290        a valid final fail state */
2291     fail[ 0 ] = fail[ 1 ] = 1;
2292
2293     for ( charid = 0; charid < ucharcount ; charid++ ) {
2294         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2295         if ( newstate ) {
2296             q[ q_write ] = newstate;
2297             /* set to point at the root */
2298             fail[ q[ q_write++ ] ]=1;
2299         }
2300     }
2301     while ( q_read < q_write) {
2302         const U32 cur = q[ q_read++ % numstates ];
2303         base = trie->states[ cur ].trans.base;
2304
2305         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2306             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2307             if (ch_state) {
2308                 U32 fail_state = cur;
2309                 U32 fail_base;
2310                 do {
2311                     fail_state = fail[ fail_state ];
2312                     fail_base = aho->states[ fail_state ].trans.base;
2313                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2314
2315                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2316                 fail[ ch_state ] = fail_state;
2317                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2318                 {
2319                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2320                 }
2321                 q[ q_write++ % numstates] = ch_state;
2322             }
2323         }
2324     }
2325     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2326        when we fail in state 1, this allows us to use the
2327        charclass scan to find a valid start char. This is based on the principle
2328        that theres a good chance the string being searched contains lots of stuff
2329        that cant be a start char.
2330      */
2331     fail[ 0 ] = fail[ 1 ] = 0;
2332     DEBUG_TRIE_COMPILE_r({
2333         PerlIO_printf(Perl_debug_log,
2334                       "%*sStclass Failtable (%"UVuf" states): 0",
2335                       (int)(depth * 2), "", (UV)numstates
2336         );
2337         for( q_read=1; q_read<numstates; q_read++ ) {
2338             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2339         }
2340         PerlIO_printf(Perl_debug_log, "\n");
2341     });
2342     Safefree(q);
2343     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2344 }
2345
2346
2347 /*
2348  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2349  * These need to be revisited when a newer toolchain becomes available.
2350  */
2351 #if defined(__sparc64__) && defined(__GNUC__)
2352 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2353 #       undef  SPARC64_GCC_WORKAROUND
2354 #       define SPARC64_GCC_WORKAROUND 1
2355 #   endif
2356 #endif
2357
2358 #define DEBUG_PEEP(str,scan,depth) \
2359     DEBUG_OPTIMISE_r({if (scan){ \
2360        SV * const mysv=sv_newmortal(); \
2361        regnode *Next = regnext(scan); \
2362        regprop(RExC_rx, mysv, scan); \
2363        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2364        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2365        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2366    }});
2367
2368
2369
2370
2371
2372 #define JOIN_EXACT(scan,min,flags) \
2373     if (PL_regkind[OP(scan)] == EXACT) \
2374         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2375
2376 STATIC U32
2377 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2378     /* Merge several consecutive EXACTish nodes into one. */
2379     regnode *n = regnext(scan);
2380     U32 stringok = 1;
2381     regnode *next = scan + NODE_SZ_STR(scan);
2382     U32 merged = 0;
2383     U32 stopnow = 0;
2384 #ifdef DEBUGGING
2385     regnode *stop = scan;
2386     GET_RE_DEBUG_FLAGS_DECL;
2387 #else
2388     PERL_UNUSED_ARG(depth);
2389 #endif
2390
2391     PERL_ARGS_ASSERT_JOIN_EXACT;
2392 #ifndef EXPERIMENTAL_INPLACESCAN
2393     PERL_UNUSED_ARG(flags);
2394     PERL_UNUSED_ARG(val);
2395 #endif
2396     DEBUG_PEEP("join",scan,depth);
2397
2398     /* Skip NOTHING, merge EXACT*. */
2399     while (n &&
2400            ( PL_regkind[OP(n)] == NOTHING ||
2401              (stringok && (OP(n) == OP(scan))))
2402            && NEXT_OFF(n)
2403            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2404
2405         if (OP(n) == TAIL || n > next)
2406             stringok = 0;
2407         if (PL_regkind[OP(n)] == NOTHING) {
2408             DEBUG_PEEP("skip:",n,depth);
2409             NEXT_OFF(scan) += NEXT_OFF(n);
2410             next = n + NODE_STEP_REGNODE;
2411 #ifdef DEBUGGING
2412             if (stringok)
2413                 stop = n;
2414 #endif
2415             n = regnext(n);
2416         }
2417         else if (stringok) {
2418             const unsigned int oldl = STR_LEN(scan);
2419             regnode * const nnext = regnext(n);
2420
2421             DEBUG_PEEP("merg",n,depth);
2422
2423             merged++;
2424             if (oldl + STR_LEN(n) > U8_MAX)
2425                 break;
2426             NEXT_OFF(scan) += NEXT_OFF(n);
2427             STR_LEN(scan) += STR_LEN(n);
2428             next = n + NODE_SZ_STR(n);
2429             /* Now we can overwrite *n : */
2430             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2431 #ifdef DEBUGGING
2432             stop = next - 1;
2433 #endif
2434             n = nnext;
2435             if (stopnow) break;
2436         }
2437
2438 #ifdef EXPERIMENTAL_INPLACESCAN
2439         if (flags && !NEXT_OFF(n)) {
2440             DEBUG_PEEP("atch", val, depth);
2441             if (reg_off_by_arg[OP(n)]) {
2442                 ARG_SET(n, val - n);
2443             }
2444             else {
2445                 NEXT_OFF(n) = val - n;
2446             }
2447             stopnow = 1;
2448         }
2449 #endif
2450     }
2451 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2452 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2453 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2454 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2455
2456     if (UTF
2457         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2458         && ( STR_LEN(scan) >= 6 ) )
2459     {
2460     /*
2461     Two problematic code points in Unicode casefolding of EXACT nodes:
2462
2463     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2464     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2465
2466     which casefold to
2467
2468     Unicode                      UTF-8
2469
2470     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2471     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2472
2473     This means that in case-insensitive matching (or "loose matching",
2474     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2475     length of the above casefolded versions) can match a target string
2476     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2477     This would rather mess up the minimum length computation.
2478
2479     What we'll do is to look for the tail four bytes, and then peek
2480     at the preceding two bytes to see whether we need to decrease
2481     the minimum length by four (six minus two).
2482
2483     Thanks to the design of UTF-8, there cannot be false matches:
2484     A sequence of valid UTF-8 bytes cannot be a subsequence of
2485     another valid sequence of UTF-8 bytes.
2486
2487     */
2488          char * const s0 = STRING(scan), *s, *t;
2489          char * const s1 = s0 + STR_LEN(scan) - 1;
2490          char * const s2 = s1 - 4;
2491 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2492          const char t0[] = "\xaf\x49\xaf\x42";
2493 #else
2494          const char t0[] = "\xcc\x88\xcc\x81";
2495 #endif
2496          const char * const t1 = t0 + 3;
2497
2498          for (s = s0 + 2;
2499               s < s2 && (t = ninstr(s, s1, t0, t1));
2500               s = t + 4) {
2501 #ifdef EBCDIC
2502               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2503                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2504 #else
2505               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2506                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2507 #endif
2508                    *min -= 4;
2509          }
2510     }
2511
2512 #ifdef DEBUGGING
2513     /* Allow dumping */
2514     n = scan + NODE_SZ_STR(scan);
2515     while (n <= stop) {
2516         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2517             OP(n) = OPTIMIZED;
2518             NEXT_OFF(n) = 0;
2519         }
2520         n++;
2521     }
2522 #endif
2523     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2524     return stopnow;
2525 }
2526
2527 /* REx optimizer.  Converts nodes into quickier variants "in place".
2528    Finds fixed substrings.  */
2529
2530 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2531    to the position after last scanned or to NULL. */
2532
2533 #define INIT_AND_WITHP \
2534     assert(!and_withp); \
2535     Newx(and_withp,1,struct regnode_charclass_class); \
2536     SAVEFREEPV(and_withp)
2537
2538 /* this is a chain of data about sub patterns we are processing that
2539    need to be handled seperately/specially in study_chunk. Its so
2540    we can simulate recursion without losing state.  */
2541 struct scan_frame;
2542 typedef struct scan_frame {
2543     regnode *last;  /* last node to process in this frame */
2544     regnode *next;  /* next node to process when last is reached */
2545     struct scan_frame *prev; /*previous frame*/
2546     I32 stop; /* what stopparen do we use */
2547 } scan_frame;
2548
2549
2550 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2551
2552 #define CASE_SYNST_FNC(nAmE)                                       \
2553 case nAmE:                                                         \
2554     if (flags & SCF_DO_STCLASS_AND) {                              \
2555             for (value = 0; value < 256; value++)                  \
2556                 if (!is_ ## nAmE ## _cp(value))                       \
2557                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2558     }                                                              \
2559     else {                                                         \
2560             for (value = 0; value < 256; value++)                  \
2561                 if (is_ ## nAmE ## _cp(value))                        \
2562                     ANYOF_BITMAP_SET(data->start_class, value);    \
2563     }                                                              \
2564     break;                                                         \
2565 case N ## nAmE:                                                    \
2566     if (flags & SCF_DO_STCLASS_AND) {                              \
2567             for (value = 0; value < 256; value++)                   \
2568                 if (is_ ## nAmE ## _cp(value))                         \
2569                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2570     }                                                               \
2571     else {                                                          \
2572             for (value = 0; value < 256; value++)                   \
2573                 if (!is_ ## nAmE ## _cp(value))                        \
2574                     ANYOF_BITMAP_SET(data->start_class, value);     \
2575     }                                                               \
2576     break
2577
2578
2579
2580 STATIC I32
2581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2582                         I32 *minlenp, I32 *deltap,
2583                         regnode *last,
2584                         scan_data_t *data,
2585                         I32 stopparen,
2586                         U8* recursed,
2587                         struct regnode_charclass_class *and_withp,
2588                         U32 flags, U32 depth)
2589                         /* scanp: Start here (read-write). */
2590                         /* deltap: Write maxlen-minlen here. */
2591                         /* last: Stop before this one. */
2592                         /* data: string data about the pattern */
2593                         /* stopparen: treat close N as END */
2594                         /* recursed: which subroutines have we recursed into */
2595                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2596 {
2597     dVAR;
2598     I32 min = 0, pars = 0, code;
2599     regnode *scan = *scanp, *next;
2600     I32 delta = 0;
2601     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2602     int is_inf_internal = 0;            /* The studied chunk is infinite */
2603     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2604     scan_data_t data_fake;
2605     SV *re_trie_maxbuff = NULL;
2606     regnode *first_non_open = scan;
2607     I32 stopmin = I32_MAX;
2608     scan_frame *frame = NULL;
2609     GET_RE_DEBUG_FLAGS_DECL;
2610
2611     PERL_ARGS_ASSERT_STUDY_CHUNK;
2612
2613 #ifdef DEBUGGING
2614     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2615 #endif
2616
2617     if ( depth == 0 ) {
2618         while (first_non_open && OP(first_non_open) == OPEN)
2619             first_non_open=regnext(first_non_open);
2620     }
2621
2622
2623   fake_study_recurse:
2624     while ( scan && OP(scan) != END && scan < last ){
2625         /* Peephole optimizer: */
2626         DEBUG_STUDYDATA("Peep:", data,depth);
2627         DEBUG_PEEP("Peep",scan,depth);
2628         JOIN_EXACT(scan,&min,0);
2629
2630         /* Follow the next-chain of the current node and optimize
2631            away all the NOTHINGs from it.  */
2632         if (OP(scan) != CURLYX) {
2633             const int max = (reg_off_by_arg[OP(scan)]
2634                        ? I32_MAX
2635                        /* I32 may be smaller than U16 on CRAYs! */
2636                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2637             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2638             int noff;
2639             regnode *n = scan;
2640
2641             /* Skip NOTHING and LONGJMP. */
2642             while ((n = regnext(n))
2643                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2644                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2645                    && off + noff < max)
2646                 off += noff;
2647             if (reg_off_by_arg[OP(scan)])
2648                 ARG(scan) = off;
2649             else
2650                 NEXT_OFF(scan) = off;
2651         }
2652
2653
2654
2655         /* The principal pseudo-switch.  Cannot be a switch, since we
2656            look into several different things.  */
2657         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2658                    || OP(scan) == IFTHEN) {
2659             next = regnext(scan);
2660             code = OP(scan);
2661             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2662
2663             if (OP(next) == code || code == IFTHEN) {
2664                 /* NOTE - There is similar code to this block below for handling
2665                    TRIE nodes on a re-study.  If you change stuff here check there
2666                    too. */
2667                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2668                 struct regnode_charclass_class accum;
2669                 regnode * const startbranch=scan;
2670
2671                 if (flags & SCF_DO_SUBSTR)
2672                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2673                 if (flags & SCF_DO_STCLASS)
2674                     cl_init_zero(pRExC_state, &accum);
2675
2676                 while (OP(scan) == code) {
2677                     I32 deltanext, minnext, f = 0, fake;
2678                     struct regnode_charclass_class this_class;
2679
2680                     num++;
2681                     data_fake.flags = 0;
2682                     if (data) {
2683                         data_fake.whilem_c = data->whilem_c;
2684                         data_fake.last_closep = data->last_closep;
2685                     }
2686                     else
2687                         data_fake.last_closep = &fake;
2688
2689                     data_fake.pos_delta = delta;
2690                     next = regnext(scan);
2691                     scan = NEXTOPER(scan);
2692                     if (code != BRANCH)
2693                         scan = NEXTOPER(scan);
2694                     if (flags & SCF_DO_STCLASS) {
2695                         cl_init(pRExC_state, &this_class);
2696                         data_fake.start_class = &this_class;
2697                         f = SCF_DO_STCLASS_AND;
2698                     }
2699                     if (flags & SCF_WHILEM_VISITED_POS)
2700                         f |= SCF_WHILEM_VISITED_POS;
2701
2702                     /* we suppose the run is continuous, last=next...*/
2703                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2704                                           next, &data_fake,
2705                                           stopparen, recursed, NULL, f,depth+1);
2706                     if (min1 > minnext)
2707                         min1 = minnext;
2708                     if (max1 < minnext + deltanext)
2709                         max1 = minnext + deltanext;
2710                     if (deltanext == I32_MAX)
2711                         is_inf = is_inf_internal = 1;
2712                     scan = next;
2713                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2714                         pars++;
2715                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2716                         if ( stopmin > minnext)
2717                             stopmin = min + min1;
2718                         flags &= ~SCF_DO_SUBSTR;
2719                         if (data)
2720                             data->flags |= SCF_SEEN_ACCEPT;
2721                     }
2722                     if (data) {
2723                         if (data_fake.flags & SF_HAS_EVAL)
2724                             data->flags |= SF_HAS_EVAL;
2725                         data->whilem_c = data_fake.whilem_c;
2726                     }
2727                     if (flags & SCF_DO_STCLASS)
2728                         cl_or(pRExC_state, &accum, &this_class);
2729                 }
2730                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2731                     min1 = 0;
2732                 if (flags & SCF_DO_SUBSTR) {
2733                     data->pos_min += min1;
2734                     data->pos_delta += max1 - min1;
2735                     if (max1 != min1 || is_inf)
2736                         data->longest = &(data->longest_float);
2737                 }
2738                 min += min1;
2739                 delta += max1 - min1;
2740                 if (flags & SCF_DO_STCLASS_OR) {
2741                     cl_or(pRExC_state, data->start_class, &accum);
2742                     if (min1) {
2743                         cl_and(data->start_class, and_withp);
2744                         flags &= ~SCF_DO_STCLASS;
2745                     }
2746                 }
2747                 else if (flags & SCF_DO_STCLASS_AND) {
2748                     if (min1) {
2749                         cl_and(data->start_class, &accum);
2750                         flags &= ~SCF_DO_STCLASS;
2751                     }
2752                     else {
2753                         /* Switch to OR mode: cache the old value of
2754                          * data->start_class */
2755                         INIT_AND_WITHP;
2756                         StructCopy(data->start_class, and_withp,
2757                                    struct regnode_charclass_class);
2758                         flags &= ~SCF_DO_STCLASS_AND;
2759                         StructCopy(&accum, data->start_class,
2760                                    struct regnode_charclass_class);
2761                         flags |= SCF_DO_STCLASS_OR;
2762                         data->start_class->flags |= ANYOF_EOS;
2763                     }
2764                 }
2765
2766                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2767                 /* demq.
2768
2769                    Assuming this was/is a branch we are dealing with: 'scan' now
2770                    points at the item that follows the branch sequence, whatever
2771                    it is. We now start at the beginning of the sequence and look
2772                    for subsequences of
2773
2774                    BRANCH->EXACT=>x1
2775                    BRANCH->EXACT=>x2
2776                    tail
2777
2778                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2779
2780                    If we can find such a subseqence we need to turn the first
2781                    element into a trie and then add the subsequent branch exact
2782                    strings to the trie.
2783
2784                    We have two cases
2785
2786                      1. patterns where the whole set of branches can be converted.
2787
2788                      2. patterns where only a subset can be converted.
2789
2790                    In case 1 we can replace the whole set with a single regop
2791                    for the trie. In case 2 we need to keep the start and end
2792                    branches so
2793
2794                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2795                      becomes BRANCH TRIE; BRANCH X;
2796
2797                   There is an additional case, that being where there is a
2798                   common prefix, which gets split out into an EXACT like node
2799                   preceding the TRIE node.
2800
2801                   If x(1..n)==tail then we can do a simple trie, if not we make
2802                   a "jump" trie, such that when we match the appropriate word
2803                   we "jump" to the appopriate tail node. Essentailly we turn
2804                   a nested if into a case structure of sorts.
2805
2806                 */
2807
2808                     int made=0;
2809                     if (!re_trie_maxbuff) {
2810                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2811                         if (!SvIOK(re_trie_maxbuff))
2812                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2813                     }
2814                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2815                         regnode *cur;
2816                         regnode *first = (regnode *)NULL;
2817                         regnode *last = (regnode *)NULL;
2818                         regnode *tail = scan;
2819                         U8 optype = 0;
2820                         U32 count=0;
2821
2822 #ifdef DEBUGGING
2823                         SV * const mysv = sv_newmortal();       /* for dumping */
2824 #endif
2825                         /* var tail is used because there may be a TAIL
2826                            regop in the way. Ie, the exacts will point to the
2827                            thing following the TAIL, but the last branch will
2828                            point at the TAIL. So we advance tail. If we
2829                            have nested (?:) we may have to move through several
2830                            tails.
2831                          */
2832
2833                         while ( OP( tail ) == TAIL ) {
2834                             /* this is the TAIL generated by (?:) */
2835                             tail = regnext( tail );
2836                         }
2837
2838
2839                         DEBUG_OPTIMISE_r({
2840                             regprop(RExC_rx, mysv, tail );
2841                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2842                                 (int)depth * 2 + 2, "",
2843                                 "Looking for TRIE'able sequences. Tail node is: ",
2844                                 SvPV_nolen_const( mysv )
2845                             );
2846                         });
2847
2848                         /*
2849
2850                            step through the branches, cur represents each
2851                            branch, noper is the first thing to be matched
2852                            as part of that branch and noper_next is the
2853                            regnext() of that node. if noper is an EXACT
2854                            and noper_next is the same as scan (our current
2855                            position in the regex) then the EXACT branch is
2856                            a possible optimization target. Once we have
2857                            two or more consequetive such branches we can
2858                            create a trie of the EXACT's contents and stich
2859                            it in place. If the sequence represents all of
2860                            the branches we eliminate the whole thing and
2861                            replace it with a single TRIE. If it is a
2862                            subsequence then we need to stitch it in. This
2863                            means the first branch has to remain, and needs
2864                            to be repointed at the item on the branch chain
2865                            following the last branch optimized. This could
2866                            be either a BRANCH, in which case the
2867                            subsequence is internal, or it could be the
2868                            item following the branch sequence in which
2869                            case the subsequence is at the end.
2870
2871                         */
2872
2873                         /* dont use tail as the end marker for this traverse */
2874                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2875                             regnode * const noper = NEXTOPER( cur );
2876 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2877                             regnode * const noper_next = regnext( noper );
2878 #endif
2879
2880                             DEBUG_OPTIMISE_r({
2881                                 regprop(RExC_rx, mysv, cur);
2882                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2883                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2884
2885                                 regprop(RExC_rx, mysv, noper);
2886                                 PerlIO_printf( Perl_debug_log, " -> %s",
2887                                     SvPV_nolen_const(mysv));
2888
2889                                 if ( noper_next ) {
2890                                   regprop(RExC_rx, mysv, noper_next );
2891                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2892                                     SvPV_nolen_const(mysv));
2893                                 }
2894                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2895                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2896                             });
2897                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2898                                          : PL_regkind[ OP( noper ) ] == EXACT )
2899                                   || OP(noper) == NOTHING )
2900 #ifdef NOJUMPTRIE
2901                                   && noper_next == tail
2902 #endif
2903                                   && count < U16_MAX)
2904                             {
2905                                 count++;
2906                                 if ( !first || optype == NOTHING ) {
2907                                     if (!first) first = cur;
2908                                     optype = OP( noper );
2909                                 } else {
2910                                     last = cur;
2911                                 }
2912                             } else {
2913 /*
2914     Currently we do not believe that the trie logic can
2915     handle case insensitive matching properly when the
2916     pattern is not unicode (thus forcing unicode semantics).
2917
2918     If/when this is fixed the following define can be swapped
2919     in below to fully enable trie logic.
2920
2921 #define TRIE_TYPE_IS_SAFE 1
2922
2923 */
2924 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2925
2926                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2927                                     make_trie( pRExC_state,
2928                                             startbranch, first, cur, tail, count,
2929                                             optype, depth+1 );
2930                                 }
2931                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2932 #ifdef NOJUMPTRIE
2933                                      && noper_next == tail
2934 #endif
2935                                 ){
2936                                     count = 1;
2937                                     first = cur;
2938                                     optype = OP( noper );
2939                                 } else {
2940                                     count = 0;
2941                                     first = NULL;
2942                                     optype = 0;
2943                                 }
2944                                 last = NULL;
2945                             }
2946                         }
2947                         DEBUG_OPTIMISE_r({
2948                             regprop(RExC_rx, mysv, cur);
2949                             PerlIO_printf( Perl_debug_log,
2950                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2951                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2952
2953                         });
2954
2955                         if ( last && TRIE_TYPE_IS_SAFE ) {
2956                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2957 #ifdef TRIE_STUDY_OPT
2958                             if ( ((made == MADE_EXACT_TRIE &&
2959                                  startbranch == first)
2960                                  || ( first_non_open == first )) &&
2961                                  depth==0 ) {
2962                                 flags |= SCF_TRIE_RESTUDY;
2963                                 if ( startbranch == first
2964                                      && scan == tail )
2965                                 {
2966                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2967                                 }
2968                             }
2969 #endif
2970                         }
2971                     }
2972
2973                 } /* do trie */
2974
2975             }
2976             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2977                 scan = NEXTOPER(NEXTOPER(scan));
2978             } else                      /* single branch is optimized. */
2979                 scan = NEXTOPER(scan);
2980             continue;
2981         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2982             scan_frame *newframe = NULL;
2983             I32 paren;
2984             regnode *start;
2985             regnode *end;
2986
2987             if (OP(scan) != SUSPEND) {
2988             /* set the pointer */
2989                 if (OP(scan) == GOSUB) {
2990                     paren = ARG(scan);
2991                     RExC_recurse[ARG2L(scan)] = scan;
2992                     start = RExC_open_parens[paren-1];
2993                     end   = RExC_close_parens[paren-1];
2994                 } else {
2995                     paren = 0;
2996                     start = RExC_rxi->program + 1;
2997                     end   = RExC_opend;
2998                 }
2999                 if (!recursed) {
3000                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3001                     SAVEFREEPV(recursed);
3002                 }
3003                 if (!PAREN_TEST(recursed,paren+1)) {
3004                     PAREN_SET(recursed,paren+1);
3005                     Newx(newframe,1,scan_frame);
3006                 } else {
3007                     if (flags & SCF_DO_SUBSTR) {
3008                         SCAN_COMMIT(pRExC_state,data,minlenp);
3009                         data->longest = &(data->longest_float);
3010                     }
3011                     is_inf = is_inf_internal = 1;
3012                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3013                         cl_anything(pRExC_state, data->start_class);
3014                     flags &= ~SCF_DO_STCLASS;
3015                 }
3016             } else {
3017                 Newx(newframe,1,scan_frame);
3018                 paren = stopparen;
3019                 start = scan+2;
3020                 end = regnext(scan);
3021             }
3022             if (newframe) {
3023                 assert(start);
3024                 assert(end);
3025                 SAVEFREEPV(newframe);
3026                 newframe->next = regnext(scan);
3027                 newframe->last = last;
3028                 newframe->stop = stopparen;
3029                 newframe->prev = frame;
3030
3031                 frame = newframe;
3032                 scan =  start;
3033                 stopparen = paren;
3034                 last = end;
3035
3036                 continue;
3037             }
3038         }
3039         else if (OP(scan) == EXACT) {
3040             I32 l = STR_LEN(scan);
3041             UV uc;
3042             if (UTF) {
3043                 const U8 * const s = (U8*)STRING(scan);
3044                 l = utf8_length(s, s + l);
3045                 uc = utf8_to_uvchr(s, NULL);
3046             } else {
3047                 uc = *((U8*)STRING(scan));
3048             }
3049             min += l;
3050             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3051                 /* The code below prefers earlier match for fixed
3052                    offset, later match for variable offset.  */
3053                 if (data->last_end == -1) { /* Update the start info. */
3054                     data->last_start_min = data->pos_min;
3055                     data->last_start_max = is_inf
3056                         ? I32_MAX : data->pos_min + data->pos_delta;
3057                 }
3058                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3059                 if (UTF)
3060                     SvUTF8_on(data->last_found);
3061                 {
3062                     SV * const sv = data->last_found;
3063                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3064                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3065                     if (mg && mg->mg_len >= 0)
3066                         mg->mg_len += utf8_length((U8*)STRING(scan),
3067                                                   (U8*)STRING(scan)+STR_LEN(scan));
3068                 }
3069                 data->last_end = data->pos_min + l;
3070                 data->pos_min += l; /* As in the first entry. */
3071                 data->flags &= ~SF_BEFORE_EOL;
3072             }
3073             if (flags & SCF_DO_STCLASS_AND) {
3074                 /* Check whether it is compatible with what we know already! */
3075                 int compat = 1;
3076
3077
3078                 /* If compatibile, we or it in below.  It is compatible if is
3079                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3080                  * it's for a locale.  Even if there isn't unicode semantics
3081                  * here, at runtime there may be because of matching against a
3082                  * utf8 string, so accept a possible false positive for
3083                  * latin1-range folds */
3084                 if (uc >= 0x100 ||
3085                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3086                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3087                     && (!(data->start_class->flags & ANYOF_FOLD)
3088                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3089                     )
3090                     compat = 0;
3091                 ANYOF_CLASS_ZERO(data->start_class);
3092                 ANYOF_BITMAP_ZERO(data->start_class);
3093                 if (compat)
3094                     ANYOF_BITMAP_SET(data->start_class, uc);
3095                 data->start_class->flags &= ~ANYOF_EOS;
3096                 if (uc < 0x100)
3097                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3098             }
3099             else if (flags & SCF_DO_STCLASS_OR) {
3100                 /* false positive possible if the class is case-folded */
3101                 if (uc < 0x100)
3102                     ANYOF_BITMAP_SET(data->start_class, uc);
3103                 else
3104                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3105                 data->start_class->flags &= ~ANYOF_EOS;
3106                 cl_and(data->start_class, and_withp);
3107             }
3108             flags &= ~SCF_DO_STCLASS;
3109         }
3110         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3111             I32 l = STR_LEN(scan);
3112             UV uc = *((U8*)STRING(scan));
3113
3114             /* Search for fixed substrings supports EXACT only. */
3115             if (flags & SCF_DO_SUBSTR) {
3116                 assert(data);
3117                 SCAN_COMMIT(pRExC_state, data, minlenp);
3118             }
3119             if (UTF) {
3120                 const U8 * const s = (U8 *)STRING(scan);
3121                 l = utf8_length(s, s + l);
3122                 uc = utf8_to_uvchr(s, NULL);
3123             }
3124             min += l;
3125             if (flags & SCF_DO_SUBSTR)
3126                 data->pos_min += l;
3127             if (flags & SCF_DO_STCLASS_AND) {
3128                 /* Check whether it is compatible with what we know already! */
3129                 int compat = 1;
3130                 if (uc >= 0x100 ||
3131                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3132                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3133                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3134                 {
3135                     compat = 0;
3136                 }
3137                 ANYOF_CLASS_ZERO(data->start_class);
3138                 ANYOF_BITMAP_ZERO(data->start_class);
3139                 if (compat) {
3140                     ANYOF_BITMAP_SET(data->start_class, uc);
3141                     data->start_class->flags &= ~ANYOF_EOS;
3142                     data->start_class->flags |= ANYOF_FOLD;
3143                     if (OP(scan) == EXACTFL) {
3144                         data->start_class->flags |= ANYOF_LOCALE;
3145                     }
3146                     else {
3147
3148                         /* Also set the other member of the fold pair.  In case
3149                          * that unicode semantics is called for at runtime, use
3150                          * the full latin1 fold.  (Can't do this for locale,
3151                          * because not known until runtime */
3152                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3153                     }
3154                 }
3155             }
3156             else if (flags & SCF_DO_STCLASS_OR) {
3157                 if (data->start_class->flags & ANYOF_FOLD) {
3158                     /* false positive possible if the class is case-folded.
3159                        Assume that the locale settings are the same... */
3160                     if (uc < 0x100) {
3161                         ANYOF_BITMAP_SET(data->start_class, uc);
3162                         if (OP(scan) != EXACTFL) {
3163
3164                             /* And set the other member of the fold pair, but
3165                              * can't do that in locale because not known until
3166                              * run-time */
3167                             ANYOF_BITMAP_SET(data->start_class,
3168                                              PL_fold_latin1[uc]);
3169                         }
3170                     }
3171                     data->start_class->flags &= ~ANYOF_EOS;
3172                 }
3173                 cl_and(data->start_class, and_withp);
3174             }
3175             flags &= ~SCF_DO_STCLASS;
3176         }
3177         else if (REGNODE_VARIES(OP(scan))) {
3178             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3179             I32 f = flags, pos_before = 0;
3180             regnode * const oscan = scan;
3181             struct regnode_charclass_class this_class;
3182             struct regnode_charclass_class *oclass = NULL;
3183             I32 next_is_eval = 0;
3184
3185             switch (PL_regkind[OP(scan)]) {
3186             case WHILEM:                /* End of (?:...)* . */
3187                 scan = NEXTOPER(scan);
3188                 goto finish;
3189             case PLUS:
3190                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3191                     next = NEXTOPER(scan);
3192                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3193                         mincount = 1;
3194                         maxcount = REG_INFTY;
3195                         next = regnext(scan);
3196                         scan = NEXTOPER(scan);
3197                         goto do_curly;
3198                     }
3199                 }
3200                 if (flags & SCF_DO_SUBSTR)
3201                     data->pos_min++;
3202                 min++;
3203                 /* Fall through. */
3204             case STAR:
3205                 if (flags & SCF_DO_STCLASS) {
3206                     mincount = 0;
3207                     maxcount = REG_INFTY;
3208                     next = regnext(scan);
3209                     scan = NEXTOPER(scan);
3210                     goto do_curly;
3211                 }
3212                 is_inf = is_inf_internal = 1;
3213                 scan = regnext(scan);
3214                 if (flags & SCF_DO_SUBSTR) {
3215                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3216                     data->longest = &(data->longest_float);
3217                 }
3218                 goto optimize_curly_tail;
3219             case CURLY:
3220                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3221                     && (scan->flags == stopparen))
3222                 {
3223                     mincount = 1;
3224                     maxcount = 1;
3225                 } else {
3226                     mincount = ARG1(scan);
3227                     maxcount = ARG2(scan);
3228                 }
3229                 next = regnext(scan);
3230                 if (OP(scan) == CURLYX) {
3231                     I32 lp = (data ? *(data->last_closep) : 0);
3232                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3233                 }
3234                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3235                 next_is_eval = (OP(scan) == EVAL);
3236               do_curly:
3237                 if (flags & SCF_DO_SUBSTR) {
3238                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3239                     pos_before = data->pos_min;
3240                 }
3241                 if (data) {
3242                     fl = data->flags;
3243                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3244                     if (is_inf)
3245                         data->flags |= SF_IS_INF;
3246                 }
3247                 if (flags & SCF_DO_STCLASS) {
3248                     cl_init(pRExC_state, &this_class);
3249                     oclass = data->start_class;
3250                     data->start_class = &this_class;
3251                     f |= SCF_DO_STCLASS_AND;
3252                     f &= ~SCF_DO_STCLASS_OR;
3253                 }
3254                 /* Exclude from super-linear cache processing any {n,m}
3255                    regops for which the combination of input pos and regex
3256                    pos is not enough information to determine if a match
3257                    will be possible.
3258
3259                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3260                    regex pos at the \s*, the prospects for a match depend not
3261                    only on the input position but also on how many (bar\s*)
3262                    repeats into the {4,8} we are. */
3263                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3264                     f &= ~SCF_WHILEM_VISITED_POS;
3265
3266                 /* This will finish on WHILEM, setting scan, or on NULL: */
3267                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3268                                       last, data, stopparen, recursed, NULL,
3269                                       (mincount == 0
3270                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3271
3272                 if (flags & SCF_DO_STCLASS)
3273                     data->start_class = oclass;
3274                 if (mincount == 0 || minnext == 0) {
3275                     if (flags & SCF_DO_STCLASS_OR) {
3276                         cl_or(pRExC_state, data->start_class, &this_class);
3277                     }
3278                     else if (flags & SCF_DO_STCLASS_AND) {
3279                         /* Switch to OR mode: cache the old value of
3280                          * data->start_class */
3281                         INIT_AND_WITHP;
3282                         StructCopy(data->start_class, and_withp,
3283                                    struct regnode_charclass_class);
3284                         flags &= ~SCF_DO_STCLASS_AND;
3285                         StructCopy(&this_class, data->start_class,
3286                                    struct regnode_charclass_class);
3287                         flags |= SCF_DO_STCLASS_OR;
3288                         data->start_class->flags |= ANYOF_EOS;
3289                     }
3290                 } else {                /* Non-zero len */
3291                     if (flags & SCF_DO_STCLASS_OR) {
3292                         cl_or(pRExC_state, data->start_class, &this_class);
3293                         cl_and(data->start_class, and_withp);
3294                     }
3295                     else if (flags & SCF_DO_STCLASS_AND)
3296                         cl_and(data->start_class, &this_class);
3297                     flags &= ~SCF_DO_STCLASS;
3298                 }
3299                 if (!scan)              /* It was not CURLYX, but CURLY. */
3300                     scan = next;
3301                 if ( /* ? quantifier ok, except for (?{ ... }) */
3302                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3303                     && (minnext == 0) && (deltanext == 0)
3304                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3305                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3306                 {
3307                     ckWARNreg(RExC_parse,
3308                               "Quantifier unexpected on zero-length expression");
3309                 }
3310
3311                 min += minnext * mincount;
3312                 is_inf_internal |= ((maxcount == REG_INFTY
3313                                      && (minnext + deltanext) > 0)
3314                                     || deltanext == I32_MAX);
3315                 is_inf |= is_inf_internal;
3316                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3317
3318                 /* Try powerful optimization CURLYX => CURLYN. */
3319                 if (  OP(oscan) == CURLYX && data
3320                       && data->flags & SF_IN_PAR
3321                       && !(data->flags & SF_HAS_EVAL)
3322                       && !deltanext && minnext == 1 ) {
3323                     /* Try to optimize to CURLYN.  */
3324                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3325                     regnode * const nxt1 = nxt;
3326 #ifdef DEBUGGING
3327                     regnode *nxt2;
3328 #endif
3329
3330                     /* Skip open. */
3331                     nxt = regnext(nxt);
3332                     if (!REGNODE_SIMPLE(OP(nxt))
3333                         && !(PL_regkind[OP(nxt)] == EXACT
3334                              && STR_LEN(nxt) == 1))
3335                         goto nogo;
3336 #ifdef DEBUGGING
3337                     nxt2 = nxt;
3338 #endif
3339                     nxt = regnext(nxt);
3340                     if (OP(nxt) != CLOSE)
3341                         goto nogo;
3342                     if (RExC_open_parens) {
3343                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3344                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3345                     }
3346                     /* Now we know that nxt2 is the only contents: */
3347                     oscan->flags = (U8)ARG(nxt);
3348                     OP(oscan) = CURLYN;
3349                     OP(nxt1) = NOTHING; /* was OPEN. */
3350
3351 #ifdef DEBUGGING
3352                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3353                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3354                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3355                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3356                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3357                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3358 #endif
3359                 }
3360               nogo:
3361
3362                 /* Try optimization CURLYX => CURLYM. */
3363                 if (  OP(oscan) == CURLYX && data
3364                       && !(data->flags & SF_HAS_PAR)
3365                       && !(data->flags & SF_HAS_EVAL)
3366                       && !deltanext     /* atom is fixed width */
3367                       && minnext != 0   /* CURLYM can't handle zero width */
3368                 ) {
3369                     /* XXXX How to optimize if data == 0? */
3370                     /* Optimize to a simpler form.  */
3371                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3372                     regnode *nxt2;
3373
3374                     OP(oscan) = CURLYM;
3375                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3376                             && (OP(nxt2) != WHILEM))
3377                         nxt = nxt2;
3378                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3379                     /* Need to optimize away parenths. */
3380                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3381                         /* Set the parenth number.  */
3382                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3383
3384                         oscan->flags = (U8)ARG(nxt);
3385                         if (RExC_open_parens) {
3386                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3387                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3388                         }
3389                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3390                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3391
3392 #ifdef DEBUGGING
3393                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3394                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3395                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3396                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3397 #endif
3398 #if 0
3399                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3400                             regnode *nnxt = regnext(nxt1);
3401                             if (nnxt == nxt) {
3402                                 if (reg_off_by_arg[OP(nxt1)])
3403                                     ARG_SET(nxt1, nxt2 - nxt1);
3404                                 else if (nxt2 - nxt1 < U16_MAX)
3405                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3406                                 else
3407                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3408                             }
3409                             nxt1 = nnxt;
3410                         }
3411 #endif
3412                         /* Optimize again: */
3413                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3414                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3415                     }
3416                     else
3417                         oscan->flags = 0;
3418                 }
3419                 else if ((OP(oscan) == CURLYX)
3420                          && (flags & SCF_WHILEM_VISITED_POS)
3421                          /* See the comment on a similar expression above.
3422                             However, this time it's not a subexpression
3423                             we care about, but the expression itself. */
3424                          && (maxcount == REG_INFTY)
3425                          && data && ++data->whilem_c < 16) {
3426                     /* This stays as CURLYX, we can put the count/of pair. */
3427                     /* Find WHILEM (as in regexec.c) */
3428                     regnode *nxt = oscan + NEXT_OFF(oscan);
3429
3430                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3431                         nxt += ARG(nxt);
3432                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3433                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3434                 }
3435                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3436                     pars++;
3437                 if (flags & SCF_DO_SUBSTR) {
3438                     SV *last_str = NULL;
3439                     int counted = mincount != 0;
3440
3441                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3442 #if defined(SPARC64_GCC_WORKAROUND)
3443                         I32 b = 0;
3444                         STRLEN l = 0;
3445                         const char *s = NULL;
3446                         I32 old = 0;
3447
3448                         if (pos_before >= data->last_start_min)
3449                             b = pos_before;
3450                         else
3451                             b = data->last_start_min;
3452
3453                         l = 0;
3454                         s = SvPV_const(data->last_found, l);
3455                         old = b - data->last_start_min;
3456
3457 #else
3458                         I32 b = pos_before >= data->last_start_min
3459                             ? pos_before : data->last_start_min;
3460                         STRLEN l;
3461                         const char * const s = SvPV_const(data->last_found, l);
3462                         I32 old = b - data->last_start_min;
3463 #endif
3464
3465                         if (UTF)
3466                             old = utf8_hop((U8*)s, old) - (U8*)s;
3467                         l -= old;
3468                         /* Get the added string: */
3469                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3470                         if (deltanext == 0 && pos_before == b) {
3471                             /* What was added is a constant string */
3472                             if (mincount > 1) {
3473                                 SvGROW(last_str, (mincount * l) + 1);
3474                                 repeatcpy(SvPVX(last_str) + l,
3475                                           SvPVX_const(last_str), l, mincount - 1);
3476                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3477                                 /* Add additional parts. */
3478                                 SvCUR_set(data->last_found,
3479                                           SvCUR(data->last_found) - l);
3480                                 sv_catsv(data->last_found, last_str);
3481                                 {
3482                                     SV * sv = data->last_found;
3483                                     MAGIC *mg =
3484                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3485                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3486                                     if (mg && mg->mg_len >= 0)
3487                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3488                                 }
3489                                 data->last_end += l * (mincount - 1);
3490                             }
3491                         } else {
3492                             /* start offset must point into the last copy */
3493                             data->last_start_min += minnext * (mincount - 1);
3494                             data->last_start_max += is_inf ? I32_MAX
3495                                 : (maxcount - 1) * (minnext + data->pos_delta);
3496                         }
3497                     }
3498                     /* It is counted once already... */
3499                     data->pos_min += minnext * (mincount - counted);
3500                     data->pos_delta += - counted * deltanext +
3501                         (minnext + deltanext) * maxcount - minnext * mincount;
3502                     if (mincount != maxcount) {
3503                          /* Cannot extend fixed substrings found inside
3504                             the group.  */
3505                         SCAN_COMMIT(pRExC_state,data,minlenp);
3506                         if (mincount && last_str) {
3507                             SV * const sv = data->last_found;
3508                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3509                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3510
3511                             if (mg)
3512                                 mg->mg_len = -1;
3513                             sv_setsv(sv, last_str);
3514                             data->last_end = data->pos_min;
3515                             data->last_start_min =
3516                                 data->pos_min - CHR_SVLEN(last_str);
3517                             data->last_start_max = is_inf
3518                                 ? I32_MAX
3519                                 : data->pos_min + data->pos_delta
3520                                 - CHR_SVLEN(last_str);
3521                         }
3522                         data->longest = &(data->longest_float);
3523                     }
3524                     SvREFCNT_dec(last_str);
3525                 }
3526                 if (data && (fl & SF_HAS_EVAL))
3527                     data->flags |= SF_HAS_EVAL;
3528               optimize_curly_tail:
3529                 if (OP(oscan) != CURLYX) {
3530                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3531                            && NEXT_OFF(next))
3532                         NEXT_OFF(oscan) += NEXT_OFF(next);
3533                 }
3534                 continue;
3535             default:                    /* REF and CLUMP only? */
3536                 if (flags & SCF_DO_SUBSTR) {
3537                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3538                     data->longest = &(data->longest_float);
3539                 }
3540                 is_inf = is_inf_internal = 1;
3541                 if (flags & SCF_DO_STCLASS_OR)
3542                     cl_anything(pRExC_state, data->start_class);
3543                 flags &= ~SCF_DO_STCLASS;
3544                 break;
3545             }
3546         }
3547         else if (OP(scan) == LNBREAK) {
3548             if (flags & SCF_DO_STCLASS) {
3549                 int value = 0;
3550                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3551                 if (flags & SCF_DO_STCLASS_AND) {
3552                     for (value = 0; value < 256; value++)
3553                         if (!is_VERTWS_cp(value))
3554                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3555                 }
3556                 else {
3557                     for (value = 0; value < 256; value++)
3558                         if (is_VERTWS_cp(value))
3559                             ANYOF_BITMAP_SET(data->start_class, value);
3560                 }
3561                 if (flags & SCF_DO_STCLASS_OR)
3562                     cl_and(data->start_class, and_withp);
3563                 flags &= ~SCF_DO_STCLASS;
3564             }
3565             min += 1;
3566             delta += 1;
3567             if (flags & SCF_DO_SUBSTR) {
3568                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3569                 data->pos_min += 1;
3570                 data->pos_delta += 1;
3571                 data->longest = &(data->longest_float);
3572             }
3573         }
3574         else if (OP(scan) == FOLDCHAR) {
3575             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3576             flags &= ~SCF_DO_STCLASS;
3577             min += 1;
3578             delta += d;
3579             if (flags & SCF_DO_SUBSTR) {
3580                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3581                 data->pos_min += 1;
3582                 data->pos_delta += d;
3583                 data->longest = &(data->longest_float);
3584             }
3585         }
3586         else if (REGNODE_SIMPLE(OP(scan))) {
3587             int value = 0;
3588
3589             if (flags & SCF_DO_SUBSTR) {
3590                 SCAN_COMMIT(pRExC_state,data,minlenp);
3591                 data->pos_min++;
3592             }
3593             min++;
3594             if (flags & SCF_DO_STCLASS) {
3595                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3596
3597                 /* Some of the logic below assumes that switching
3598                    locale on will only add false positives. */
3599                 switch (PL_regkind[OP(scan)]) {
3600                 case SANY:
3601                 default:
3602                   do_default:
3603                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3604                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3605                         cl_anything(pRExC_state, data->start_class);
3606                     break;
3607                 case REG_ANY:
3608                     if (OP(scan) == SANY)
3609                         goto do_default;
3610                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3611                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3612                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3613                         cl_anything(pRExC_state, data->start_class);
3614                     }
3615                     if (flags & SCF_DO_STCLASS_AND || !value)
3616                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3617                     break;
3618                 case ANYOF:
3619                     if (flags & SCF_DO_STCLASS_AND)
3620                         cl_and(data->start_class,
3621                                (struct regnode_charclass_class*)scan);
3622                     else
3623                         cl_or(pRExC_state, data->start_class,
3624                               (struct regnode_charclass_class*)scan);
3625                     break;
3626                 case ALNUM:
3627                     if (flags & SCF_DO_STCLASS_AND) {
3628                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3629                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3630                             if (FLAGS(scan) & USE_UNI) {
3631                                 for (value = 0; value < 256; value++) {
3632                                     if (!isWORDCHAR_L1(value)) {
3633                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3634                                     }
3635                                 }
3636                             } else {
3637                                 for (value = 0; value < 256; value++) {
3638                                     if (!isALNUM(value)) {
3639                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3640                                     }
3641                                 }
3642                             }
3643                         }
3644                     }
3645                     else {
3646                         if (data->start_class->flags & ANYOF_LOCALE)
3647                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3648                         else if (FLAGS(scan) & USE_UNI) {
3649                             for (value = 0; value < 256; value++) {
3650                                 if (isWORDCHAR_L1(value)) {
3651                                     ANYOF_BITMAP_SET(data->start_class, value);
3652                                 }
3653                             }
3654                         } else {
3655                             for (value = 0; value < 256; value++) {
3656                                 if (isALNUM(value)) {
3657                                     ANYOF_BITMAP_SET(data->start_class, value);
3658                                 }
3659                             }
3660                         }
3661                     }
3662                     break;
3663                 case ALNUML:
3664                     if (flags & SCF_DO_STCLASS_AND) {
3665                         if (data->start_class->flags & ANYOF_LOCALE)
3666                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3667                     }
3668                     else {
3669                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3670                         data->start_class->flags |= ANYOF_LOCALE;
3671                     }
3672                     break;
3673                 case NALNUM:
3674                     if (flags & SCF_DO_STCLASS_AND) {
3675                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3677                             if (FLAGS(scan) & USE_UNI) {
3678                                 for (value = 0; value < 256; value++) {
3679                                     if (isWORDCHAR_L1(value)) {
3680                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3681                                     }
3682                                 }
3683                             } else {
3684                                 for (value = 0; value < 256; value++) {
3685                                     if (isALNUM(value)) {
3686                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3687                                     }
3688                                 }
3689                             }
3690                         }
3691                     }
3692                     else {
3693                         if (data->start_class->flags & ANYOF_LOCALE)
3694                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3695                         else {
3696                             for (value = 0; value < 256; value++)
3697                                 if (!isALNUM(value))
3698                                     ANYOF_BITMAP_SET(data->start_class, value);
3699                         }
3700                     }
3701                     break;
3702                 case NALNUML:
3703                     if (flags & SCF_DO_STCLASS_AND) {
3704                         if (data->start_class->flags & ANYOF_LOCALE)
3705                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3706                     }
3707                     else {
3708                         data->start_class->flags |= ANYOF_LOCALE;
3709                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3710                     }
3711                     break;
3712                 case SPACE:
3713                     if (flags & SCF_DO_STCLASS_AND) {
3714                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3715                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3716                             if (FLAGS(scan) & USE_UNI) {
3717                                 for (value = 0; value < 256; value++) {
3718                                     if (!isSPACE_L1(value)) {
3719                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3720                                     }
3721                                 }
3722                             } else {
3723                                 for (value = 0; value < 256; value++) {
3724                                     if (!isSPACE(value)) {
3725                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3726                                     }
3727                                 }
3728                             }
3729                         }
3730                     }
3731                     else {
3732                         if (data->start_class->flags & ANYOF_LOCALE) {
3733                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3734                         }
3735                         else if (FLAGS(scan) & USE_UNI) {
3736                             for (value = 0; value < 256; value++) {
3737                                 if (isSPACE_L1(value)) {
3738                                     ANYOF_BITMAP_SET(data->start_class, value);
3739                                 }
3740                             }
3741                         } else {
3742                             for (value = 0; value < 256; value++) {
3743                                 if (isSPACE(value)) {
3744                                     ANYOF_BITMAP_SET(data->start_class, value);
3745                                 }
3746                             }
3747                         }
3748                     }
3749                     break;
3750                 case SPACEL:
3751                     if (flags & SCF_DO_STCLASS_AND) {
3752                         if (data->start_class->flags & ANYOF_LOCALE)
3753                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3754                     }
3755                     else {
3756                         data->start_class->flags |= ANYOF_LOCALE;
3757                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3758                     }
3759                     break;
3760                 case NSPACE:
3761                     if (flags & SCF_DO_STCLASS_AND) {
3762                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3763                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3764                             if (FLAGS(scan) & USE_UNI) {
3765                                 for (value = 0; value < 256; value++) {
3766                                     if (isSPACE_L1(value)) {
3767                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3768                                     }
3769                                 }
3770                             } else {
3771                                 for (value = 0; value < 256; value++) {
3772                                     if (isSPACE(value)) {
3773                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3774                                     }
3775                                 }
3776                             }
3777                         }
3778                     }
3779                     else {
3780                         if (data->start_class->flags & ANYOF_LOCALE)
3781                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3782                         else if (FLAGS(scan) & USE_UNI) {
3783                             for (value = 0; value < 256; value++) {
3784                                 if (!isSPACE_L1(value)) {
3785                                     ANYOF_BITMAP_SET(data->start_class, value);
3786                                 }
3787                             }
3788                         }
3789                         else {
3790                             for (value = 0; value < 256; value++) {
3791                                 if (!isSPACE(value)) {
3792                                     ANYOF_BITMAP_SET(data->start_class, value);
3793                                 }
3794                             }
3795                         }
3796                     }
3797                     break;
3798                 case NSPACEL:
3799                     if (flags & SCF_DO_STCLASS_AND) {
3800                         if (data->start_class->flags & ANYOF_LOCALE) {
3801                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3802                             for (value = 0; value < 256; value++)
3803                                 if (!isSPACE(value))
3804                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3805                         }
3806                     }
3807                     else {
3808                         data->start_class->flags |= ANYOF_LOCALE;
3809                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3810                     }
3811                     break;
3812                 case DIGIT:
3813                     if (flags & SCF_DO_STCLASS_AND) {
3814                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3815                         for (value = 0; value < 256; value++)
3816                             if (!isDIGIT(value))
3817                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3818                     }
3819                     else {
3820                         if (data->start_class->flags & ANYOF_LOCALE)
3821                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3822                         else {
3823                             for (value = 0; value < 256; value++)
3824                                 if (isDIGIT(value))
3825                                     ANYOF_BITMAP_SET(data->start_class, value);
3826                         }
3827                     }
3828                     break;
3829                 case NDIGIT:
3830                     if (flags & SCF_DO_STCLASS_AND) {
3831                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3832                         for (value = 0; value < 256; value++)
3833                             if (isDIGIT(value))
3834                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3835                     }
3836                     else {
3837                         if (data->start_class->flags & ANYOF_LOCALE)
3838                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3839                         else {
3840                             for (value = 0; value < 256; value++)
3841                                 if (!isDIGIT(value))
3842                                     ANYOF_BITMAP_SET(data->start_class, value);
3843                         }
3844                     }
3845                     break;
3846                 CASE_SYNST_FNC(VERTWS);
3847                 CASE_SYNST_FNC(HORIZWS);
3848
3849                 }
3850                 if (flags & SCF_DO_STCLASS_OR)
3851                     cl_and(data->start_class, and_withp);
3852                 flags &= ~SCF_DO_STCLASS;
3853             }
3854         }
3855         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3856             data->flags |= (OP(scan) == MEOL
3857                             ? SF_BEFORE_MEOL
3858                             : SF_BEFORE_SEOL);
3859         }
3860         else if (  PL_regkind[OP(scan)] == BRANCHJ
3861                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3862                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3863                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3864             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3865                 || OP(scan) == UNLESSM )
3866             {
3867                 /* Negative Lookahead/lookbehind
3868                    In this case we can't do fixed string optimisation.
3869                 */
3870
3871                 I32 deltanext, minnext, fake = 0;
3872                 regnode *nscan;
3873                 struct regnode_charclass_class intrnl;
3874                 int f = 0;
3875
3876                 data_fake.flags = 0;
3877                 if (data) {
3878                     data_fake.whilem_c = data->whilem_c;
3879                     data_fake.last_closep = data->last_closep;
3880                 }
3881                 else
3882                     data_fake.last_closep = &fake;
3883                 data_fake.pos_delta = delta;
3884                 if ( flags & SCF_DO_STCLASS && !scan->flags
3885                      && OP(scan) == IFMATCH ) { /* Lookahead */
3886                     cl_init(pRExC_state, &intrnl);
3887                     data_fake.start_class = &intrnl;
3888                     f |= SCF_DO_STCLASS_AND;
3889                 }
3890                 if (flags & SCF_WHILEM_VISITED_POS)
3891                     f |= SCF_WHILEM_VISITED_POS;
3892                 next = regnext(scan);
3893                 nscan = NEXTOPER(NEXTOPER(scan));
3894                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3895                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3896                 if (scan->flags) {
3897                     if (deltanext) {
3898                         FAIL("Variable length lookbehind not implemented");
3899                     }
3900                     else if (minnext > (I32)U8_MAX) {
3901                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3902                     }
3903                     scan->flags = (U8)minnext;
3904                 }
3905                 if (data) {
3906                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3907                         pars++;
3908                     if (data_fake.flags & SF_HAS_EVAL)
3909                         data->flags |= SF_HAS_EVAL;
3910                     data->whilem_c = data_fake.whilem_c;
3911                 }
3912                 if (f & SCF_DO_STCLASS_AND) {
3913                     if (flags & SCF_DO_STCLASS_OR) {
3914                         /* OR before, AND after: ideally we would recurse with
3915                          * data_fake to get the AND applied by study of the
3916                          * remainder of the pattern, and then derecurse;
3917                          * *** HACK *** for now just treat as "no information".
3918                          * See [perl #56690].
3919                          */
3920                         cl_init(pRExC_state, data->start_class);
3921                     }  else {
3922                         /* AND before and after: combine and continue */
3923                         const int was = (data->start_class->flags & ANYOF_EOS);
3924
3925                         cl_and(data->start_class, &intrnl);
3926                         if (was)
3927                             data->start_class->flags |= ANYOF_EOS;
3928                     }
3929                 }
3930             }
3931 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3932             else {
3933                 /* Positive Lookahead/lookbehind
3934                    In this case we can do fixed string optimisation,
3935                    but we must be careful about it. Note in the case of
3936                    lookbehind the positions will be offset by the minimum
3937                    length of the pattern, something we won't know about
3938                    until after the recurse.
3939                 */
3940                 I32 deltanext, fake = 0;
3941                 regnode *nscan;
3942                 struct regnode_charclass_class intrnl;
3943                 int f = 0;
3944                 /* We use SAVEFREEPV so that when the full compile
3945                     is finished perl will clean up the allocated
3946                     minlens when it's all done. This way we don't
3947                     have to worry about freeing them when we know
3948                     they wont be used, which would be a pain.
3949                  */
3950                 I32 *minnextp;
3951                 Newx( minnextp, 1, I32 );
3952                 SAVEFREEPV(minnextp);
3953
3954                 if (data) {
3955                     StructCopy(data, &data_fake, scan_data_t);
3956                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3957                         f |= SCF_DO_SUBSTR;
3958                         if (scan->flags)
3959                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3960                         data_fake.last_found=newSVsv(data->last_found);
3961                     }
3962                 }
3963                 else
3964                     data_fake.last_closep = &fake;
3965                 data_fake.flags = 0;
3966                 data_fake.pos_delta = delta;
3967                 if (is_inf)
3968                     data_fake.flags |= SF_IS_INF;
3969                 if ( flags & SCF_DO_STCLASS && !scan->flags
3970                      && OP(scan) == IFMATCH ) { /* Lookahead */
3971                     cl_init(pRExC_state, &intrnl);
3972                     data_fake.start_class = &intrnl;
3973                     f |= SCF_DO_STCLASS_AND;
3974                 }
3975                 if (flags & SCF_WHILEM_VISITED_POS)
3976                     f |= SCF_WHILEM_VISITED_POS;
3977                 next = regnext(scan);
3978                 nscan = NEXTOPER(NEXTOPER(scan));
3979
3980                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3981                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3982                 if (scan->flags) {
3983                     if (deltanext) {
3984                         FAIL("Variable length lookbehind not implemented");
3985                     }
3986                     else if (*minnextp > (I32)U8_MAX) {
3987                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3988                     }
3989                     scan->flags = (U8)*minnextp;
3990                 }
3991
3992                 *minnextp += min;
3993
3994                 if (f & SCF_DO_STCLASS_AND) {
3995                     const int was = (data->start_class->flags & ANYOF_EOS);
3996
3997                     cl_and(data->start_class, &intrnl);
3998                     if (was)
3999                         data->start_class->flags |= ANYOF_EOS;
4000                 }
4001                 if (data) {
4002                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4003                         pars++;
4004                     if (data_fake.flags & SF_HAS_EVAL)
4005                         data->flags |= SF_HAS_EVAL;
4006                     data->whilem_c = data_fake.whilem_c;
4007                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4008                         if (RExC_rx->minlen<*minnextp)
4009                             RExC_rx->minlen=*minnextp;
4010                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4011                         SvREFCNT_dec(data_fake.last_found);
4012
4013                         if ( data_fake.minlen_fixed != minlenp )
4014                         {
4015                             data->offset_fixed= data_fake.offset_fixed;
4016                             data->minlen_fixed= data_fake.minlen_fixed;
4017                             data->lookbehind_fixed+= scan->flags;
4018                         }
4019                         if ( data_fake.minlen_float != minlenp )
4020                         {
4021                             data->minlen_float= data_fake.minlen_float;
4022                             data->offset_float_min=data_fake.offset_float_min;
4023                             data->offset_float_max=data_fake.offset_float_max;
4024                             data->lookbehind_float+= scan->flags;
4025                         }
4026                     }
4027                 }
4028
4029
4030             }
4031 #endif
4032         }
4033         else if (OP(scan) == OPEN) {
4034             if (stopparen != (I32)ARG(scan))
4035                 pars++;
4036         }
4037         else if (OP(scan) == CLOSE) {
4038             if (stopparen == (I32)ARG(scan)) {
4039                 break;
4040             }
4041             if ((I32)ARG(scan) == is_par) {
4042                 next = regnext(scan);
4043
4044                 if ( next && (OP(next) != WHILEM) && next < last)
4045                     is_par = 0;         /* Disable optimization */
4046             }
4047             if (data)
4048                 *(data->last_closep) = ARG(scan);
4049         }
4050         else if (OP(scan) == EVAL) {
4051                 if (data)
4052                     data->flags |= SF_HAS_EVAL;
4053         }
4054         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4055             if (flags & SCF_DO_SUBSTR) {
4056                 SCAN_COMMIT(pRExC_state,data,minlenp);
4057                 flags &= ~SCF_DO_SUBSTR;
4058             }
4059             if (data && OP(scan)==ACCEPT) {
4060                 data->flags |= SCF_SEEN_ACCEPT;
4061                 if (stopmin > min)
4062                     stopmin = min;
4063             }
4064         }
4065         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4066         {
4067                 if (flags & SCF_DO_SUBSTR) {
4068                     SCAN_COMMIT(pRExC_state,data,minlenp);
4069                     data->longest = &(data->longest_float);
4070                 }
4071                 is_inf = is_inf_internal = 1;
4072                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4073                     cl_anything(pRExC_state, data->start_class);
4074                 flags &= ~SCF_DO_STCLASS;
4075         }
4076         else if (OP(scan) == GPOS) {
4077             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4078                 !(delta || is_inf || (data && data->pos_delta)))
4079             {
4080                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4081                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4082                 if (RExC_rx->gofs < (U32)min)
4083                     RExC_rx->gofs = min;
4084             } else {
4085                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4086                 RExC_rx->gofs = 0;
4087             }
4088         }
4089 #ifdef TRIE_STUDY_OPT
4090 #ifdef FULL_TRIE_STUDY
4091         else if (PL_regkind[OP(scan)] == TRIE) {
4092             /* NOTE - There is similar code to this block above for handling
4093                BRANCH nodes on the initial study.  If you change stuff here
4094                check there too. */
4095             regnode *trie_node= scan;
4096             regnode *tail= regnext(scan);
4097             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4098             I32 max1 = 0, min1 = I32_MAX;
4099             struct regnode_charclass_class accum;
4100
4101             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4102                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4103             if (flags & SCF_DO_STCLASS)
4104                 cl_init_zero(pRExC_state, &accum);
4105
4106             if (!trie->jump) {
4107                 min1= trie->minlen;
4108                 max1= trie->maxlen;
4109             } else {
4110                 const regnode *nextbranch= NULL;
4111                 U32 word;
4112
4113                 for ( word=1 ; word <= trie->wordcount ; word++)
4114                 {
4115                     I32 deltanext=0, minnext=0, f = 0, fake;
4116                     struct regnode_charclass_class this_class;
4117
4118                     data_fake.flags = 0;
4119                     if (data) {
4120                         data_fake.whilem_c = data->whilem_c;
4121                         data_fake.last_closep = data->last_closep;
4122                     }
4123                     else
4124                         data_fake.last_closep = &fake;
4125                     data_fake.pos_delta = delta;
4126                     if (flags & SCF_DO_STCLASS) {
4127                         cl_init(pRExC_state, &this_class);
4128                         data_fake.start_class = &this_class;
4129                         f = SCF_DO_STCLASS_AND;
4130                     }
4131                     if (flags & SCF_WHILEM_VISITED_POS)
4132                         f |= SCF_WHILEM_VISITED_POS;
4133
4134                     if (trie->jump[word]) {
4135                         if (!nextbranch)
4136                             nextbranch = trie_node + trie->jump[0];
4137                         scan= trie_node + trie->jump[word];
4138                         /* We go from the jump point to the branch that follows
4139                            it. Note this means we need the vestigal unused branches
4140                            even though they arent otherwise used.
4141                          */
4142                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4143                             &deltanext, (regnode *)nextbranch, &data_fake,
4144                             stopparen, recursed, NULL, f,depth+1);
4145                     }
4146                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4147                         nextbranch= regnext((regnode*)nextbranch);
4148
4149                     if (min1 > (I32)(minnext + trie->minlen))
4150                         min1 = minnext + trie->minlen;
4151                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4152                         max1 = minnext + deltanext + trie->maxlen;
4153                     if (deltanext == I32_MAX)
4154                         is_inf = is_inf_internal = 1;
4155
4156                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4157                         pars++;
4158                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4159                         if ( stopmin > min + min1)
4160                             stopmin = min + min1;
4161                         flags &= ~SCF_DO_SUBSTR;
4162                         if (data)
4163                             data->flags |= SCF_SEEN_ACCEPT;
4164                     }
4165                     if (data) {
4166                         if (data_fake.flags & SF_HAS_EVAL)
4167                             data->flags |= SF_HAS_EVAL;
4168                         data->whilem_c = data_fake.whilem_c;
4169                     }
4170                     if (flags & SCF_DO_STCLASS)
4171                         cl_or(pRExC_state, &accum, &this_class);
4172                 }
4173             }
4174             if (flags & SCF_DO_SUBSTR) {
4175                 data->pos_min += min1;
4176                 data->pos_delta += max1 - min1;
4177                 if (max1 != min1 || is_inf)
4178                     data->longest = &(data->longest_float);
4179             }
4180             min += min1;
4181             delta += max1 - min1;
4182             if (flags & SCF_DO_STCLASS_OR) {
4183                 cl_or(pRExC_state, data->start_class, &accum);
4184                 if (min1) {
4185                     cl_and(data->start_class, and_withp);
4186                     flags &= ~SCF_DO_STCLASS;
4187                 }
4188             }
4189             else if (flags & SCF_DO_STCLASS_AND) {
4190                 if (min1) {
4191                     cl_and(data->start_class, &accum);
4192                     flags &= ~SCF_DO_STCLASS;
4193                 }
4194                 else {
4195                     /* Switch to OR mode: cache the old value of
4196                      * data->start_class */
4197                     INIT_AND_WITHP;
4198                     StructCopy(data->start_class, and_withp,
4199                                struct regnode_charclass_class);
4200                     flags &= ~SCF_DO_STCLASS_AND;
4201                     StructCopy(&accum, data->start_class,
4202                                struct regnode_charclass_class);
4203                     flags |= SCF_DO_STCLASS_OR;
4204                     data->start_class->flags |= ANYOF_EOS;
4205                 }
4206             }
4207             scan= tail;
4208             continue;
4209         }
4210 #else
4211         else if (PL_regkind[OP(scan)] == TRIE) {
4212             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4213             U8*bang=NULL;
4214
4215             min += trie->minlen;
4216             delta += (trie->maxlen - trie->minlen);
4217             flags &= ~SCF_DO_STCLASS; /* xxx */
4218             if (flags & SCF_DO_SUBSTR) {
4219                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4220                 data->pos_min += trie->minlen;
4221                 data->pos_delta += (trie->maxlen - trie->minlen);
4222                 if (trie->maxlen != trie->minlen)
4223                     data->longest = &(data->longest_float);
4224             }
4225             if (trie->jump) /* no more substrings -- for now /grr*/
4226                 flags &= ~SCF_DO_SUBSTR;
4227         }
4228 #endif /* old or new */
4229 #endif /* TRIE_STUDY_OPT */
4230
4231         /* Else: zero-length, ignore. */
4232         scan = regnext(scan);
4233     }
4234     if (frame) {
4235         last = frame->last;
4236         scan = frame->next;
4237         stopparen = frame->stop;
4238         frame = frame->prev;
4239         goto fake_study_recurse;
4240     }
4241
4242   finish:
4243     assert(!frame);
4244     DEBUG_STUDYDATA("pre-fin:",data,depth);
4245
4246     *scanp = scan;
4247     *deltap = is_inf_internal ? I32_MAX : delta;
4248     if (flags & SCF_DO_SUBSTR && is_inf)
4249         data->pos_delta = I32_MAX - data->pos_min;
4250     if (is_par > (I32)U8_MAX)
4251         is_par = 0;
4252     if (is_par && pars==1 && data) {
4253         data->flags |= SF_IN_PAR;
4254         data->flags &= ~SF_HAS_PAR;
4255     }
4256     else if (pars && data) {
4257         data->flags |= SF_HAS_PAR;
4258         data->flags &= ~SF_IN_PAR;
4259     }
4260     if (flags & SCF_DO_STCLASS_OR)
4261         cl_and(data->start_class, and_withp);
4262     if (flags & SCF_TRIE_RESTUDY)
4263         data->flags |=  SCF_TRIE_RESTUDY;
4264
4265     DEBUG_STUDYDATA("post-fin:",data,depth);
4266
4267     return min < stopmin ? min : stopmin;
4268 }
4269
4270 STATIC U32
4271 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4272 {
4273     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4274
4275     PERL_ARGS_ASSERT_ADD_DATA;
4276
4277     Renewc(RExC_rxi->data,
4278            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4279            char, struct reg_data);
4280     if(count)
4281         Renew(RExC_rxi->data->what, count + n, U8);
4282     else
4283         Newx(RExC_rxi->data->what, n, U8);
4284     RExC_rxi->data->count = count + n;
4285     Copy(s, RExC_rxi->data->what + count, n, U8);
4286     return count;
4287 }
4288
4289 /*XXX: todo make this not included in a non debugging perl */
4290 #ifndef PERL_IN_XSUB_RE
4291 void
4292 Perl_reginitcolors(pTHX)
4293 {
4294     dVAR;
4295     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4296     if (s) {
4297         char *t = savepv(s);
4298         int i = 0;
4299         PL_colors[0] = t;
4300         while (++i < 6) {
4301             t = strchr(t, '\t');
4302             if (t) {
4303                 *t = '\0';
4304                 PL_colors[i] = ++t;
4305             }
4306             else
4307                 PL_colors[i] = t = (char *)"";
4308         }
4309     } else {
4310         int i = 0;
4311         while (i < 6)
4312             PL_colors[i++] = (char *)"";
4313     }
4314     PL_colorset = 1;
4315 }
4316 #endif
4317
4318
4319 #ifdef TRIE_STUDY_OPT
4320 #define CHECK_RESTUDY_GOTO                                  \
4321         if (                                                \
4322               (data.flags & SCF_TRIE_RESTUDY)               \
4323               && ! restudied++                              \
4324         )     goto reStudy
4325 #else
4326 #define CHECK_RESTUDY_GOTO
4327 #endif
4328
4329 /*
4330  - pregcomp - compile a regular expression into internal code
4331  *
4332  * We can't allocate space until we know how big the compiled form will be,
4333  * but we can't compile it (and thus know how big it is) until we've got a
4334  * place to put the code.  So we cheat:  we compile it twice, once with code
4335  * generation turned off and size counting turned on, and once "for real".
4336  * This also means that we don't allocate space until we are sure that the
4337  * thing really will compile successfully, and we never have to move the
4338  * code and thus invalidate pointers into it.  (Note that it has to be in
4339  * one piece because free() must be able to free it all.) [NB: not true in perl]
4340  *
4341  * Beware that the optimization-preparation code in here knows about some
4342  * of the structure of the compiled regexp.  [I'll say.]
4343  */
4344
4345
4346
4347 #ifndef PERL_IN_XSUB_RE
4348 #define RE_ENGINE_PTR &PL_core_reg_engine
4349 #else
4350 extern const struct regexp_engine my_reg_engine;
4351 #define RE_ENGINE_PTR &my_reg_engine
4352 #endif
4353
4354 #ifndef PERL_IN_XSUB_RE
4355 REGEXP *
4356 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4357 {
4358     dVAR;
4359     HV * const table = GvHV(PL_hintgv);
4360
4361     PERL_ARGS_ASSERT_PREGCOMP;
4362
4363     /* Dispatch a request to compile a regexp to correct
4364        regexp engine. */
4365     if (table) {
4366         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4367         GET_RE_DEBUG_FLAGS_DECL;
4368         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4369             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4370             DEBUG_COMPILE_r({
4371                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4372                     SvIV(*ptr));
4373             });
4374             return CALLREGCOMP_ENG(eng, pattern, flags);
4375         }
4376     }
4377     return Perl_re_compile(aTHX_ pattern, flags);
4378 }
4379 #endif
4380
4381 REGEXP *
4382 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4383 {
4384     dVAR;
4385     REGEXP *rx;
4386     struct regexp *r;
4387     register regexp_internal *ri;
4388     STRLEN plen;
4389     char  *exp;
4390     char* xend;
4391     regnode *scan;
4392     I32 flags;
4393     I32 minlen = 0;
4394     U32 pm_flags;
4395
4396     /* these are all flags - maybe they should be turned
4397      * into a single int with different bit masks */
4398     I32 sawlookahead = 0;
4399     I32 sawplus = 0;
4400     I32 sawopen = 0;
4401     bool used_setjump = FALSE;
4402
4403     U8 jump_ret = 0;
4404     dJMPENV;
4405     scan_data_t data;
4406     RExC_state_t RExC_state;
4407     RExC_state_t * const pRExC_state = &RExC_state;
4408 #ifdef TRIE_STUDY_OPT
4409     int restudied;
4410     RExC_state_t copyRExC_state;
4411 #endif
4412     GET_RE_DEBUG_FLAGS_DECL;
4413
4414     PERL_ARGS_ASSERT_RE_COMPILE;
4415
4416     DEBUG_r(if (!PL_colorset) reginitcolors());
4417
4418     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4419
4420     /****************** LONG JUMP TARGET HERE***********************/
4421     /* Longjmp back to here if have to switch in midstream to utf8 */
4422     if (! RExC_orig_utf8) {
4423         JMPENV_PUSH(jump_ret);
4424         used_setjump = TRUE;
4425     }
4426
4427     if (jump_ret == 0) {    /* First time through */
4428         exp = SvPV(pattern, plen);
4429         xend = exp + plen;
4430         /* ignore the utf8ness if the pattern is 0 length */
4431         if (plen == 0) {
4432             RExC_utf8 = RExC_orig_utf8 = 0;
4433         }
4434
4435         DEBUG_COMPILE_r({
4436             SV *dsv= sv_newmortal();
4437             RE_PV_QUOTED_DECL(s, RExC_utf8,
4438                 dsv, exp, plen, 60);
4439             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4440                            PL_colors[4],PL_colors[5],s);
4441         });
4442     }
4443     else {  /* longjumped back */
4444         STRLEN len = plen;
4445
4446         /* If the cause for the longjmp was other than changing to utf8, pop
4447          * our own setjmp, and longjmp to the correct handler */
4448         if (jump_ret != UTF8_LONGJMP) {
4449             JMPENV_POP;
4450             JMPENV_JUMP(jump_ret);
4451         }
4452
4453         GET_RE_DEBUG_FLAGS;
4454
4455         /* It's possible to write a regexp in ascii that represents Unicode
4456         codepoints outside of the byte range, such as via \x{100}. If we
4457         detect such a sequence we have to convert the entire pattern to utf8
4458         and then recompile, as our sizing calculation will have been based
4459         on 1 byte == 1 character, but we will need to use utf8 to encode
4460         at least some part of the pattern, and therefore must convert the whole
4461         thing.
4462         -- dmq */
4463         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4464             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4465         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4466         xend = exp + len;
4467         RExC_orig_utf8 = RExC_utf8 = 1;
4468         SAVEFREEPV(exp);
4469     }
4470
4471 #ifdef TRIE_STUDY_OPT
4472     restudied = 0;
4473 #endif
4474
4475     /* Set to use unicode semantics if the pattern is in utf8 and has the
4476      * 'dual' charset specified, as it means unicode when utf8  */
4477     pm_flags = orig_pm_flags;
4478     if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4479         pm_flags |= RXf_PMf_UNICODE;
4480     }
4481
4482     RExC_precomp = exp;
4483     RExC_flags = pm_flags;
4484     RExC_sawback = 0;
4485
4486     RExC_seen = 0;
4487     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4488     RExC_seen_evals = 0;
4489     RExC_extralen = 0;
4490
4491     /* First pass: determine size, legality. */
4492     RExC_parse = exp;
4493     RExC_start = exp;
4494     RExC_end = xend;
4495     RExC_naughty = 0;
4496     RExC_npar = 1;
4497     RExC_nestroot = 0;
4498     RExC_size = 0L;
4499     RExC_emit = &PL_regdummy;
4500     RExC_whilem_seen = 0;
4501     RExC_open_parens = NULL;
4502     RExC_close_parens = NULL;
4503     RExC_opend = NULL;
4504     RExC_paren_names = NULL;
4505 #ifdef DEBUGGING
4506     RExC_paren_name_list = NULL;
4507 #endif
4508     RExC_recurse = NULL;
4509     RExC_recurse_count = 0;
4510
4511 #if 0 /* REGC() is (currently) a NOP at the first pass.
4512        * Clever compilers notice this and complain. --jhi */
4513     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4514 #endif
4515     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4516     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4517         RExC_precomp = NULL;
4518         return(NULL);
4519     }
4520
4521     /* Here, finished first pass.  Get rid of any added setjmp */
4522     if (used_setjump) {
4523         JMPENV_POP;
4524     }
4525     DEBUG_PARSE_r({
4526         PerlIO_printf(Perl_debug_log,
4527             "Required size %"IVdf" nodes\n"
4528             "Starting second pass (creation)\n",
4529             (IV)RExC_size);
4530         RExC_lastnum=0;
4531         RExC_lastparse=NULL;
4532     });
4533     /* Small enough for pointer-storage convention?
4534        If extralen==0, this means that we will not need long jumps. */
4535     if (RExC_size >= 0x10000L && RExC_extralen)
4536         RExC_size += RExC_extralen;
4537     else
4538         RExC_extralen = 0;
4539     if (RExC_whilem_seen > 15)
4540         RExC_whilem_seen = 15;
4541
4542     /* Allocate space and zero-initialize. Note, the two step process
4543        of zeroing when in debug mode, thus anything assigned has to
4544        happen after that */
4545     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4546     r = (struct regexp*)SvANY(rx);
4547     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4548          char, regexp_internal);
4549     if ( r == NULL || ri == NULL )
4550         FAIL("Regexp out of space");
4551 #ifdef DEBUGGING
4552     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4553     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4554 #else
4555     /* bulk initialize base fields with 0. */
4556     Zero(ri, sizeof(regexp_internal), char);
4557 #endif
4558
4559     /* non-zero initialization begins here */
4560     RXi_SET( r, ri );
4561     r->engine= RE_ENGINE_PTR;
4562     r->extflags = pm_flags;
4563     {
4564         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4565         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4566
4567         /* The caret is output if there are any defaults: if not all the STD
4568          * flags are set, or if no character set specifier is needed */
4569         bool has_default =
4570                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4571                     || ! has_charset);
4572         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4573         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4574                             >> RXf_PMf_STD_PMMOD_SHIFT);
4575         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4576         char *p;
4577         /* Allocate for the worst case, which is all the std flags are turned
4578          * on.  If more precision is desired, we could do a population count of
4579          * the flags set.  This could be done with a small lookup table, or by
4580          * shifting, masking and adding, or even, when available, assembly
4581          * language for a machine-language population count.
4582          * We never output a minus, as all those are defaults, so are
4583          * covered by the caret */
4584         const STRLEN wraplen = plen + has_p + has_runon
4585             + has_default       /* If needs a caret */
4586             + has_charset       /* If needs a character set specifier */
4587             + (sizeof(STD_PAT_MODS) - 1)
4588             + (sizeof("(?:)") - 1);
4589
4590         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4591         SvPOK_on(rx);
4592         SvFLAGS(rx) |= SvUTF8(pattern);
4593         *p++='('; *p++='?';
4594
4595         /* If a default, cover it using the caret */
4596         if (has_default) {
4597             *p++= DEFAULT_PAT_MOD;
4598         }
4599         if (has_charset) {
4600             if (r->extflags & RXf_PMf_LOCALE) {
4601                 *p++ = LOCALE_PAT_MOD;
4602             } else {
4603                 *p++ = UNICODE_PAT_MOD;
4604             }
4605         }
4606         if (has_p)
4607             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4608         {
4609             char ch;
4610             while((ch = *fptr++)) {
4611                 if(reganch & 1)
4612                     *p++ = ch;
4613                 reganch >>= 1;
4614             }
4615         }
4616
4617         *p++ = ':';
4618         Copy(RExC_precomp, p, plen, char);
4619         assert ((RX_WRAPPED(rx) - p) < 16);
4620         r->pre_prefix = p - RX_WRAPPED(rx);
4621         p += plen;
4622         if (has_runon)
4623             *p++ = '\n';
4624         *p++ = ')';
4625         *p = 0;
4626         SvCUR_set(rx, p - SvPVX_const(rx));
4627     }
4628
4629     r->intflags = 0;
4630     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4631
4632     if (RExC_seen & REG_SEEN_RECURSE) {
4633         Newxz(RExC_open_parens, RExC_npar,regnode *);
4634         SAVEFREEPV(RExC_open_parens);
4635         Newxz(RExC_close_parens,RExC_npar,regnode *);
4636         SAVEFREEPV(RExC_close_parens);
4637     }
4638
4639     /* Useful during FAIL. */
4640 #ifdef RE_TRACK_PATTERN_OFFSETS
4641     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4642     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4643                           "%s %"UVuf" bytes for offset annotations.\n",
4644                           ri->u.offsets ? "Got" : "Couldn't get",
4645                           (UV)((2*RExC_size+1) * sizeof(U32))));
4646 #endif
4647     SetProgLen(ri,RExC_size);
4648     RExC_rx_sv = rx;
4649     RExC_rx = r;
4650     RExC_rxi = ri;
4651
4652     /* Second pass: emit code. */
4653     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4654     RExC_parse = exp;
4655     RExC_end = xend;
4656     RExC_naughty = 0;
4657     RExC_npar = 1;
4658     RExC_emit_start = ri->program;
4659     RExC_emit = ri->program;
4660     RExC_emit_bound = ri->program + RExC_size + 1;
4661
4662     /* Store the count of eval-groups for security checks: */
4663     RExC_rx->seen_evals = RExC_seen_evals;
4664     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4665     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4666         ReREFCNT_dec(rx);
4667         return(NULL);
4668     }
4669     /* XXXX To minimize changes to RE engine we always allocate
4670        3-units-long substrs field. */
4671     Newx(r->substrs, 1, struct reg_substr_data);
4672     if (RExC_recurse_count) {
4673         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4674         SAVEFREEPV(RExC_recurse);
4675     }
4676
4677 reStudy:
4678     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4679     Zero(r->substrs, 1, struct reg_substr_data);
4680
4681 #ifdef TRIE_STUDY_OPT
4682     if (!restudied) {
4683         StructCopy(&zero_scan_data, &data, scan_data_t);
4684         copyRExC_state = RExC_state;
4685     } else {
4686         U32 seen=RExC_seen;
4687         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4688
4689         RExC_state = copyRExC_state;
4690         if (seen & REG_TOP_LEVEL_BRANCHES)
4691             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4692         else
4693             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4694         if (data.last_found) {
4695             SvREFCNT_dec(data.longest_fixed);
4696             SvREFCNT_dec(data.longest_float);
4697             SvREFCNT_dec(data.last_found);
4698         }
4699         StructCopy(&zero_scan_data, &data, scan_data_t);
4700     }
4701 #else
4702     StructCopy(&zero_scan_data, &data, scan_data_t);
4703 #endif
4704
4705     /* Dig out information for optimizations. */
4706     r->extflags = RExC_flags; /* was pm_op */
4707     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4708
4709     if (UTF)
4710         SvUTF8_on(rx);  /* Unicode in it? */
4711     ri->regstclass = NULL;
4712     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4713         r->intflags |= PREGf_NAUGHTY;
4714     scan = ri->program + 1;             /* First BRANCH. */
4715
4716     /* testing for BRANCH here tells us whether there is "must appear"
4717        data in the pattern. If there is then we can use it for optimisations */
4718     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4719         I32 fake;
4720         STRLEN longest_float_length, longest_fixed_length;
4721         struct regnode_charclass_class ch_class; /* pointed to by data */
4722         int stclass_flag;
4723         I32 last_close = 0; /* pointed to by data */
4724         regnode *first= scan;
4725         regnode *first_next= regnext(first);
4726         /*
4727          * Skip introductions and multiplicators >= 1
4728          * so that we can extract the 'meat' of the pattern that must
4729          * match in the large if() sequence following.
4730          * NOTE that EXACT is NOT covered here, as it is normally
4731          * picked up by the optimiser separately.
4732          *
4733          * This is unfortunate as the optimiser isnt handling lookahead
4734          * properly currently.
4735          *
4736          */
4737         while ((OP(first) == OPEN && (sawopen = 1)) ||
4738                /* An OR of *one* alternative - should not happen now. */
4739             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4740             /* for now we can't handle lookbehind IFMATCH*/
4741             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4742             (OP(first) == PLUS) ||
4743             (OP(first) == MINMOD) ||
4744                /* An {n,m} with n>0 */
4745             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4746             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4747         {
4748                 /*
4749                  * the only op that could be a regnode is PLUS, all the rest
4750                  * will be regnode_1 or regnode_2.
4751                  *
4752                  */
4753                 if (OP(first) == PLUS)
4754                     sawplus = 1;
4755                 else
4756                     first += regarglen[OP(first)];
4757
4758                 first = NEXTOPER(first);
4759                 first_next= regnext(first);
4760         }
4761
4762         /* Starting-point info. */
4763       again:
4764         DEBUG_PEEP("first:",first,0);
4765         /* Ignore EXACT as we deal with it later. */
4766         if (PL_regkind[OP(first)] == EXACT) {
4767             if (OP(first) == EXACT)
4768                 NOOP;   /* Empty, get anchored substr later. */
4769             else
4770                 ri->regstclass = first;
4771         }
4772 #ifdef TRIE_STCLASS
4773         else if (PL_regkind[OP(first)] == TRIE &&
4774                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4775         {
4776             regnode *trie_op;
4777             /* this can happen only on restudy */
4778             if ( OP(first) == TRIE ) {
4779                 struct regnode_1 *trieop = (struct regnode_1 *)
4780                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4781                 StructCopy(first,trieop,struct regnode_1);
4782                 trie_op=(regnode *)trieop;
4783             } else {
4784                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4785                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4786                 StructCopy(first,trieop,struct regnode_charclass);
4787                 trie_op=(regnode *)trieop;
4788             }
4789             OP(trie_op)+=2;
4790             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4791             ri->regstclass = trie_op;
4792         }
4793 #endif
4794         else if (REGNODE_SIMPLE(OP(first)))
4795             ri->regstclass = first;
4796         else if (PL_regkind[OP(first)] == BOUND ||
4797                  PL_regkind[OP(first)] == NBOUND)
4798             ri->regstclass = first;
4799         else if (PL_regkind[OP(first)] == BOL) {
4800             r->extflags |= (OP(first) == MBOL
4801                            ? RXf_ANCH_MBOL
4802                            : (OP(first) == SBOL
4803                               ? RXf_ANCH_SBOL
4804                               : RXf_ANCH_BOL));
4805             first = NEXTOPER(first);
4806             goto again;
4807         }
4808         else if (OP(first) == GPOS) {
4809             r->extflags |= RXf_ANCH_GPOS;
4810             first = NEXTOPER(first);
4811             goto again;
4812         }
4813         else if ((!sawopen || !RExC_sawback) &&
4814             (OP(first) == STAR &&
4815             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4816             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4817         {
4818             /* turn .* into ^.* with an implied $*=1 */
4819             const int type =
4820                 (OP(NEXTOPER(first)) == REG_ANY)
4821                     ? RXf_ANCH_MBOL
4822                     : RXf_ANCH_SBOL;
4823             r->extflags |= type;
4824             r->intflags |= PREGf_IMPLICIT;
4825             first = NEXTOPER(first);
4826             goto again;
4827         }
4828         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4829             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4830             /* x+ must match at the 1st pos of run of x's */
4831             r->intflags |= PREGf_SKIP;
4832
4833         /* Scan is after the zeroth branch, first is atomic matcher. */
4834 #ifdef TRIE_STUDY_OPT
4835         DEBUG_PARSE_r(
4836             if (!restudied)
4837                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4838                               (IV)(first - scan + 1))
4839         );
4840 #else
4841         DEBUG_PARSE_r(
4842             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4843                 (IV)(first - scan + 1))
4844         );
4845 #endif
4846
4847
4848         /*
4849         * If there's something expensive in the r.e., find the
4850         * longest literal string that must appear and make it the
4851         * regmust.  Resolve ties in favor of later strings, since
4852         * the regstart check works with the beginning of the r.e.
4853         * and avoiding duplication strengthens checking.  Not a
4854         * strong reason, but sufficient in the absence of others.
4855         * [Now we resolve ties in favor of the earlier string if
4856         * it happens that c_offset_min has been invalidated, since the
4857         * earlier string may buy us something the later one won't.]
4858         */
4859
4860         data.longest_fixed = newSVpvs("");
4861         data.longest_float = newSVpvs("");
4862         data.last_found = newSVpvs("");
4863         data.longest = &(data.longest_fixed);
4864         first = scan;
4865         if (!ri->regstclass) {
4866             cl_init(pRExC_state, &ch_class);
4867             data.start_class = &ch_class;
4868             stclass_flag = SCF_DO_STCLASS_AND;
4869         } else                          /* XXXX Check for BOUND? */
4870             stclass_flag = 0;
4871         data.last_closep = &last_close;
4872
4873         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4874             &data, -1, NULL, NULL,
4875             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4876
4877
4878         CHECK_RESTUDY_GOTO;
4879
4880
4881         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4882              && data.last_start_min == 0 && data.last_end > 0
4883              && !RExC_seen_zerolen
4884              && !(RExC_seen & REG_SEEN_VERBARG)
4885              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4886             r->extflags |= RXf_CHECK_ALL;
4887         scan_commit(pRExC_state, &data,&minlen,0);
4888         SvREFCNT_dec(data.last_found);
4889
4890         /* Note that code very similar to this but for anchored string
4891            follows immediately below, changes may need to be made to both.
4892            Be careful.
4893          */
4894         longest_float_length = CHR_SVLEN(data.longest_float);
4895         if (longest_float_length
4896             || (data.flags & SF_FL_BEFORE_EOL
4897                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4898                     || (RExC_flags & RXf_PMf_MULTILINE))))
4899         {
4900             I32 t,ml;
4901
4902             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4903                 && data.offset_fixed == data.offset_float_min
4904                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4905                     goto remove_float;          /* As in (a)+. */
4906
4907             /* copy the information about the longest float from the reg_scan_data
4908                over to the program. */
4909             if (SvUTF8(data.longest_float)) {
4910                 r->float_utf8 = data.longest_float;
4911                 r->float_substr = NULL;
4912             } else {
4913                 r->float_substr = data.longest_float;
4914                 r->float_utf8 = NULL;
4915             }
4916             /* float_end_shift is how many chars that must be matched that
4917                follow this item. We calculate it ahead of time as once the
4918                lookbehind offset is added in we lose the ability to correctly
4919                calculate it.*/
4920             ml = data.minlen_float ? *(data.minlen_float)
4921                                    : (I32)longest_float_length;
4922             r->float_end_shift = ml - data.offset_float_min
4923                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4924                 + data.lookbehind_float;
4925             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4926             r->float_max_offset = data.offset_float_max;
4927             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4928                 r->float_max_offset -= data.lookbehind_float;
4929
4930             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4931                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4932                            || (RExC_flags & RXf_PMf_MULTILINE)));
4933             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4934         }
4935         else {
4936           remove_float:
4937             r->float_substr = r->float_utf8 = NULL;
4938             SvREFCNT_dec(data.longest_float);
4939             longest_float_length = 0;
4940         }
4941
4942         /* Note that code very similar to this but for floating string
4943            is immediately above, changes may need to be made to both.
4944            Be careful.
4945          */
4946         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4947         if (longest_fixed_length
4948             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4949                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4950                     || (RExC_flags & RXf_PMf_MULTILINE))))
4951         {
4952             I32 t,ml;
4953
4954             /* copy the information about the longest fixed
4955                from the reg_scan_data over to the program. */
4956             if (SvUTF8(data.longest_fixed)) {
4957                 r->anchored_utf8 = data.longest_fixed;
4958                 r->anchored_substr = NULL;
4959             } else {
4960                 r->anchored_substr = data.longest_fixed;
4961                 r->anchored_utf8 = NULL;
4962             }
4963             /* fixed_end_shift is how many chars that must be matched that
4964                follow this item. We calculate it ahead of time as once the
4965                lookbehind offset is added in we lose the ability to correctly
4966                calculate it.*/
4967             ml = data.minlen_fixed ? *(data.minlen_fixed)
4968                                    : (I32)longest_fixed_length;
4969             r->anchored_end_shift = ml - data.offset_fixed
4970                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4971                 + data.lookbehind_fixed;
4972             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4973
4974             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4975                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4976                      || (RExC_flags & RXf_PMf_MULTILINE)));
4977             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4978         }
4979         else {
4980             r->anchored_substr = r->anchored_utf8 = NULL;
4981             SvREFCNT_dec(data.longest_fixed);
4982             longest_fixed_length = 0;
4983         }
4984         if (ri->regstclass
4985             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4986             ri->regstclass = NULL;
4987         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4988             && stclass_flag
4989             && !(data.start_class->flags & ANYOF_EOS)
4990             && !cl_is_anything(data.start_class))
4991         {
4992             const U32 n = add_data(pRExC_state, 1, "f");
4993
4994             Newx(RExC_rxi->data->data[n], 1,
4995                 struct regnode_charclass_class);
4996             StructCopy(data.start_class,
4997                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4998                        struct regnode_charclass_class);
4999             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5000             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5001             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5002                       regprop(r, sv, (regnode*)data.start_class);
5003                       PerlIO_printf(Perl_debug_log,
5004                                     "synthetic stclass \"%s\".\n",
5005                                     SvPVX_const(sv));});
5006         }
5007
5008         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5009         if (longest_fixed_length > longest_float_length) {
5010             r->check_end_shift = r->anchored_end_shift;
5011             r->check_substr = r->anchored_substr;
5012             r->check_utf8 = r->anchored_utf8;
5013             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5014             if (r->extflags & RXf_ANCH_SINGLE)
5015                 r->extflags |= RXf_NOSCAN;
5016         }
5017         else {
5018             r->check_end_shift = r->float_end_shift;
5019             r->check_substr = r->float_substr;
5020             r->check_utf8 = r->float_utf8;
5021             r->check_offset_min = r->float_min_offset;
5022             r->check_offset_max = r->float_max_offset;
5023         }
5024         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5025            This should be changed ASAP!  */
5026         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5027             r->extflags |= RXf_USE_INTUIT;
5028             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5029                 r->extflags |= RXf_INTUIT_TAIL;
5030         }
5031         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5032         if ( (STRLEN)minlen < longest_float_length )
5033             minlen= longest_float_length;
5034         if ( (STRLEN)minlen < longest_fixed_length )
5035             minlen= longest_fixed_length;
5036         */
5037     }
5038     else {
5039         /* Several toplevels. Best we can is to set minlen. */
5040         I32 fake;
5041         struct regnode_charclass_class ch_class;
5042         I32 last_close = 0;
5043
5044         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5045
5046         scan = ri->program + 1;
5047         cl_init(pRExC_state, &ch_class);
5048         data.start_class = &ch_class;
5049         data.last_closep = &last_close;
5050
5051
5052         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5053             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5054
5055         CHECK_RESTUDY_GOTO;
5056
5057         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5058                 = r->float_substr = r->float_utf8 = NULL;
5059         if (!(data.start_class->flags & ANYOF_EOS)
5060             && !cl_is_anything(data.start_class))
5061         {
5062             const U32 n = add_data(pRExC_state, 1, "f");
5063
5064             Newx(RExC_rxi->data->data[n], 1,
5065                 struct regnode_charclass_class);
5066             StructCopy(data.start_class,
5067                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5068                        struct regnode_charclass_class);
5069             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5070             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5071             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5072                       regprop(r, sv, (regnode*)data.start_class);
5073                       PerlIO_printf(Perl_debug_log,
5074                                     "synthetic stclass \"%s\".\n",
5075                                     SvPVX_const(sv));});
5076         }
5077     }
5078
5079     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5080        the "real" pattern. */
5081     DEBUG_OPTIMISE_r({
5082         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5083                       (IV)minlen, (IV)r->minlen);
5084     });
5085     r->minlenret = minlen;
5086     if (r->minlen < minlen)
5087         r->minlen = minlen;
5088
5089     if (RExC_seen & REG_SEEN_GPOS)
5090         r->extflags |= RXf_GPOS_SEEN;
5091     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5092         r->extflags |= RXf_LOOKBEHIND_SEEN;
5093     if (RExC_seen & REG_SEEN_EVAL)
5094         r->extflags |= RXf_EVAL_SEEN;
5095     if (RExC_seen & REG_SEEN_CANY)
5096         r->extflags |= RXf_CANY_SEEN;
5097     if (RExC_seen & REG_SEEN_VERBARG)
5098         r->intflags |= PREGf_VERBARG_SEEN;
5099     if (RExC_seen & REG_SEEN_CUTGROUP)
5100         r->intflags |= PREGf_CUTGROUP_SEEN;
5101     if (RExC_paren_names)
5102         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5103     else
5104         RXp_PAREN_NAMES(r) = NULL;
5105
5106 #ifdef STUPID_PATTERN_CHECKS
5107     if (RX_PRELEN(rx) == 0)
5108         r->extflags |= RXf_NULL;
5109     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5110         /* XXX: this should happen BEFORE we compile */
5111         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5112     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5113         r->extflags |= RXf_WHITE;
5114     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5115         r->extflags |= RXf_START_ONLY;
5116 #else
5117     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5118             /* XXX: this should happen BEFORE we compile */
5119             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5120     else {
5121         regnode *first = ri->program + 1;
5122         U8 fop = OP(first);
5123         U8 nop = OP(NEXTOPER(first));
5124
5125         if (PL_regkind[fop] == NOTHING && nop == END)
5126             r->extflags |= RXf_NULL;
5127         else if (PL_regkind[fop] == BOL && nop == END)
5128             r->extflags |= RXf_START_ONLY;
5129         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5130             r->extflags |= RXf_WHITE;
5131     }
5132 #endif
5133 #ifdef DEBUGGING
5134     if (RExC_paren_names) {
5135         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5136         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5137     } else
5138 #endif
5139         ri->name_list_idx = 0;
5140
5141     if (RExC_recurse_count) {
5142         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5143             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5144             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5145         }
5146     }
5147     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5148     /* assume we don't need to swap parens around before we match */
5149
5150     DEBUG_DUMP_r({
5151         PerlIO_printf(Perl_debug_log,"Final program:\n");
5152         regdump(r);
5153     });
5154 #ifdef RE_TRACK_PATTERN_OFFSETS
5155     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5156         const U32 len = ri->u.offsets[0];
5157         U32 i;
5158         GET_RE_DEBUG_FLAGS_DECL;
5159         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5160         for (i = 1; i <= len; i++) {
5161             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5162                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5163                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5164             }
5165         PerlIO_printf(Perl_debug_log, "\n");
5166     });
5167 #endif
5168     return rx;
5169 }
5170
5171 #undef RE_ENGINE_PTR
5172
5173
5174 SV*
5175 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5176                     const U32 flags)
5177 {
5178     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5179
5180     PERL_UNUSED_ARG(value);
5181
5182     if (flags & RXapif_FETCH) {
5183         return reg_named_buff_fetch(rx, key, flags);
5184     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5185         Perl_croak_no_modify(aTHX);
5186         return NULL;
5187     } else if (flags & RXapif_EXISTS) {
5188         return reg_named_buff_exists(rx, key, flags)
5189             ? &PL_sv_yes
5190             : &PL_sv_no;
5191     } else if (flags & RXapif_REGNAMES) {
5192         return reg_named_buff_all(rx, flags);
5193     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5194         return reg_named_buff_scalar(rx, flags);
5195     } else {
5196         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5197         return NULL;
5198     }
5199 }
5200
5201 SV*
5202 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5203                          const U32 flags)
5204 {
5205     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5206     PERL_UNUSED_ARG(lastkey);
5207
5208     if (flags & RXapif_FIRSTKEY)
5209         return reg_named_buff_firstkey(rx, flags);
5210     else if (flags & RXapif_NEXTKEY)
5211         return reg_named_buff_nextkey(rx, flags);
5212     else {
5213         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5214         return NULL;
5215     }
5216 }
5217
5218 SV*
5219 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5220                           const U32 flags)
5221 {
5222     AV *retarray = NULL;
5223     SV *ret;
5224     struct regexp *const rx = (struct regexp *)SvANY(r);
5225
5226     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5227
5228     if (flags & RXapif_ALL)
5229         retarray=newAV();
5230
5231     if (rx && RXp_PAREN_NAMES(rx)) {
5232         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5233         if (he_str) {
5234             IV i;
5235             SV* sv_dat=HeVAL(he_str);
5236             I32 *nums=(I32*)SvPVX(sv_dat);
5237             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5238                 if ((I32)(rx->nparens) >= nums[i]
5239                     && rx->offs[nums[i]].start != -1
5240                     && rx->offs[nums[i]].end != -1)
5241                 {
5242                     ret = newSVpvs("");
5243                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5244                     if (!retarray)
5245                         return ret;
5246                 } else {
5247                     ret = newSVsv(&PL_sv_undef);
5248                 }
5249                 if (retarray)
5250                     av_push(retarray, ret);
5251             }
5252             if (retarray)
5253                 return newRV_noinc(MUTABLE_SV(retarray));
5254         }
5255     }
5256     return NULL;
5257 }
5258
5259 bool
5260 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5261                            const U32 flags)
5262 {
5263     struct regexp *const rx = (struct regexp *)SvANY(r);
5264
5265     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5266
5267     if (rx && RXp_PAREN_NAMES(rx)) {
5268         if (flags & RXapif_ALL) {
5269             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5270         } else {
5271             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5272             if (sv) {
5273                 SvREFCNT_dec(sv);
5274                 return TRUE;
5275             } else {
5276                 return FALSE;
5277             }
5278         }
5279     } else {
5280         return FALSE;
5281     }
5282 }
5283
5284 SV*
5285 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5286 {
5287     struct regexp *const rx = (struct regexp *)SvANY(r);
5288
5289     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5290
5291     if ( rx && RXp_PAREN_NAMES(rx) ) {
5292         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5293
5294         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5295     } else {
5296         return FALSE;
5297     }
5298 }
5299
5300 SV*
5301 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5302 {
5303     struct regexp *const rx = (struct regexp *)SvANY(r);
5304     GET_RE_DEBUG_FLAGS_DECL;
5305
5306     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5307
5308     if (rx && RXp_PAREN_NAMES(rx)) {
5309         HV *hv = RXp_PAREN_NAMES(rx);
5310         HE *temphe;
5311         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5312             IV i;
5313             IV parno = 0;
5314             SV* sv_dat = HeVAL(temphe);
5315             I32 *nums = (I32*)SvPVX(sv_dat);
5316             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5317                 if ((I32)(rx->lastparen) >= nums[i] &&
5318                     rx->offs[nums[i]].start != -1 &&
5319                     rx->offs[nums[i]].end != -1)
5320                 {
5321                     parno = nums[i];
5322                     break;
5323                 }
5324             }
5325             if (parno || flags & RXapif_ALL) {
5326                 return newSVhek(HeKEY_hek(temphe));
5327             }
5328         }
5329     }
5330     return NULL;
5331 }
5332
5333 SV*
5334 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5335 {
5336     SV *ret;
5337     AV *av;
5338     I32 length;
5339     struct regexp *const rx = (struct regexp *)SvANY(r);
5340
5341     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5342
5343     if (rx && RXp_PAREN_NAMES(rx)) {
5344         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5345             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5346         } else if (flags & RXapif_ONE) {
5347             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5348             av = MUTABLE_AV(SvRV(ret));
5349             length = av_len(av);
5350             SvREFCNT_dec(ret);
5351             return newSViv(length + 1);
5352         } else {
5353             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5354             return NULL;
5355         }
5356     }
5357     return &PL_sv_undef;
5358 }
5359
5360 SV*
5361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5362 {
5363     struct regexp *const rx = (struct regexp *)SvANY(r);
5364     AV *av = newAV();
5365
5366     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5367
5368     if (rx && RXp_PAREN_NAMES(rx)) {
5369         HV *hv= RXp_PAREN_NAMES(rx);
5370         HE *temphe;
5371         (void)hv_iterinit(hv);
5372         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5373             IV i;
5374             IV parno = 0;
5375             SV* sv_dat = HeVAL(temphe);
5376             I32 *nums = (I32*)SvPVX(sv_dat);
5377             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5378                 if ((I32)(rx->lastparen) >= nums[i] &&
5379                     rx->offs[nums[i]].start != -1 &&
5380                     rx->offs[nums[i]].end != -1)
5381                 {
5382                     parno = nums[i];
5383                     break;
5384                 }
5385             }
5386             if (parno || flags & RXapif_ALL) {
5387                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5388             }
5389         }
5390     }
5391
5392     return newRV_noinc(MUTABLE_SV(av));
5393 }
5394
5395 void
5396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5397                              SV * const sv)
5398 {
5399     struct regexp *const rx = (struct regexp *)SvANY(r);
5400     char *s = NULL;
5401     I32 i = 0;
5402     I32 s1, t1;
5403
5404     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5405
5406     if (!rx->subbeg) {
5407         sv_setsv(sv,&PL_sv_undef);
5408         return;
5409     }
5410     else
5411     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5412         /* $` */
5413         i = rx->offs[0].start;
5414         s = rx->subbeg;
5415     }
5416     else
5417     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5418         /* $' */
5419         s = rx->subbeg + rx->offs[0].end;
5420         i = rx->sublen - rx->offs[0].end;
5421     }
5422     else
5423     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5424         (s1 = rx->offs[paren].start) != -1 &&
5425         (t1 = rx->offs[paren].end) != -1)
5426     {
5427         /* $& $1 ... */
5428         i = t1 - s1;
5429         s = rx->subbeg + s1;
5430     } else {
5431         sv_setsv(sv,&PL_sv_undef);
5432         return;
5433     }
5434     assert(rx->sublen >= (s - rx->subbeg) + i );
5435     if (i >= 0) {
5436         const int oldtainted = PL_tainted;
5437         TAINT_NOT;
5438         sv_setpvn(sv, s, i);
5439         PL_tainted = oldtainted;
5440         if ( (rx->extflags & RXf_CANY_SEEN)
5441             ? (RXp_MATCH_UTF8(rx)
5442                         && (!i || is_utf8_string((U8*)s, i)))
5443             : (RXp_MATCH_UTF8(rx)) )
5444         {
5445             SvUTF8_on(sv);
5446         }
5447         else
5448             SvUTF8_off(sv);
5449         if (PL_tainting) {
5450             if (RXp_MATCH_TAINTED(rx)) {
5451                 if (SvTYPE(sv) >= SVt_PVMG) {
5452                     MAGIC* const mg = SvMAGIC(sv);
5453                     MAGIC* mgt;
5454                     PL_tainted = 1;
5455                     SvMAGIC_set(sv, mg->mg_moremagic);
5456                     SvTAINT(sv);
5457                     if ((mgt = SvMAGIC(sv))) {
5458                         mg->mg_moremagic = mgt;
5459                         SvMAGIC_set(sv, mg);
5460                     }
5461                 } else {
5462                     PL_tainted = 1;
5463                     SvTAINT(sv);
5464                 }
5465             } else
5466                 SvTAINTED_off(sv);
5467         }
5468     } else {
5469         sv_setsv(sv,&PL_sv_undef);
5470         return;
5471     }
5472 }
5473
5474 void
5475 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5476                                                          SV const * const value)
5477 {
5478     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5479
5480     PERL_UNUSED_ARG(rx);
5481     PERL_UNUSED_ARG(paren);
5482     PERL_UNUSED_ARG(value);
5483
5484     if (!PL_localizing)
5485         Perl_croak_no_modify(aTHX);
5486 }
5487
5488 I32
5489 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5490                               const I32 paren)
5491 {
5492     struct regexp *const rx = (struct regexp *)SvANY(r);
5493     I32 i;
5494     I32 s1, t1;
5495
5496     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5497
5498     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5499         switch (paren) {
5500       /* $` / ${^PREMATCH} */
5501       case RX_BUFF_IDX_PREMATCH:
5502         if (rx->offs[0].start != -1) {
5503                         i = rx->offs[0].start;
5504                         if (i > 0) {
5505                                 s1 = 0;
5506                                 t1 = i;
5507                                 goto getlen;
5508                         }
5509             }
5510         return 0;
5511       /* $' / ${^POSTMATCH} */
5512       case RX_BUFF_IDX_POSTMATCH:
5513             if (rx->offs[0].end != -1) {
5514                         i = rx->sublen - rx->offs[0].end;
5515                         if (i > 0) {
5516                                 s1 = rx->offs[0].end;
5517                                 t1 = rx->sublen;
5518                                 goto getlen;
5519                         }
5520             }
5521         return 0;
5522       /* $& / ${^MATCH}, $1, $2, ... */
5523       default:
5524             if (paren <= (I32)rx->nparens &&
5525             (s1 = rx->offs[paren].start) != -1 &&
5526             (t1 = rx->offs[paren].end) != -1)
5527             {
5528             i = t1 - s1;
5529             goto getlen;
5530         } else {
5531             if (ckWARN(WARN_UNINITIALIZED))
5532                 report_uninit((const SV *)sv);
5533             return 0;
5534         }
5535     }
5536   getlen:
5537     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5538         const char * const s = rx->subbeg + s1;
5539         const U8 *ep;
5540         STRLEN el;
5541
5542         i = t1 - s1;
5543         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5544                         i = el;
5545     }
5546     return i;
5547 }
5548
5549 SV*
5550 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5551 {
5552     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5553         PERL_UNUSED_ARG(rx);
5554         if (0)
5555             return NULL;
5556         else
5557             return newSVpvs("Regexp");
5558 }
5559
5560 /* Scans the name of a named buffer from the pattern.
5561  * If flags is REG_RSN_RETURN_NULL returns null.
5562  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5563  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5564  * to the parsed name as looked up in the RExC_paren_names hash.
5565  * If there is an error throws a vFAIL().. type exception.
5566  */
5567
5568 #define REG_RSN_RETURN_NULL    0
5569 #define REG_RSN_RETURN_NAME    1
5570 #define REG_RSN_RETURN_DATA    2
5571
5572 STATIC SV*
5573 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5574 {
5575     char *name_start = RExC_parse;
5576
5577     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5578
5579     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5580          /* skip IDFIRST by using do...while */
5581         if (UTF)
5582             do {
5583                 RExC_parse += UTF8SKIP(RExC_parse);
5584             } while (isALNUM_utf8((U8*)RExC_parse));
5585         else
5586             do {
5587                 RExC_parse++;
5588             } while (isALNUM(*RExC_parse));
5589     }
5590
5591     if ( flags ) {
5592         SV* sv_name
5593             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5594                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5595         if ( flags == REG_RSN_RETURN_NAME)
5596             return sv_name;
5597         else if (flags==REG_RSN_RETURN_DATA) {
5598             HE *he_str = NULL;
5599             SV *sv_dat = NULL;
5600             if ( ! sv_name )      /* should not happen*/
5601                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5602             if (RExC_paren_names)
5603                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5604             if ( he_str )
5605                 sv_dat = HeVAL(he_str);
5606             if ( ! sv_dat )
5607                 vFAIL("Reference to nonexistent named group");
5608             return sv_dat;
5609         }
5610         else {
5611             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5612         }
5613         /* NOT REACHED */
5614     }
5615     return NULL;
5616 }
5617
5618 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5619     int rem=(int)(RExC_end - RExC_parse);                       \
5620     int cut;                                                    \
5621     int num;                                                    \
5622     int iscut=0;                                                \
5623     if (rem>10) {                                               \
5624         rem=10;                                                 \
5625         iscut=1;                                                \
5626     }                                                           \
5627     cut=10-rem;                                                 \
5628     if (RExC_lastparse!=RExC_parse)                             \
5629         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5630             rem, RExC_parse,                                    \
5631             cut + 4,                                            \
5632             iscut ? "..." : "<"                                 \
5633         );                                                      \
5634     else                                                        \
5635         PerlIO_printf(Perl_debug_log,"%16s","");                \
5636                                                                 \
5637     if (SIZE_ONLY)                                              \
5638        num = RExC_size + 1;                                     \
5639     else                                                        \
5640        num=REG_NODE_NUM(RExC_emit);                             \
5641     if (RExC_lastnum!=num)                                      \
5642        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5643     else                                                        \
5644        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5645     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5646         (int)((depth*2)), "",                                   \
5647         (funcname)                                              \
5648     );                                                          \
5649     RExC_lastnum=num;                                           \
5650     RExC_lastparse=RExC_parse;                                  \
5651 })
5652
5653
5654
5655 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5656     DEBUG_PARSE_MSG((funcname));                            \
5657     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5658 })
5659 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5660     DEBUG_PARSE_MSG((funcname));                            \
5661     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5662 })
5663 /*
5664  - reg - regular expression, i.e. main body or parenthesized thing
5665  *
5666  * Caller must absorb opening parenthesis.
5667  *
5668  * Combining parenthesis handling with the base level of regular expression
5669  * is a trifle forced, but the need to tie the tails of the branches to what
5670  * follows makes it hard to avoid.
5671  */
5672 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5673 #ifdef DEBUGGING
5674 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5675 #else
5676 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5677 #endif
5678
5679 STATIC regnode *
5680 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5681     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5682 {
5683     dVAR;
5684     register regnode *ret;              /* Will be the head of the group. */
5685     register regnode *br;
5686     register regnode *lastbr;
5687     register regnode *ender = NULL;
5688     register I32 parno = 0;
5689     I32 flags;
5690     U32 oregflags = RExC_flags;
5691     bool have_branch = 0;
5692     bool is_open = 0;
5693     I32 freeze_paren = 0;
5694     I32 after_freeze = 0;
5695
5696     /* for (?g), (?gc), and (?o) warnings; warning
5697        about (?c) will warn about (?g) -- japhy    */
5698
5699 #define WASTED_O  0x01
5700 #define WASTED_G  0x02
5701 #define WASTED_C  0x04
5702 #define WASTED_GC (0x02|0x04)
5703     I32 wastedflags = 0x00;
5704
5705     char * parse_start = RExC_parse; /* MJD */
5706     char * const oregcomp_parse = RExC_parse;
5707
5708     GET_RE_DEBUG_FLAGS_DECL;
5709
5710     PERL_ARGS_ASSERT_REG;
5711     DEBUG_PARSE("reg ");
5712
5713     *flagp = 0;                         /* Tentatively. */
5714
5715
5716     /* Make an OPEN node, if parenthesized. */
5717     if (paren) {
5718         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5719             char *start_verb = RExC_parse;
5720             STRLEN verb_len = 0;
5721             char *start_arg = NULL;
5722             unsigned char op = 0;
5723             int argok = 1;
5724             int internal_argval = 0; /* internal_argval is only useful if !argok */
5725             while ( *RExC_parse && *RExC_parse != ')' ) {
5726                 if ( *RExC_parse == ':' ) {
5727                     start_arg = RExC_parse + 1;
5728                     break;
5729                 }
5730                 RExC_parse++;
5731             }
5732             ++start_verb;
5733             verb_len = RExC_parse - start_verb;
5734             if ( start_arg ) {
5735                 RExC_parse++;
5736                 while ( *RExC_parse && *RExC_parse != ')' )
5737                     RExC_parse++;
5738                 if ( *RExC_parse != ')' )
5739                     vFAIL("Unterminated verb pattern argument");
5740                 if ( RExC_parse == start_arg )
5741                     start_arg = NULL;
5742             } else {
5743                 if ( *RExC_parse != ')' )
5744                     vFAIL("Unterminated verb pattern");
5745             }
5746
5747             switch ( *start_verb ) {
5748             case 'A':  /* (*ACCEPT) */
5749                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5750                     op = ACCEPT;
5751                     internal_argval = RExC_nestroot;
5752                 }
5753                 break;
5754             case 'C':  /* (*COMMIT) */
5755                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5756                     op = COMMIT;
5757                 break;
5758             case 'F':  /* (*FAIL) */
5759                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5760                     op = OPFAIL;
5761                     argok = 0;
5762                 }
5763                 break;
5764             case ':':  /* (*:NAME) */
5765             case 'M':  /* (*MARK:NAME) */
5766                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5767                     op = MARKPOINT;
5768                     argok = -1;
5769                 }
5770                 break;
5771             case 'P':  /* (*PRUNE) */
5772                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5773                     op = PRUNE;
5774                 break;
5775             case 'S':   /* (*SKIP) */
5776                 if ( memEQs(start_verb,verb_len,"SKIP") )
5777                     op = SKIP;
5778                 break;
5779             case 'T':  /* (*THEN) */
5780                 /* [19:06] <TimToady> :: is then */
5781                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5782                     op = CUTGROUP;
5783                     RExC_seen |= REG_SEEN_CUTGROUP;
5784                 }
5785                 break;
5786             }
5787             if ( ! op ) {
5788                 RExC_parse++;
5789                 vFAIL3("Unknown verb pattern '%.*s'",
5790                     verb_len, start_verb);
5791             }
5792             if ( argok ) {
5793                 if ( start_arg && internal_argval ) {
5794                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5795                         verb_len, start_verb);
5796                 } else if ( argok < 0 && !start_arg ) {
5797                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5798                         verb_len, start_verb);
5799                 } else {
5800                     ret = reganode(pRExC_state, op, internal_argval);
5801                     if ( ! internal_argval && ! SIZE_ONLY ) {
5802                         if (start_arg) {
5803                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5804                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5805                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5806                             ret->flags = 0;
5807                         } else {
5808                             ret->flags = 1;
5809                         }
5810                     }
5811                 }
5812                 if (!internal_argval)
5813                     RExC_seen |= REG_SEEN_VERBARG;
5814             } else if ( start_arg ) {
5815                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5816                         verb_len, start_verb);
5817             } else {
5818                 ret = reg_node(pRExC_state, op);
5819             }
5820             nextchar(pRExC_state);
5821             return ret;
5822         } else
5823         if (*RExC_parse == '?') { /* (?...) */
5824             bool is_logical = 0;
5825             const char * const seqstart = RExC_parse;
5826             bool has_use_defaults = FALSE;
5827
5828             RExC_parse++;
5829             paren = *RExC_parse++;
5830             ret = NULL;                 /* For look-ahead/behind. */
5831             switch (paren) {
5832
5833             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5834                 paren = *RExC_parse++;
5835                 if ( paren == '<')         /* (?P<...>) named capture */
5836                     goto named_capture;
5837                 else if (paren == '>') {   /* (?P>name) named recursion */
5838                     goto named_recursion;
5839                 }
5840                 else if (paren == '=') {   /* (?P=...)  named backref */
5841                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5842                        you change this make sure you change that */
5843                     char* name_start = RExC_parse;
5844                     U32 num = 0;
5845                     SV *sv_dat = reg_scan_name(pRExC_state,
5846                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847                     if (RExC_parse == name_start || *RExC_parse != ')')
5848                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5849
5850                     if (!SIZE_ONLY) {
5851                         num = add_data( pRExC_state, 1, "S" );
5852                         RExC_rxi->data->data[num]=(void*)sv_dat;
5853                         SvREFCNT_inc_simple_void(sv_dat);
5854                     }
5855                     RExC_sawback = 1;
5856                     ret = reganode(pRExC_state,
5857                                    ((! FOLD)
5858                                      ? NREF
5859                                      : (UNI_SEMANTICS)
5860                                        ? NREFFU
5861                                        : (LOC)
5862                                          ? NREFFL
5863                                          : NREFF),
5864                                     num);
5865                     *flagp |= HASWIDTH;
5866
5867                     Set_Node_Offset(ret, parse_start+1);
5868                     Set_Node_Cur_Length(ret); /* MJD */
5869
5870                     nextchar(pRExC_state);
5871                     return ret;
5872                 }
5873                 RExC_parse++;
5874                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5875                 /*NOTREACHED*/
5876             case '<':           /* (?<...) */
5877                 if (*RExC_parse == '!')
5878                     paren = ',';
5879                 else if (*RExC_parse != '=')
5880               named_capture:
5881                 {               /* (?<...>) */
5882                     char *name_start;
5883                     SV *svname;
5884                     paren= '>';
5885             case '\'':          /* (?'...') */
5886                     name_start= RExC_parse;
5887                     svname = reg_scan_name(pRExC_state,
5888                         SIZE_ONLY ?  /* reverse test from the others */
5889                         REG_RSN_RETURN_NAME :
5890                         REG_RSN_RETURN_NULL);
5891                     if (RExC_parse == name_start) {
5892                         RExC_parse++;
5893                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5894                         /*NOTREACHED*/
5895                     }
5896                     if (*RExC_parse != paren)
5897                         vFAIL2("Sequence (?%c... not terminated",
5898                             paren=='>' ? '<' : paren);
5899                     if (SIZE_ONLY) {
5900                         HE *he_str;
5901                         SV *sv_dat = NULL;
5902                         if (!svname) /* shouldnt happen */
5903                             Perl_croak(aTHX_
5904                                 "panic: reg_scan_name returned NULL");
5905                         if (!RExC_paren_names) {
5906                             RExC_paren_names= newHV();
5907                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5908 #ifdef DEBUGGING
5909                             RExC_paren_name_list= newAV();
5910                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5911 #endif
5912                         }
5913                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5914                         if ( he_str )
5915                             sv_dat = HeVAL(he_str);
5916                         if ( ! sv_dat ) {
5917                             /* croak baby croak */
5918                             Perl_croak(aTHX_
5919                                 "panic: paren_name hash element allocation failed");
5920                         } else if ( SvPOK(sv_dat) ) {
5921                             /* (?|...) can mean we have dupes so scan to check
5922                                its already been stored. Maybe a flag indicating
5923                                we are inside such a construct would be useful,
5924                                but the arrays are likely to be quite small, so
5925                                for now we punt -- dmq */
5926                             IV count = SvIV(sv_dat);
5927                             I32 *pv = (I32*)SvPVX(sv_dat);
5928                             IV i;
5929                             for ( i = 0 ; i < count ; i++ ) {
5930                                 if ( pv[i] == RExC_npar ) {
5931                                     count = 0;
5932                                     break;
5933                                 }
5934                             }
5935                             if ( count ) {
5936                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5937                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5938                                 pv[count] = RExC_npar;
5939                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5940                             }
5941                         } else {
5942                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5943                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5944                             SvIOK_on(sv_dat);
5945                             SvIV_set(sv_dat, 1);
5946                         }
5947 #ifdef DEBUGGING
5948                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5949                             SvREFCNT_dec(svname);
5950 #endif
5951
5952                         /*sv_dump(sv_dat);*/
5953                     }
5954                     nextchar(pRExC_state);
5955                     paren = 1;
5956                     goto capturing_parens;
5957                 }
5958                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5959                 RExC_parse++;
5960             case '=':           /* (?=...) */
5961                 RExC_seen_zerolen++;
5962                 break;
5963             case '!':           /* (?!...) */
5964                 RExC_seen_zerolen++;
5965                 if (*RExC_parse == ')') {
5966                     ret=reg_node(pRExC_state, OPFAIL);
5967                     nextchar(pRExC_state);
5968                     return ret;
5969                 }
5970                 break;
5971             case '|':           /* (?|...) */
5972                 /* branch reset, behave like a (?:...) except that
5973                    buffers in alternations share the same numbers */
5974                 paren = ':';
5975                 after_freeze = freeze_paren = RExC_npar;
5976                 break;
5977             case ':':           /* (?:...) */
5978             case '>':           /* (?>...) */
5979                 break;
5980             case '$':           /* (?$...) */
5981             case '@':           /* (?@...) */
5982                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5983                 break;
5984             case '#':           /* (?#...) */
5985                 while (*RExC_parse && *RExC_parse != ')')
5986                     RExC_parse++;
5987                 if (*RExC_parse != ')')
5988                     FAIL("Sequence (?#... not terminated");
5989                 nextchar(pRExC_state);
5990                 *flagp = TRYAGAIN;
5991                 return NULL;
5992             case '0' :           /* (?0) */
5993             case 'R' :           /* (?R) */
5994                 if (*RExC_parse != ')')
5995                     FAIL("Sequence (?R) not terminated");
5996                 ret = reg_node(pRExC_state, GOSTART);
5997                 *flagp |= POSTPONED;
5998                 nextchar(pRExC_state);
5999                 return ret;
6000                 /*notreached*/
6001             { /* named and numeric backreferences */
6002                 I32 num;
6003             case '&':            /* (?&NAME) */
6004                 parse_start = RExC_parse - 1;
6005               named_recursion:
6006                 {
6007                     SV *sv_dat = reg_scan_name(pRExC_state,
6008                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6009                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6010                 }
6011                 goto gen_recurse_regop;
6012                 /* NOT REACHED */
6013             case '+':
6014                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6015                     RExC_parse++;
6016                     vFAIL("Illegal pattern");
6017                 }
6018                 goto parse_recursion;
6019                 /* NOT REACHED*/
6020             case '-': /* (?-1) */
6021                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6022                     RExC_parse--; /* rewind to let it be handled later */
6023                     goto parse_flags;
6024                 }
6025                 /*FALLTHROUGH */
6026             case '1': case '2': case '3': case '4': /* (?1) */
6027             case '5': case '6': case '7': case '8': case '9':
6028                 RExC_parse--;
6029               parse_recursion:
6030                 num = atoi(RExC_parse);
6031                 parse_start = RExC_parse - 1; /* MJD */
6032                 if (*RExC_parse == '-')
6033                     RExC_parse++;
6034                 while (isDIGIT(*RExC_parse))
6035                         RExC_parse++;
6036                 if (*RExC_parse!=')')
6037                     vFAIL("Expecting close bracket");
6038
6039               gen_recurse_regop:
6040                 if ( paren == '-' ) {
6041                     /*
6042                     Diagram of capture buffer numbering.
6043                     Top line is the normal capture buffer numbers
6044                     Bottom line is the negative indexing as from
6045                     the X (the (?-2))
6046
6047                     +   1 2    3 4 5 X          6 7
6048                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6049                     -   5 4    3 2 1 X          x x
6050
6051                     */
6052                     num = RExC_npar + num;
6053                     if (num < 1)  {
6054                         RExC_parse++;
6055                         vFAIL("Reference to nonexistent group");
6056                     }
6057                 } else if ( paren == '+' ) {
6058                     num = RExC_npar + num - 1;
6059                 }
6060
6061                 ret = reganode(pRExC_state, GOSUB, num);
6062                 if (!SIZE_ONLY) {
6063                     if (num > (I32)RExC_rx->nparens) {
6064                         RExC_parse++;
6065                         vFAIL("Reference to nonexistent group");
6066                     }
6067                     ARG2L_SET( ret, RExC_recurse_count++);
6068                     RExC_emit++;
6069                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6070                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6071                 } else {
6072                     RExC_size++;
6073                 }
6074                 RExC_seen |= REG_SEEN_RECURSE;
6075                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6076                 Set_Node_Offset(ret, parse_start); /* MJD */
6077
6078                 *flagp |= POSTPONED;
6079                 nextchar(pRExC_state);
6080                 return ret;
6081             } /* named and numeric backreferences */
6082             /* NOT REACHED */
6083
6084             case '?':           /* (??...) */
6085                 is_logical = 1;
6086                 if (*RExC_parse != '{') {
6087                     RExC_parse++;
6088                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6089                     /*NOTREACHED*/
6090                 }
6091                 *flagp |= POSTPONED;
6092                 paren = *RExC_parse++;
6093                 /* FALL THROUGH */
6094             case '{':           /* (?{...}) */
6095             {
6096                 I32 count = 1;
6097                 U32 n = 0;
6098                 char c;
6099                 char *s = RExC_parse;
6100
6101                 RExC_seen_zerolen++;
6102                 RExC_seen |= REG_SEEN_EVAL;
6103                 while (count && (c = *RExC_parse)) {
6104                     if (c == '\\') {
6105                         if (RExC_parse[1])
6106                             RExC_parse++;
6107                     }
6108                     else if (c == '{')
6109                         count++;
6110                     else if (c == '}')
6111                         count--;
6112                     RExC_parse++;
6113                 }
6114                 if (*RExC_parse != ')') {
6115                     RExC_parse = s;
6116                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6117                 }
6118                 if (!SIZE_ONLY) {
6119                     PAD *pad;
6120                     OP_4tree *sop, *rop;
6121                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6122
6123                     ENTER;
6124                     Perl_save_re_context(aTHX);
6125                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6126                     sop->op_private |= OPpREFCOUNTED;
6127                     /* re_dup will OpREFCNT_inc */
6128                     OpREFCNT_set(sop, 1);
6129                     LEAVE;
6130
6131                     n = add_data(pRExC_state, 3, "nop");
6132                     RExC_rxi->data->data[n] = (void*)rop;
6133                     RExC_rxi->data->data[n+1] = (void*)sop;
6134                     RExC_rxi->data->data[n+2] = (void*)pad;
6135                     SvREFCNT_dec(sv);
6136                 }
6137                 else {                                          /* First pass */
6138                     if (PL_reginterp_cnt < ++RExC_seen_evals
6139                         && IN_PERL_RUNTIME)
6140                         /* No compiled RE interpolated, has runtime
6141                            components ===> unsafe.  */
6142                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6143                     if (PL_tainting && PL_tainted)
6144                         FAIL("Eval-group in insecure regular expression");
6145 #if PERL_VERSION > 8
6146                     if (IN_PERL_COMPILETIME)
6147                         PL_cv_has_eval = 1;
6148 #endif
6149                 }
6150
6151                 nextchar(pRExC_state);
6152                 if (is_logical) {
6153                     ret = reg_node(pRExC_state, LOGICAL);
6154                     if (!SIZE_ONLY)
6155                         ret->flags = 2;
6156                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6157                     /* deal with the length of this later - MJD */
6158                     return ret;
6159                 }
6160                 ret = reganode(pRExC_state, EVAL, n);
6161                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6162                 Set_Node_Offset(ret, parse_start);
6163                 return ret;
6164             }
6165             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6166             {
6167                 int is_define= 0;
6168                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6169                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6170                         || RExC_parse[1] == '<'
6171                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6172                         I32 flag;
6173
6174                         ret = reg_node(pRExC_state, LOGICAL);
6175                         if (!SIZE_ONLY)
6176                             ret->flags = 1;
6177                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6178                         goto insert_if;
6179                     }
6180                 }
6181                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6182                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6183                 {
6184                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6185                     char *name_start= RExC_parse++;
6186                     U32 num = 0;
6187                     SV *sv_dat=reg_scan_name(pRExC_state,
6188                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6189                     if (RExC_parse == name_start || *RExC_parse != ch)
6190                         vFAIL2("Sequence (?(%c... not terminated",
6191                             (ch == '>' ? '<' : ch));
6192                     RExC_parse++;
6193                     if (!SIZE_ONLY) {
6194                         num = add_data( pRExC_state, 1, "S" );
6195                         RExC_rxi->data->data[num]=(void*)sv_dat;
6196                         SvREFCNT_inc_simple_void(sv_dat);
6197                     }
6198                     ret = reganode(pRExC_state,NGROUPP,num);
6199                     goto insert_if_check_paren;
6200                 }
6201                 else if (RExC_parse[0] == 'D' &&
6202                          RExC_parse[1] == 'E' &&
6203                          RExC_parse[2] == 'F' &&
6204                          RExC_parse[3] == 'I' &&
6205                          RExC_parse[4] == 'N' &&
6206                          RExC_parse[5] == 'E')
6207                 {
6208                     ret = reganode(pRExC_state,DEFINEP,0);
6209                     RExC_parse +=6 ;
6210                     is_define = 1;
6211                     goto insert_if_check_paren;
6212                 }
6213                 else if (RExC_parse[0] == 'R') {
6214                     RExC_parse++;
6215                     parno = 0;
6216                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6217                         parno = atoi(RExC_parse++);
6218                         while (isDIGIT(*RExC_parse))
6219                             RExC_parse++;
6220                     } else if (RExC_parse[0] == '&') {
6221                         SV *sv_dat;
6222                         RExC_parse++;
6223                         sv_dat = reg_scan_name(pRExC_state,
6224                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6225                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6226                     }
6227                     ret = reganode(pRExC_state,INSUBP,parno);
6228                     goto insert_if_check_paren;
6229                 }
6230                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6231                     /* (?(1)...) */
6232                     char c;
6233                     parno = atoi(RExC_parse++);
6234
6235                     while (isDIGIT(*RExC_parse))
6236                         RExC_parse++;
6237                     ret = reganode(pRExC_state, GROUPP, parno);
6238
6239                  insert_if_check_paren:
6240                     if ((c = *nextchar(pRExC_state)) != ')')
6241                         vFAIL("Switch condition not recognized");
6242                   insert_if:
6243                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6244                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6245                     if (br == NULL)
6246                         br = reganode(pRExC_state, LONGJMP, 0);
6247                     else
6248                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6249                     c = *nextchar(pRExC_state);
6250                     if (flags&HASWIDTH)
6251                         *flagp |= HASWIDTH;
6252                     if (c == '|') {
6253                         if (is_define)
6254                             vFAIL("(?(DEFINE)....) does not allow branches");
6255                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6256                         regbranch(pRExC_state, &flags, 1,depth+1);
6257                         REGTAIL(pRExC_state, ret, lastbr);
6258                         if (flags&HASWIDTH)
6259                             *flagp |= HASWIDTH;
6260                         c = *nextchar(pRExC_state);
6261                     }
6262                     else
6263                         lastbr = NULL;
6264                     if (c != ')')
6265                         vFAIL("Switch (?(condition)... contains too many branches");
6266                     ender = reg_node(pRExC_state, TAIL);
6267                     REGTAIL(pRExC_state, br, ender);
6268                     if (lastbr) {
6269                         REGTAIL(pRExC_state, lastbr, ender);
6270                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6271                     }
6272                     else
6273                         REGTAIL(pRExC_state, ret, ender);
6274                     RExC_size++; /* XXX WHY do we need this?!!
6275                                     For large programs it seems to be required
6276                                     but I can't figure out why. -- dmq*/
6277                     return ret;
6278                 }
6279                 else {
6280                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6281                 }
6282             }
6283             case 0:
6284                 RExC_parse--; /* for vFAIL to print correctly */
6285                 vFAIL("Sequence (? incomplete");
6286                 break;
6287             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6288                                        that follow */
6289                 has_use_defaults = TRUE;
6290                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6291                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6292                                        unicode semantics */
6293                     RExC_flags |= RXf_PMf_UNICODE;
6294                 }
6295                 goto parse_flags;
6296             default:
6297                 --RExC_parse;
6298                 parse_flags:      /* (?i) */
6299             {
6300                 U32 posflags = 0, negflags = 0;
6301                 U32 *flagsp = &posflags;
6302                 bool has_charset_modifier = 0;
6303
6304                 while (*RExC_parse) {
6305                     /* && strchr("iogcmsx", *RExC_parse) */
6306                     /* (?g), (?gc) and (?o) are useless here
6307                        and must be globally applied -- japhy */
6308                     switch (*RExC_parse) {
6309                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6310                     case LOCALE_PAT_MOD:
6311                         if (has_charset_modifier || flagsp == &negflags) {
6312                             goto fail_modifiers;
6313                         }
6314                         posflags |= RXf_PMf_LOCALE;
6315                         negflags |= RXf_PMf_UNICODE;
6316                         has_charset_modifier = 1;
6317                         break;
6318                     case UNICODE_PAT_MOD:
6319                         if (has_charset_modifier || flagsp == &negflags) {
6320                             goto fail_modifiers;
6321                         }
6322                         posflags |= RXf_PMf_UNICODE;
6323                         negflags |= RXf_PMf_LOCALE;
6324                         has_charset_modifier = 1;
6325                         break;
6326                     case DUAL_PAT_MOD:
6327                         if (has_use_defaults
6328                             || has_charset_modifier
6329                             || flagsp == &negflags)
6330                         {
6331                             goto fail_modifiers;
6332                         }
6333
6334                         /* The dual charset means unicode semantics if the
6335                          * pattern (or target, not known until runtime) are
6336                          * utf8 */
6337                         if (RExC_utf8) {
6338                             posflags |= RXf_PMf_UNICODE;
6339                             negflags |= RXf_PMf_LOCALE;
6340                         }
6341                         else {
6342                             negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6343                         }
6344                         has_charset_modifier = 1;
6345                         break;
6346                     case ONCE_PAT_MOD: /* 'o' */
6347                     case GLOBAL_PAT_MOD: /* 'g' */
6348                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6349                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6350                             if (! (wastedflags & wflagbit) ) {
6351                                 wastedflags |= wflagbit;
6352                                 vWARN5(
6353                                     RExC_parse + 1,
6354                                     "Useless (%s%c) - %suse /%c modifier",
6355                                     flagsp == &negflags ? "?-" : "?",
6356                                     *RExC_parse,
6357                                     flagsp == &negflags ? "don't " : "",
6358                                     *RExC_parse
6359                                 );
6360                             }
6361                         }
6362                         break;
6363
6364                     case CONTINUE_PAT_MOD: /* 'c' */
6365                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6366                             if (! (wastedflags & WASTED_C) ) {
6367                                 wastedflags |= WASTED_GC;
6368                                 vWARN3(
6369                                     RExC_parse + 1,
6370                                     "Useless (%sc) - %suse /gc modifier",
6371                                     flagsp == &negflags ? "?-" : "?",
6372                                     flagsp == &negflags ? "don't " : ""
6373                                 );
6374                             }
6375                         }
6376                         break;
6377                     case KEEPCOPY_PAT_MOD: /* 'p' */
6378                         if (flagsp == &negflags) {
6379                             if (SIZE_ONLY)
6380                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6381                         } else {
6382                             *flagsp |= RXf_PMf_KEEPCOPY;
6383                         }
6384                         break;
6385                     case '-':
6386                         /* A flag is a default iff it is following a minus, so
6387                          * if there is a minus, it means will be trying to
6388                          * re-specify a default which is an error */
6389                         if (has_use_defaults || flagsp == &negflags) {
6390             fail_modifiers:
6391                             RExC_parse++;
6392                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6393                             /*NOTREACHED*/
6394                         }
6395                         flagsp = &negflags;
6396                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6397                         break;
6398                     case ':':
6399                         paren = ':';
6400                         /*FALLTHROUGH*/
6401                     case ')':
6402                         RExC_flags |= posflags;
6403                         RExC_flags &= ~negflags;
6404                         if (paren != ':') {
6405                             oregflags |= posflags;
6406                             oregflags &= ~negflags;
6407                         }
6408                         nextchar(pRExC_state);
6409                         if (paren != ':') {
6410                             *flagp = TRYAGAIN;
6411                             return NULL;
6412                         } else {
6413                             ret = NULL;
6414                             goto parse_rest;
6415                         }
6416                         /*NOTREACHED*/
6417                     default:
6418                         RExC_parse++;
6419                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6420                         /*NOTREACHED*/
6421                     }
6422                     ++RExC_parse;
6423                 }
6424             }} /* one for the default block, one for the switch */
6425         }
6426         else {                  /* (...) */
6427           capturing_parens:
6428             parno = RExC_npar;
6429             RExC_npar++;
6430
6431             ret = reganode(pRExC_state, OPEN, parno);
6432             if (!SIZE_ONLY ){
6433                 if (!RExC_nestroot)
6434                     RExC_nestroot = parno;
6435                 if (RExC_seen & REG_SEEN_RECURSE
6436                     && !RExC_open_parens[parno-1])
6437                 {
6438                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6439                         "Setting open paren #%"IVdf" to %d\n",
6440                         (IV)parno, REG_NODE_NUM(ret)));
6441                     RExC_open_parens[parno-1]= ret;
6442                 }
6443             }
6444             Set_Node_Length(ret, 1); /* MJD */
6445             Set_Node_Offset(ret, RExC_parse); /* MJD */
6446             is_open = 1;
6447         }
6448     }
6449     else                        /* ! paren */
6450         ret = NULL;
6451
6452    parse_rest:
6453     /* Pick up the branches, linking them together. */
6454     parse_start = RExC_parse;   /* MJD */
6455     br = regbranch(pRExC_state, &flags, 1,depth+1);
6456
6457     if (freeze_paren) {
6458         if (RExC_npar > after_freeze)
6459             after_freeze = RExC_npar;
6460         RExC_npar = freeze_paren;
6461     }
6462
6463     /*     branch_len = (paren != 0); */
6464
6465     if (br == NULL)
6466         return(NULL);
6467     if (*RExC_parse == '|') {
6468         if (!SIZE_ONLY && RExC_extralen) {
6469             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6470         }
6471         else {                  /* MJD */
6472             reginsert(pRExC_state, BRANCH, br, depth+1);
6473             Set_Node_Length(br, paren != 0);
6474             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6475         }
6476         have_branch = 1;
6477         if (SIZE_ONLY)
6478             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6479     }
6480     else if (paren == ':') {
6481         *flagp |= flags&SIMPLE;
6482     }
6483     if (is_open) {                              /* Starts with OPEN. */
6484         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6485     }
6486     else if (paren != '?')              /* Not Conditional */
6487         ret = br;
6488     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6489     lastbr = br;
6490     while (*RExC_parse == '|') {
6491         if (!SIZE_ONLY && RExC_extralen) {
6492             ender = reganode(pRExC_state, LONGJMP,0);
6493             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6494         }
6495         if (SIZE_ONLY)
6496             RExC_extralen += 2;         /* Account for LONGJMP. */
6497         nextchar(pRExC_state);
6498         if (freeze_paren) {
6499             if (RExC_npar > after_freeze)
6500                 after_freeze = RExC_npar;
6501             RExC_npar = freeze_paren;
6502         }
6503         br = regbranch(pRExC_state, &flags, 0, depth+1);
6504
6505         if (br == NULL)
6506             return(NULL);
6507         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6508         lastbr = br;
6509         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6510     }
6511
6512     if (have_branch || paren != ':') {
6513         /* Make a closing node, and hook it on the end. */
6514         switch (paren) {
6515         case ':':
6516             ender = reg_node(pRExC_state, TAIL);
6517             break;
6518         case 1:
6519             ender = reganode(pRExC_state, CLOSE, parno);
6520             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6521                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6522                         "Setting close paren #%"IVdf" to %d\n",
6523                         (IV)parno, REG_NODE_NUM(ender)));
6524                 RExC_close_parens[parno-1]= ender;
6525                 if (RExC_nestroot == parno)
6526                     RExC_nestroot = 0;
6527             }
6528             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6529             Set_Node_Length(ender,1); /* MJD */
6530             break;
6531         case '<':
6532         case ',':
6533         case '=':
6534         case '!':
6535             *flagp &= ~HASWIDTH;
6536             /* FALL THROUGH */
6537         case '>':
6538             ender = reg_node(pRExC_state, SUCCEED);
6539             break;
6540         case 0:
6541             ender = reg_node(pRExC_state, END);
6542             if (!SIZE_ONLY) {
6543                 assert(!RExC_opend); /* there can only be one! */
6544                 RExC_opend = ender;
6545             }
6546             break;
6547         }
6548         REGTAIL(pRExC_state, lastbr, ender);
6549
6550         if (have_branch && !SIZE_ONLY) {
6551             if (depth==1)
6552                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6553
6554             /* Hook the tails of the branches to the closing node. */
6555             for (br = ret; br; br = regnext(br)) {
6556                 const U8 op = PL_regkind[OP(br)];
6557                 if (op == BRANCH) {
6558                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6559                 }
6560                 else if (op == BRANCHJ) {
6561                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6562                 }
6563             }
6564         }
6565     }
6566
6567     {
6568         const char *p;
6569         static const char parens[] = "=!<,>";
6570
6571         if (paren && (p = strchr(parens, paren))) {
6572             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6573             int flag = (p - parens) > 1;
6574
6575             if (paren == '>')
6576                 node = SUSPEND, flag = 0;
6577             reginsert(pRExC_state, node,ret, depth+1);
6578             Set_Node_Cur_Length(ret);
6579             Set_Node_Offset(ret, parse_start + 1);
6580             ret->flags = flag;
6581             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6582         }
6583     }
6584
6585     /* Check for proper termination. */
6586     if (paren) {
6587         RExC_flags = oregflags;
6588         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6589             RExC_parse = oregcomp_parse;
6590             vFAIL("Unmatched (");
6591         }
6592     }
6593     else if (!paren && RExC_parse < RExC_end) {
6594         if (*RExC_parse == ')') {
6595             RExC_parse++;
6596             vFAIL("Unmatched )");
6597         }
6598         else
6599             FAIL("Junk on end of regexp");      /* "Can't happen". */
6600         /* NOTREACHED */
6601     }
6602     if (after_freeze)
6603         RExC_npar = after_freeze;
6604     return(ret);
6605 }
6606
6607 /*
6608  - regbranch - one alternative of an | operator
6609  *
6610  * Implements the concatenation operator.
6611  */
6612 STATIC regnode *
6613 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6614 {
6615     dVAR;
6616     register regnode *ret;
6617     register regnode *chain = NULL;
6618     register regnode *latest;
6619     I32 flags = 0, c = 0;
6620     GET_RE_DEBUG_FLAGS_DECL;
6621
6622     PERL_ARGS_ASSERT_REGBRANCH;
6623
6624     DEBUG_PARSE("brnc");
6625
6626     if (first)
6627         ret = NULL;
6628     else {
6629         if (!SIZE_ONLY && RExC_extralen)
6630             ret = reganode(pRExC_state, BRANCHJ,0);
6631         else {
6632             ret = reg_node(pRExC_state, BRANCH);
6633             Set_Node_Length(ret, 1);
6634         }
6635     }
6636
6637     if (!first && SIZE_ONLY)
6638         RExC_extralen += 1;                     /* BRANCHJ */
6639
6640     *flagp = WORST;                     /* Tentatively. */
6641
6642     RExC_parse--;
6643     nextchar(pRExC_state);
6644     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6645         flags &= ~TRYAGAIN;
6646         latest = regpiece(pRExC_state, &flags,depth+1);
6647         if (latest == NULL) {
6648             if (flags & TRYAGAIN)
6649                 continue;
6650             return(NULL);
6651         }
6652         else if (ret == NULL)
6653             ret = latest;
6654         *flagp |= flags&(HASWIDTH|POSTPONED);
6655         if (chain == NULL)      /* First piece. */
6656             *flagp |= flags&SPSTART;
6657         else {
6658             RExC_naughty++;
6659             REGTAIL(pRExC_state, chain, latest);
6660         }
6661         chain = latest;
6662         c++;
6663     }
6664     if (chain == NULL) {        /* Loop ran zero times. */
6665         chain = reg_node(pRExC_state, NOTHING);
6666         if (ret == NULL)
6667             ret = chain;
6668     }
6669     if (c == 1) {
6670         *flagp |= flags&SIMPLE;
6671     }
6672
6673     return ret;
6674 }
6675
6676 /*
6677  - regpiece - something followed by possible [*+?]
6678  *
6679  * Note that the branching code sequences used for ? and the general cases
6680  * of * and + are somewhat optimized:  they use the same NOTHING node as
6681  * both the endmarker for their branch list and the body of the last branch.
6682  * It might seem that this node could be dispensed with entirely, but the
6683  * endmarker role is not redundant.
6684  */
6685 STATIC regnode *
6686 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6687 {
6688     dVAR;
6689     register regnode *ret;
6690     register char op;
6691     register char *next;
6692     I32 flags;
6693     const char * const origparse = RExC_parse;
6694     I32 min;
6695     I32 max = REG_INFTY;
6696     char *parse_start;
6697     const char *maxpos = NULL;
6698     GET_RE_DEBUG_FLAGS_DECL;
6699
6700     PERL_ARGS_ASSERT_REGPIECE;
6701
6702     DEBUG_PARSE("piec");
6703
6704     ret = regatom(pRExC_state, &flags,depth+1);
6705     if (ret == NULL) {
6706         if (flags & TRYAGAIN)
6707             *flagp |= TRYAGAIN;
6708         return(NULL);
6709     }
6710
6711     op = *RExC_parse;
6712
6713     if (op == '{' && regcurly(RExC_parse)) {
6714         maxpos = NULL;
6715         parse_start = RExC_parse; /* MJD */
6716         next = RExC_parse + 1;
6717         while (isDIGIT(*next) || *next == ',') {
6718             if (*next == ',') {
6719                 if (maxpos)
6720                     break;
6721                 else
6722                     maxpos = next;
6723             }
6724             next++;
6725         }
6726         if (*next == '}') {             /* got one */
6727             if (!maxpos)
6728                 maxpos = next;
6729             RExC_parse++;
6730             min = atoi(RExC_parse);
6731             if (*maxpos == ',')
6732                 maxpos++;
6733             else
6734                 maxpos = RExC_parse;
6735             max = atoi(maxpos);
6736             if (!max && *maxpos != '0')
6737                 max = REG_INFTY;                /* meaning "infinity" */
6738             else if (max >= REG_INFTY)
6739                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6740             RExC_parse = next;
6741             nextchar(pRExC_state);
6742
6743         do_curly:
6744             if ((flags&SIMPLE)) {
6745                 RExC_naughty += 2 + RExC_naughty / 2;
6746                 reginsert(pRExC_state, CURLY, ret, depth+1);
6747                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6748                 Set_Node_Cur_Length(ret);
6749             }
6750             else {
6751                 regnode * const w = reg_node(pRExC_state, WHILEM);
6752
6753                 w->flags = 0;
6754                 REGTAIL(pRExC_state, ret, w);
6755                 if (!SIZE_ONLY && RExC_extralen) {
6756                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6757                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6758                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6759                 }
6760                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6761                                 /* MJD hk */
6762                 Set_Node_Offset(ret, parse_start+1);
6763                 Set_Node_Length(ret,
6764                                 op == '{' ? (RExC_parse - parse_start) : 1);
6765
6766                 if (!SIZE_ONLY && RExC_extralen)
6767                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6768                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6769                 if (SIZE_ONLY)
6770                     RExC_whilem_seen++, RExC_extralen += 3;
6771                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6772             }
6773             ret->flags = 0;
6774
6775             if (min > 0)
6776                 *flagp = WORST;
6777             if (max > 0)
6778                 *flagp |= HASWIDTH;
6779             if (max < min)
6780                 vFAIL("Can't do {n,m} with n > m");
6781             if (!SIZE_ONLY) {
6782                 ARG1_SET(ret, (U16)min);
6783                 ARG2_SET(ret, (U16)max);
6784             }
6785
6786             goto nest_check;
6787         }
6788     }
6789
6790     if (!ISMULT1(op)) {
6791         *flagp = flags;
6792         return(ret);
6793     }
6794
6795 #if 0                           /* Now runtime fix should be reliable. */
6796
6797     /* if this is reinstated, don't forget to put this back into perldiag:
6798
6799             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6800
6801            (F) The part of the regexp subject to either the * or + quantifier
6802            could match an empty string. The {#} shows in the regular
6803            expression about where the problem was discovered.
6804
6805     */
6806
6807     if (!(flags&HASWIDTH) && op != '?')
6808       vFAIL("Regexp *+ operand could be empty");
6809 #endif
6810
6811     parse_start = RExC_parse;
6812     nextchar(pRExC_state);
6813
6814     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6815
6816     if (op == '*' && (flags&SIMPLE)) {
6817         reginsert(pRExC_state, STAR, ret, depth+1);
6818         ret->flags = 0;
6819         RExC_naughty += 4;
6820     }
6821     else if (op == '*') {
6822         min = 0;
6823         goto do_curly;
6824     }
6825     else if (op == '+' && (flags&SIMPLE)) {
6826         reginsert(pRExC_state, PLUS, ret, depth+1);
6827         ret->flags = 0;
6828         RExC_naughty += 3;
6829     }
6830     else if (op == '+') {
6831         min = 1;
6832         goto do_curly;
6833     }
6834     else if (op == '?') {
6835         min = 0; max = 1;
6836         goto do_curly;
6837     }
6838   nest_check:
6839     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6840         ckWARN3reg(RExC_parse,
6841                    "%.*s matches null string many times",
6842                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6843                    origparse);
6844     }
6845
6846     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6847         nextchar(pRExC_state);
6848         reginsert(pRExC_state, MINMOD, ret, depth+1);
6849         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6850     }
6851 #ifndef REG_ALLOW_MINMOD_SUSPEND
6852     else
6853 #endif
6854     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6855         regnode *ender;
6856         nextchar(pRExC_state);
6857         ender = reg_node(pRExC_state, SUCCEED);
6858         REGTAIL(pRExC_state, ret, ender);
6859         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6860         ret->flags = 0;
6861         ender = reg_node(pRExC_state, TAIL);
6862         REGTAIL(pRExC_state, ret, ender);
6863         /*ret= ender;*/
6864     }
6865
6866     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6867         RExC_parse++;
6868         vFAIL("Nested quantifiers");
6869     }
6870
6871     return(ret);
6872 }
6873
6874
6875 /* reg_namedseq(pRExC_state,UVp)
6876
6877    This is expected to be called by a parser routine that has
6878    recognized '\N' and needs to handle the rest. RExC_parse is
6879    expected to point at the first char following the N at the time
6880    of the call.
6881
6882    The \N may be inside (indicated by valuep not being NULL) or outside a
6883    character class.
6884
6885    \N may begin either a named sequence, or if outside a character class, mean
6886    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6887    attempted to decide which, and in the case of a named sequence converted it
6888    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6889    where c1... are the characters in the sequence.  For single-quoted regexes,
6890    the tokenizer passes the \N sequence through unchanged; this code will not
6891    attempt to determine this nor expand those.  The net effect is that if the
6892    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6893    signals that this \N occurrence means to match a non-newline.
6894
6895    Only the \N{U+...} form should occur in a character class, for the same
6896    reason that '.' inside a character class means to just match a period: it
6897    just doesn't make sense.
6898
6899    If valuep is non-null then it is assumed that we are parsing inside
6900    of a charclass definition and the first codepoint in the resolved
6901    string is returned via *valuep and the routine will return NULL.
6902    In this mode if a multichar string is returned from the charnames
6903    handler, a warning will be issued, and only the first char in the
6904    sequence will be examined. If the string returned is zero length
6905    then the value of *valuep is undefined and NON-NULL will
6906    be returned to indicate failure. (This will NOT be a valid pointer
6907    to a regnode.)
6908
6909    If valuep is null then it is assumed that we are parsing normal text and a
6910    new EXACT node is inserted into the program containing the resolved string,
6911    and a pointer to the new node is returned.  But if the string is zero length
6912    a NOTHING node is emitted instead.
6913
6914    On success RExC_parse is set to the char following the endbrace.
6915    Parsing failures will generate a fatal error via vFAIL(...)
6916  */
6917 STATIC regnode *
6918 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6919 {
6920     char * endbrace;    /* '}' following the name */
6921     regnode *ret = NULL;
6922 #ifdef DEBUGGING
6923     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6924 #endif
6925     char* p;
6926
6927     GET_RE_DEBUG_FLAGS_DECL;
6928
6929     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6930
6931     GET_RE_DEBUG_FLAGS;
6932
6933     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6934      * modifier.  The other meaning does not */
6935     p = (RExC_flags & RXf_PMf_EXTENDED)
6936         ? regwhite( pRExC_state, RExC_parse )
6937         : RExC_parse;
6938
6939     /* Disambiguate between \N meaning a named character versus \N meaning
6940      * [^\n].  The former is assumed when it can't be the latter. */
6941     if (*p != '{' || regcurly(p)) {
6942         RExC_parse = p;
6943         if (valuep) {
6944             /* no bare \N in a charclass */
6945             vFAIL("\\N in a character class must be a named character: \\N{...}");
6946         }
6947         nextchar(pRExC_state);
6948         ret = reg_node(pRExC_state, REG_ANY);
6949         *flagp |= HASWIDTH|SIMPLE;
6950         RExC_naughty++;
6951         RExC_parse--;
6952         Set_Node_Length(ret, 1); /* MJD */
6953         return ret;
6954     }
6955
6956     /* Here, we have decided it should be a named sequence */
6957
6958     /* The test above made sure that the next real character is a '{', but
6959      * under the /x modifier, it could be separated by space (or a comment and
6960      * \n) and this is not allowed (for consistency with \x{...} and the
6961      * tokenizer handling of \N{NAME}). */
6962     if (*RExC_parse != '{') {
6963         vFAIL("Missing braces on \\N{}");
6964     }
6965
6966     RExC_parse++;       /* Skip past the '{' */
6967
6968     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6969         || ! (endbrace == RExC_parse            /* nothing between the {} */
6970               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6971                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6972     {
6973         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6974         vFAIL("\\N{NAME} must be resolved by the lexer");
6975     }
6976
6977     if (endbrace == RExC_parse) {   /* empty: \N{} */
6978         if (! valuep) {
6979             RExC_parse = endbrace + 1;
6980             return reg_node(pRExC_state,NOTHING);
6981         }
6982
6983         if (SIZE_ONLY) {
6984             ckWARNreg(RExC_parse,
6985                     "Ignoring zero length \\N{} in character class"
6986             );
6987             RExC_parse = endbrace + 1;
6988         }
6989         *valuep = 0;
6990         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6991     }
6992
6993     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6994     RExC_parse += 2;    /* Skip past the 'U+' */
6995
6996     if (valuep) {   /* In a bracketed char class */
6997         /* We only pay attention to the first char of
6998         multichar strings being returned. I kinda wonder
6999         if this makes sense as it does change the behaviour
7000         from earlier versions, OTOH that behaviour was broken
7001         as well. XXX Solution is to recharacterize as
7002         [rest-of-class]|multi1|multi2... */
7003
7004         STRLEN length_of_hex;
7005         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7006             | PERL_SCAN_DISALLOW_PREFIX
7007             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7008
7009         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7010         if (endchar < endbrace) {
7011             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7012         }
7013
7014         length_of_hex = (STRLEN)(endchar - RExC_parse);
7015         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7016
7017         /* The tokenizer should have guaranteed validity, but it's possible to
7018          * bypass it by using single quoting, so check */
7019         if (length_of_hex == 0
7020             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7021         {
7022             RExC_parse += length_of_hex;        /* Includes all the valid */
7023             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7024                             ? UTF8SKIP(RExC_parse)
7025                             : 1;
7026             /* Guard against malformed utf8 */
7027             if (RExC_parse >= endchar) RExC_parse = endchar;
7028             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7029         }
7030
7031         RExC_parse = endbrace + 1;
7032         if (endchar == endbrace) return NULL;
7033
7034         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7035     }
7036     else {      /* Not a char class */
7037         char *s;            /* String to put in generated EXACT node */
7038         STRLEN len = 0;     /* Its current byte length */
7039         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7040                                stream */
7041
7042         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7043                                                    : (LOC)
7044                                                       ? EXACTFL
7045                                                       : UNI_SEMANTICS
7046                                                         ? EXACTFU
7047                                                         : EXACTF));
7048         s= STRING(ret);
7049
7050         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7051          * the input which is of the form now 'c1.c2.c3...}' until find the
7052          * ending brace or exceed length 255.  The characters that exceed this
7053          * limit are dropped.  The limit could be relaxed should it become
7054          * desirable by reparsing this as (?:\N{NAME}), so could generate
7055          * multiple EXACT nodes, as is done for just regular input.  But this
7056          * is primarily a named character, and not intended to be a huge long
7057          * string, so 255 bytes should be good enough */
7058         while (1) {
7059             STRLEN length_of_hex;
7060             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7061                             | PERL_SCAN_DISALLOW_PREFIX
7062                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7063             UV cp;  /* Ord of current character */
7064
7065             /* Code points are separated by dots.  If none, there is only one
7066              * code point, and is terminated by the brace */
7067             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7068
7069             /* The values are Unicode even on EBCDIC machines */
7070             length_of_hex = (STRLEN)(endchar - RExC_parse);
7071             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7072             if ( length_of_hex == 0
7073                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7074             {
7075                 RExC_parse += length_of_hex;        /* Includes all the valid */
7076                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7077                                 ? UTF8SKIP(RExC_parse)
7078                                 : 1;
7079                 /* Guard against malformed utf8 */
7080                 if (RExC_parse >= endchar) RExC_parse = endchar;
7081                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7082             }
7083
7084             if (! FOLD) {       /* Not folding, just append to the string */
7085                 STRLEN unilen;
7086
7087                 /* Quit before adding this character if would exceed limit */
7088                 if (len + UNISKIP(cp) > U8_MAX) break;
7089
7090                 unilen = reguni(pRExC_state, cp, s);
7091                 if (unilen > 0) {
7092                     s   += unilen;
7093                     len += unilen;
7094                 }
7095             } else {    /* Folding, output the folded equivalent */
7096                 STRLEN foldlen,numlen;
7097                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7098                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7099
7100                 /* Quit before exceeding size limit */
7101                 if (len + foldlen > U8_MAX) break;
7102
7103                 for (foldbuf = tmpbuf;
7104                     foldlen;
7105                     foldlen -= numlen)
7106                 {
7107                     cp = utf8_to_uvchr(foldbuf, &numlen);
7108                     if (numlen > 0) {
7109                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7110                         s       += unilen;
7111                         len     += unilen;
7112                         /* In EBCDIC the numlen and unilen can differ. */
7113                         foldbuf += numlen;
7114                         if (numlen >= foldlen)
7115                             break;
7116                     }
7117                     else
7118                         break; /* "Can't happen." */
7119                 }
7120             }
7121
7122             /* Point to the beginning of the next character in the sequence. */
7123             RExC_parse = endchar + 1;
7124
7125             /* Quit if no more characters */
7126             if (RExC_parse >= endbrace) break;
7127         }
7128
7129
7130         if (SIZE_ONLY) {
7131             if (RExC_parse < endbrace) {
7132                 ckWARNreg(RExC_parse - 1,
7133                           "Using just the first characters returned by \\N{}");
7134             }
7135
7136             RExC_size += STR_SZ(len);
7137         } else {
7138             STR_LEN(ret) = len;
7139             RExC_emit += STR_SZ(len);
7140         }
7141
7142         RExC_parse = endbrace + 1;
7143
7144         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7145                                with malformed in t/re/pat_advanced.t */
7146         RExC_parse --;
7147         Set_Node_Cur_Length(ret); /* MJD */
7148         nextchar(pRExC_state);
7149     }
7150
7151     return ret;
7152 }
7153
7154
7155 /*
7156  * reg_recode
7157  *
7158  * It returns the code point in utf8 for the value in *encp.
7159  *    value: a code value in the source encoding
7160  *    encp:  a pointer to an Encode object
7161  *
7162  * If the result from Encode is not a single character,
7163  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7164  */
7165 STATIC UV
7166 S_reg_recode(pTHX_ const char value, SV **encp)
7167 {
7168     STRLEN numlen = 1;
7169     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7170     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7171     const STRLEN newlen = SvCUR(sv);
7172     UV uv = UNICODE_REPLACEMENT;
7173
7174     PERL_ARGS_ASSERT_REG_RECODE;
7175
7176     if (newlen)
7177         uv = SvUTF8(sv)
7178              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7179              : *(U8*)s;
7180
7181     if (!newlen || numlen != newlen) {
7182         uv = UNICODE_REPLACEMENT;
7183         *encp = NULL;
7184     }
7185     return uv;
7186 }
7187
7188
7189 /*
7190  - regatom - the lowest level
7191
7192    Try to identify anything special at the start of the pattern. If there
7193    is, then handle it as required. This may involve generating a single regop,
7194    such as for an assertion; or it may involve recursing, such as to
7195    handle a () structure.
7196
7197    If the string doesn't start with something special then we gobble up
7198    as much literal text as we can.
7199
7200    Once we have been able to handle whatever type of thing started the
7201    sequence, we return.
7202
7203    Note: we have to be careful with escapes, as they can be both literal
7204    and special, and in the case of \10 and friends can either, depending
7205    on context. Specifically there are two seperate switches for handling
7206    escape sequences, with the one for handling literal escapes requiring
7207    a dummy entry for all of the special escapes that are actually handled
7208    by the other.
7209 */
7210
7211 STATIC regnode *
7212 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7213 {
7214     dVAR;
7215     register regnode *ret = NULL;
7216     I32 flags;
7217     char *parse_start = RExC_parse;
7218     GET_RE_DEBUG_FLAGS_DECL;
7219     DEBUG_PARSE("atom");
7220     *flagp = WORST;             /* Tentatively. */
7221
7222     PERL_ARGS_ASSERT_REGATOM;
7223
7224 tryagain:
7225     switch ((U8)*RExC_parse) {
7226     case '^':
7227         RExC_seen_zerolen++;
7228         nextchar(pRExC_state);
7229         if (RExC_flags & RXf_PMf_MULTILINE)
7230             ret = reg_node(pRExC_state, MBOL);
7231         else if (RExC_flags & RXf_PMf_SINGLELINE)
7232             ret = reg_node(pRExC_state, SBOL);
7233         else
7234             ret = reg_node(pRExC_state, BOL);
7235         Set_Node_Length(ret, 1); /* MJD */
7236         break;
7237     case '$':
7238         nextchar(pRExC_state);
7239         if (*RExC_parse)
7240             RExC_seen_zerolen++;
7241         if (RExC_flags & RXf_PMf_MULTILINE)
7242             ret = reg_node(pRExC_state, MEOL);
7243         else if (RExC_flags & RXf_PMf_SINGLELINE)
7244             ret = reg_node(pRExC_state, SEOL);
7245         else
7246             ret = reg_node(pRExC_state, EOL);
7247         Set_Node_Length(ret, 1); /* MJD */
7248         break;
7249     case '.':
7250         nextchar(pRExC_state);
7251         if (RExC_flags & RXf_PMf_SINGLELINE)
7252             ret = reg_node(pRExC_state, SANY);
7253         else
7254             ret = reg_node(pRExC_state, REG_ANY);
7255         *flagp |= HASWIDTH|SIMPLE;
7256         RExC_naughty++;
7257         Set_Node_Length(ret, 1); /* MJD */
7258         break;
7259     case '[':
7260     {
7261         char * const oregcomp_parse = ++RExC_parse;
7262         ret = regclass(pRExC_state,depth+1);
7263         if (*RExC_parse != ']') {
7264             RExC_parse = oregcomp_parse;
7265             vFAIL("Unmatched [");
7266         }
7267         nextchar(pRExC_state);
7268         *flagp |= HASWIDTH|SIMPLE;
7269         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7270         break;
7271     }
7272     case '(':
7273         nextchar(pRExC_state);
7274         ret = reg(pRExC_state, 1, &flags,depth+1);
7275         if (ret == NULL) {
7276                 if (flags & TRYAGAIN) {
7277                     if (RExC_parse == RExC_end) {
7278                          /* Make parent create an empty node if needed. */
7279                         *flagp |= TRYAGAIN;
7280                         return(NULL);
7281                     }
7282                     goto tryagain;
7283                 }
7284                 return(NULL);
7285         }
7286         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7287         break;
7288     case '|':
7289     case ')':
7290         if (flags & TRYAGAIN) {
7291             *flagp |= TRYAGAIN;
7292             return NULL;
7293         }
7294         vFAIL("Internal urp");
7295                                 /* Supposed to be caught earlier. */
7296         break;
7297     case '{':
7298         if (!regcurly(RExC_parse)) {
7299             RExC_parse++;
7300             goto defchar;
7301         }
7302         /* FALL THROUGH */
7303     case '?':
7304     case '+':
7305     case '*':
7306         RExC_parse++;
7307         vFAIL("Quantifier follows nothing");
7308         break;
7309     case LATIN_SMALL_LETTER_SHARP_S:
7310     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7311     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7312 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7313 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7314     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7315 #endif
7316         do_foldchar:
7317         if (!LOC && FOLD) {
7318             U32 len,cp;
7319             len=0; /* silence a spurious compiler warning */
7320             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7321                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7322                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7323                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7324                 Set_Node_Length(ret, 1); /* MJD */
7325                 nextchar(pRExC_state); /* kill whitespace under /x */
7326                 return ret;
7327             }
7328         }
7329         goto outer_default;
7330     case '\\':
7331         /* Special Escapes
7332
7333            This switch handles escape sequences that resolve to some kind
7334            of special regop and not to literal text. Escape sequnces that
7335            resolve to literal text are handled below in the switch marked
7336            "Literal Escapes".
7337
7338            Every entry in this switch *must* have a corresponding entry
7339            in the literal escape switch. However, the opposite is not
7340            required, as the default for this switch is to jump to the
7341            literal text handling code.
7342         */
7343         switch ((U8)*++RExC_parse) {
7344         case LATIN_SMALL_LETTER_SHARP_S:
7345         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7346         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7347                    goto do_foldchar;
7348         /* Special Escapes */
7349         case 'A':
7350             RExC_seen_zerolen++;
7351             ret = reg_node(pRExC_state, SBOL);
7352             *flagp |= SIMPLE;
7353             goto finish_meta_pat;
7354         case 'G':
7355             ret = reg_node(pRExC_state, GPOS);
7356             RExC_seen |= REG_SEEN_GPOS;
7357             *flagp |= SIMPLE;
7358             goto finish_meta_pat;
7359         case 'K':
7360             RExC_seen_zerolen++;
7361             ret = reg_node(pRExC_state, KEEPS);
7362             *flagp |= SIMPLE;
7363             /* XXX:dmq : disabling in-place substitution seems to
7364              * be necessary here to avoid cases of memory corruption, as
7365              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7366              */
7367             RExC_seen |= REG_SEEN_LOOKBEHIND;
7368             goto finish_meta_pat;
7369         case 'Z':
7370             ret = reg_node(pRExC_state, SEOL);
7371             *flagp |= SIMPLE;
7372             RExC_seen_zerolen++;                /* Do not optimize RE away */
7373             goto finish_meta_pat;
7374         case 'z':
7375             ret = reg_node(pRExC_state, EOS);
7376             *flagp |= SIMPLE;
7377             RExC_seen_zerolen++;                /* Do not optimize RE away */
7378             goto finish_meta_pat;
7379         case 'C':
7380             ret = reg_node(pRExC_state, CANY);
7381             RExC_seen |= REG_SEEN_CANY;
7382             *flagp |= HASWIDTH|SIMPLE;
7383             goto finish_meta_pat;
7384         case 'X':
7385             ret = reg_node(pRExC_state, CLUMP);
7386             *flagp |= HASWIDTH;
7387             goto finish_meta_pat;
7388         case 'w':
7389             if (LOC) {
7390                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7391             } else {
7392                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7393                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7394             }
7395             *flagp |= HASWIDTH|SIMPLE;
7396             goto finish_meta_pat;
7397         case 'W':
7398             if (LOC) {
7399                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7400             } else {
7401                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7402                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7403             }
7404             *flagp |= HASWIDTH|SIMPLE;
7405             goto finish_meta_pat;
7406         case 'b':
7407             RExC_seen_zerolen++;
7408             RExC_seen |= REG_SEEN_LOOKBEHIND;
7409             if (LOC) {
7410                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7411             } else {
7412                 ret = reg_node(pRExC_state, (U8)(BOUND));
7413                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7414             }
7415             *flagp |= SIMPLE;
7416             goto finish_meta_pat;
7417         case 'B':
7418             RExC_seen_zerolen++;
7419             RExC_seen |= REG_SEEN_LOOKBEHIND;
7420             if (LOC) {
7421                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7422             } else {
7423                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7424                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7425             }
7426             *flagp |= SIMPLE;
7427             goto finish_meta_pat;
7428         case 's':
7429             if (LOC) {
7430                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7431             } else {
7432                 ret = reg_node(pRExC_state, (U8)(SPACE));
7433                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7434             }
7435             *flagp |= HASWIDTH|SIMPLE;
7436             goto finish_meta_pat;
7437         case 'S':
7438             if (LOC) {
7439                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7440             } else {
7441                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7442                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7443             }
7444             *flagp |= HASWIDTH|SIMPLE;
7445             goto finish_meta_pat;
7446         case 'd':
7447             if (LOC) {
7448                 ret = reg_node(pRExC_state, (U8)(DIGITL));
7449             } else {
7450                 ret = reg_node(pRExC_state, (U8)(DIGIT));
7451             }
7452             *flagp |= HASWIDTH|SIMPLE;
7453             goto finish_meta_pat;
7454         case 'D':
7455             if (LOC) {
7456                 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7457             } else {
7458                 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7459             }
7460             *flagp |= HASWIDTH|SIMPLE;
7461             goto finish_meta_pat;
7462         case 'R':
7463             ret = reg_node(pRExC_state, LNBREAK);
7464             *flagp |= HASWIDTH|SIMPLE;
7465             goto finish_meta_pat;
7466         case 'h':
7467             ret = reg_node(pRExC_state, HORIZWS);
7468             *flagp |= HASWIDTH|SIMPLE;
7469             goto finish_meta_pat;
7470         case 'H':
7471             ret = reg_node(pRExC_state, NHORIZWS);
7472             *flagp |= HASWIDTH|SIMPLE;
7473             goto finish_meta_pat;
7474         case 'v':
7475             ret = reg_node(pRExC_state, VERTWS);
7476             *flagp |= HASWIDTH|SIMPLE;
7477             goto finish_meta_pat;
7478         case 'V':
7479             ret = reg_node(pRExC_state, NVERTWS);
7480             *flagp |= HASWIDTH|SIMPLE;
7481          finish_meta_pat:
7482             nextchar(pRExC_state);
7483             Set_Node_Length(ret, 2); /* MJD */
7484             break;
7485         case 'p':
7486         case 'P':
7487             {
7488                 char* const oldregxend = RExC_end;
7489 #ifdef DEBUGGING
7490                 char* parse_start = RExC_parse - 2;
7491 #endif
7492
7493                 if (RExC_parse[1] == '{') {
7494                   /* a lovely hack--pretend we saw [\pX] instead */
7495                     RExC_end = strchr(RExC_parse, '}');
7496                     if (!RExC_end) {
7497                         const U8 c = (U8)*RExC_parse;
7498                         RExC_parse += 2;
7499                         RExC_end = oldregxend;
7500                         vFAIL2("Missing right brace on \\%c{}", c);
7501                     }
7502                     RExC_end++;
7503                 }
7504                 else {
7505                     RExC_end = RExC_parse + 2;
7506                     if (RExC_end > oldregxend)
7507                         RExC_end = oldregxend;
7508                 }
7509                 RExC_parse--;
7510
7511                 ret = regclass(pRExC_state,depth+1);
7512
7513                 RExC_end = oldregxend;
7514                 RExC_parse--;
7515
7516                 Set_Node_Offset(ret, parse_start + 2);
7517                 Set_Node_Cur_Length(ret);
7518                 nextchar(pRExC_state);
7519                 *flagp |= HASWIDTH|SIMPLE;
7520             }
7521             break;
7522         case 'N':
7523             /* Handle \N and \N{NAME} here and not below because it can be
7524             multicharacter. join_exact() will join them up later on.
7525             Also this makes sure that things like /\N{BLAH}+/ and
7526             \N{BLAH} being multi char Just Happen. dmq*/
7527             ++RExC_parse;
7528             ret= reg_namedseq(pRExC_state, NULL, flagp);
7529             break;
7530         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7531         parse_named_seq:
7532         {
7533             char ch= RExC_parse[1];
7534             if (ch != '<' && ch != '\'' && ch != '{') {
7535                 RExC_parse++;
7536                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7537             } else {
7538                 /* this pretty much dupes the code for (?P=...) in reg(), if
7539                    you change this make sure you change that */
7540                 char* name_start = (RExC_parse += 2);
7541                 U32 num = 0;
7542                 SV *sv_dat = reg_scan_name(pRExC_state,
7543                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7544                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7545                 if (RExC_parse == name_start || *RExC_parse != ch)
7546                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7547
7548                 if (!SIZE_ONLY) {
7549                     num = add_data( pRExC_state, 1, "S" );
7550                     RExC_rxi->data->data[num]=(void*)sv_dat;
7551                     SvREFCNT_inc_simple_void(sv_dat);
7552                 }
7553
7554                 RExC_sawback = 1;
7555                 ret = reganode(pRExC_state,
7556                                ((! FOLD)
7557                                  ? NREF
7558                                  : (UNI_SEMANTICS)
7559                                    ? NREFFU
7560                                    : (LOC)
7561                                      ? NREFFL
7562                                      : NREFF),
7563                                 num);
7564                 *flagp |= HASWIDTH;
7565
7566                 /* override incorrect value set in reganode MJD */
7567                 Set_Node_Offset(ret, parse_start+1);
7568                 Set_Node_Cur_Length(ret); /* MJD */
7569                 nextchar(pRExC_state);
7570
7571             }
7572             break;
7573         }
7574         case 'g':
7575         case '1': case '2': case '3': case '4':
7576         case '5': case '6': case '7': case '8': case '9':
7577             {
7578                 I32 num;
7579                 bool isg = *RExC_parse == 'g';
7580                 bool isrel = 0;
7581                 bool hasbrace = 0;
7582                 if (isg) {
7583                     RExC_parse++;
7584                     if (*RExC_parse == '{') {
7585                         RExC_parse++;
7586                         hasbrace = 1;
7587                     }
7588                     if (*RExC_parse == '-') {
7589                         RExC_parse++;
7590                         isrel = 1;
7591                     }
7592                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7593                         if (isrel) RExC_parse--;
7594                         RExC_parse -= 2;
7595                         goto parse_named_seq;
7596                 }   }
7597                 num = atoi(RExC_parse);
7598                 if (isg && num == 0)
7599                     vFAIL("Reference to invalid group 0");
7600                 if (isrel) {
7601                     num = RExC_npar - num;
7602                     if (num < 1)
7603                         vFAIL("Reference to nonexistent or unclosed group");
7604                 }
7605                 if (!isg && num > 9 && num >= RExC_npar)
7606                     goto defchar;
7607                 else {
7608                     char * const parse_start = RExC_parse - 1; /* MJD */
7609                     while (isDIGIT(*RExC_parse))
7610                         RExC_parse++;
7611                     if (parse_start == RExC_parse - 1)
7612                         vFAIL("Unterminated \\g... pattern");
7613                     if (hasbrace) {
7614                         if (*RExC_parse != '}')
7615                             vFAIL("Unterminated \\g{...} pattern");
7616                         RExC_parse++;
7617                     }
7618                     if (!SIZE_ONLY) {
7619                         if (num > (I32)RExC_rx->nparens)
7620                             vFAIL("Reference to nonexistent group");
7621                     }
7622                     RExC_sawback = 1;
7623                     ret = reganode(pRExC_state,
7624                                    ((! FOLD)
7625                                      ? REF
7626                                      : (UNI_SEMANTICS)
7627                                        ? REFFU
7628                                        : (LOC)
7629                                          ? REFFL
7630                                          : REFF),
7631                                     num);
7632                     *flagp |= HASWIDTH;
7633
7634                     /* override incorrect value set in reganode MJD */
7635                     Set_Node_Offset(ret, parse_start+1);
7636                     Set_Node_Cur_Length(ret); /* MJD */
7637                     RExC_parse--;
7638                     nextchar(pRExC_state);
7639                 }
7640             }
7641             break;
7642         case '\0':
7643             if (RExC_parse >= RExC_end)
7644                 FAIL("Trailing \\");
7645             /* FALL THROUGH */
7646         default:
7647             /* Do not generate "unrecognized" warnings here, we fall
7648                back into the quick-grab loop below */
7649             parse_start--;
7650             goto defchar;
7651         }
7652         break;
7653
7654     case '#':
7655         if (RExC_flags & RXf_PMf_EXTENDED) {
7656             if ( reg_skipcomment( pRExC_state ) )
7657                 goto tryagain;
7658         }
7659         /* FALL THROUGH */
7660
7661     default:
7662         outer_default:{
7663             register STRLEN len;
7664             register UV ender;
7665             register char *p;
7666             char *s;
7667             STRLEN foldlen;
7668             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7669
7670             parse_start = RExC_parse - 1;
7671
7672             RExC_parse++;
7673
7674         defchar:
7675             ender = 0;
7676             ret = reg_node(pRExC_state,
7677                            (U8) ((! FOLD) ? EXACT
7678                                           : (LOC)
7679                                              ? EXACTFL
7680                                              : (UNI_SEMANTICS)
7681                                                ? EXACTFU
7682                                                : EXACTF)
7683                     );
7684             s = STRING(ret);
7685             for (len = 0, p = RExC_parse - 1;
7686               len < 127 && p < RExC_end;
7687               len++)
7688             {
7689                 char * const oldp = p;
7690
7691                 if (RExC_flags & RXf_PMf_EXTENDED)
7692                     p = regwhite( pRExC_state, p );
7693                 switch ((U8)*p) {
7694                 case LATIN_SMALL_LETTER_SHARP_S:
7695                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7696                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7697                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7698                                 goto normal_default;
7699                 case '^':
7700                 case '$':
7701                 case '.':
7702                 case '[':
7703                 case '(':
7704                 case ')':
7705                 case '|':
7706                     goto loopdone;
7707                 case '\\':
7708                     /* Literal Escapes Switch
7709
7710                        This switch is meant to handle escape sequences that
7711                        resolve to a literal character.
7712
7713                        Every escape sequence that represents something
7714                        else, like an assertion or a char class, is handled
7715                        in the switch marked 'Special Escapes' above in this
7716                        routine, but also has an entry here as anything that
7717                        isn't explicitly mentioned here will be treated as
7718                        an unescaped equivalent literal.
7719                     */
7720
7721                     switch ((U8)*++p) {
7722                     /* These are all the special escapes. */
7723                     case LATIN_SMALL_LETTER_SHARP_S:
7724                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7725                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7726                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7727                                 goto normal_default;
7728                     case 'A':             /* Start assertion */
7729                     case 'b': case 'B':   /* Word-boundary assertion*/
7730                     case 'C':             /* Single char !DANGEROUS! */
7731                     case 'd': case 'D':   /* digit class */
7732                     case 'g': case 'G':   /* generic-backref, pos assertion */
7733                     case 'h': case 'H':   /* HORIZWS */
7734                     case 'k': case 'K':   /* named backref, keep marker */
7735                     case 'N':             /* named char sequence */
7736                     case 'p': case 'P':   /* Unicode property */
7737                               case 'R':   /* LNBREAK */
7738                     case 's': case 'S':   /* space class */
7739                     case 'v': case 'V':   /* VERTWS */
7740                     case 'w': case 'W':   /* word class */
7741                     case 'X':             /* eXtended Unicode "combining character sequence" */
7742                     case 'z': case 'Z':   /* End of line/string assertion */
7743                         --p;
7744                         goto loopdone;
7745
7746                     /* Anything after here is an escape that resolves to a
7747                        literal. (Except digits, which may or may not)
7748                      */
7749                     case 'n':
7750                         ender = '\n';
7751                         p++;
7752                         break;
7753                     case 'r':
7754                         ender = '\r';
7755                         p++;
7756                         break;
7757                     case 't':
7758                         ender = '\t';
7759                         p++;
7760                         break;
7761                     case 'f':
7762                         ender = '\f';
7763                         p++;
7764                         break;
7765                     case 'e':
7766                           ender = ASCII_TO_NATIVE('\033');
7767                         p++;
7768                         break;
7769                     case 'a':
7770                           ender = ASCII_TO_NATIVE('\007');
7771                         p++;
7772                         break;
7773                     case 'o':
7774                         {
7775                             STRLEN brace_len = len;
7776                             UV result;
7777                             const char* error_msg;
7778
7779                             bool valid = grok_bslash_o(p,
7780                                                        &result,
7781                                                        &brace_len,
7782                                                        &error_msg,
7783                                                        1);
7784                             p += brace_len;
7785                             if (! valid) {
7786                                 RExC_parse = p; /* going to die anyway; point
7787                                                    to exact spot of failure */
7788                                 vFAIL(error_msg);
7789                             }
7790                             else
7791                             {
7792                                 ender = result;
7793                             }
7794                             if (PL_encoding && ender < 0x100) {
7795                                 goto recode_encoding;
7796                             }
7797                             if (ender > 0xff) {
7798                                 REQUIRE_UTF8;
7799                             }
7800                             break;
7801                         }
7802                     case 'x':
7803                         if (*++p == '{') {
7804                             char* const e = strchr(p, '}');
7805
7806                             if (!e) {
7807                                 RExC_parse = p + 1;
7808                                 vFAIL("Missing right brace on \\x{}");
7809                             }
7810                             else {
7811                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7812                                     | PERL_SCAN_DISALLOW_PREFIX;
7813                                 STRLEN numlen = e - p - 1;
7814                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7815                                 if (ender > 0xff)
7816                                     REQUIRE_UTF8;
7817                                 p = e + 1;
7818                             }
7819                         }
7820                         else {
7821                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7822                             STRLEN numlen = 2;
7823                             ender = grok_hex(p, &numlen, &flags, NULL);
7824                             p += numlen;
7825                         }
7826                         if (PL_encoding && ender < 0x100)
7827                             goto recode_encoding;
7828                         break;
7829                     case 'c':
7830                         p++;
7831                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7832                         break;
7833                     case '0': case '1': case '2': case '3':case '4':
7834                     case '5': case '6': case '7': case '8':case '9':
7835                         if (*p == '0' ||
7836                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7837                         {
7838                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7839                             STRLEN numlen = 3;
7840                             ender = grok_oct(p, &numlen, &flags, NULL);
7841                             if (ender > 0xff) {
7842                                 REQUIRE_UTF8;
7843                             }
7844                             p += numlen;
7845                         }
7846                         else {
7847                             --p;
7848                             goto loopdone;
7849                         }
7850                         if (PL_encoding && ender < 0x100)
7851                             goto recode_encoding;
7852                         break;
7853                     recode_encoding:
7854                         {
7855                             SV* enc = PL_encoding;
7856                             ender = reg_recode((const char)(U8)ender, &enc);
7857                             if (!enc && SIZE_ONLY)
7858                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7859                             REQUIRE_UTF8;
7860                         }
7861                         break;
7862                     case '\0':
7863                         if (p >= RExC_end)
7864                             FAIL("Trailing \\");
7865                         /* FALL THROUGH */
7866                     default:
7867                         if (!SIZE_ONLY&& isALPHA(*p))
7868                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7869                         goto normal_default;
7870                     }
7871                     break;
7872                 default:
7873                   normal_default:
7874                     if (UTF8_IS_START(*p) && UTF) {
7875                         STRLEN numlen;
7876                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7877                                                &numlen, UTF8_ALLOW_DEFAULT);
7878                         p += numlen;
7879                     }
7880                     else
7881                         ender = *p++;
7882                     break;
7883                 }
7884                 if ( RExC_flags & RXf_PMf_EXTENDED)
7885                     p = regwhite( pRExC_state, p );
7886                 if (UTF && FOLD) {
7887                     /* Prime the casefolded buffer. */
7888                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7889                 }
7890                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7891                     if (len)
7892                         p = oldp;
7893                     else if (UTF) {
7894                          if (FOLD) {
7895                               /* Emit all the Unicode characters. */
7896                               STRLEN numlen;
7897                               for (foldbuf = tmpbuf;
7898                                    foldlen;
7899                                    foldlen -= numlen) {
7900                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7901                                    if (numlen > 0) {
7902                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7903                                         s       += unilen;
7904                                         len     += unilen;
7905                                         /* In EBCDIC the numlen
7906                                          * and unilen can differ. */
7907                                         foldbuf += numlen;
7908                                         if (numlen >= foldlen)
7909                                              break;
7910                                    }
7911                                    else
7912                                         break; /* "Can't happen." */
7913                               }
7914                          }
7915                          else {
7916                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7917                               if (unilen > 0) {
7918                                    s   += unilen;
7919                                    len += unilen;
7920                               }
7921                          }
7922                     }
7923                     else {
7924                         len++;
7925                         REGC((char)ender, s++);
7926                     }
7927                     break;
7928                 }
7929                 if (UTF) {
7930                      if (FOLD) {
7931                           /* Emit all the Unicode characters. */
7932                           STRLEN numlen;
7933                           for (foldbuf = tmpbuf;
7934                                foldlen;
7935                                foldlen -= numlen) {
7936                                ender = utf8_to_uvchr(foldbuf, &numlen);
7937                                if (numlen > 0) {
7938                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7939                                     len     += unilen;
7940                                     s       += unilen;
7941                                     /* In EBCDIC the numlen
7942                                      * and unilen can differ. */
7943                                     foldbuf += numlen;
7944                                     if (numlen >= foldlen)
7945                                          break;
7946                                }
7947                                else
7948                                     break;
7949                           }
7950                      }
7951                      else {
7952                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7953                           if (unilen > 0) {
7954                                s   += unilen;
7955                                len += unilen;
7956                           }
7957                      }
7958                      len--;
7959                 }
7960                 else
7961                     REGC((char)ender, s++);
7962             }
7963         loopdone:
7964             RExC_parse = p - 1;
7965             Set_Node_Cur_Length(ret); /* MJD */
7966             nextchar(pRExC_state);
7967             {
7968                 /* len is STRLEN which is unsigned, need to copy to signed */
7969                 IV iv = len;
7970                 if (iv < 0)
7971                     vFAIL("Internal disaster");
7972             }
7973             if (len > 0)
7974                 *flagp |= HASWIDTH;
7975             if (len == 1 && UNI_IS_INVARIANT(ender))
7976                 *flagp |= SIMPLE;
7977
7978             if (SIZE_ONLY)
7979                 RExC_size += STR_SZ(len);
7980             else {
7981                 STR_LEN(ret) = len;
7982                 RExC_emit += STR_SZ(len);
7983             }
7984         }
7985         break;
7986     }
7987
7988     return(ret);
7989 }
7990
7991 STATIC char *
7992 S_regwhite( RExC_state_t *pRExC_state, char *p )
7993 {
7994     const char *e = RExC_end;
7995
7996     PERL_ARGS_ASSERT_REGWHITE;
7997
7998     while (p < e) {
7999         if (isSPACE(*p))
8000             ++p;
8001         else if (*p == '#') {
8002             bool ended = 0;
8003             do {
8004                 if (*p++ == '\n') {
8005                     ended = 1;
8006                     break;
8007                 }
8008             } while (p < e);
8009             if (!ended)
8010                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8011         }
8012         else
8013             break;
8014     }
8015     return p;
8016 }
8017
8018 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8019    Character classes ([:foo:]) can also be negated ([:^foo:]).
8020    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8021    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8022    but trigger failures because they are currently unimplemented. */
8023
8024 #define POSIXCC_DONE(c)   ((c) == ':')
8025 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8026 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8027
8028 STATIC I32
8029 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8030 {
8031     dVAR;
8032     I32 namedclass = OOB_NAMEDCLASS;
8033
8034     PERL_ARGS_ASSERT_REGPPOSIXCC;
8035
8036     if (value == '[' && RExC_parse + 1 < RExC_end &&
8037         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8038         POSIXCC(UCHARAT(RExC_parse))) {
8039         const char c = UCHARAT(RExC_parse);
8040         char* const s = RExC_parse++;
8041
8042         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8043             RExC_parse++;
8044         if (RExC_parse == RExC_end)
8045             /* Grandfather lone [:, [=, [. */
8046             RExC_parse = s;
8047         else {
8048             const char* const t = RExC_parse++; /* skip over the c */
8049             assert(*t == c);
8050
8051             if (UCHARAT(RExC_parse) == ']') {
8052                 const char *posixcc = s + 1;
8053                 RExC_parse++; /* skip over the ending ] */
8054
8055                 if (*s == ':') {
8056                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8057                     const I32 skip = t - posixcc;
8058
8059                     /* Initially switch on the length of the name.  */
8060                     switch (skip) {
8061                     case 4:
8062                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8063                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8064                         break;
8065                     case 5:
8066                         /* Names all of length 5.  */
8067                         /* alnum alpha ascii blank cntrl digit graph lower
8068                            print punct space upper  */
8069                         /* Offset 4 gives the best switch position.  */
8070                         switch (posixcc[4]) {
8071                         case 'a':
8072                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8073                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8074                             break;
8075                         case 'e':
8076                             if (memEQ(posixcc, "spac", 4)) /* space */
8077                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8078                             break;
8079                         case 'h':
8080                             if (memEQ(posixcc, "grap", 4)) /* graph */
8081                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8082                             break;
8083                         case 'i':
8084                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8085                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8086                             break;
8087                         case 'k':
8088                             if (memEQ(posixcc, "blan", 4)) /* blank */
8089                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8090                             break;
8091                         case 'l':
8092                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8093                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8094                             break;
8095                         case 'm':
8096                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8097                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8098                             break;
8099                         case 'r':
8100                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8101                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8102                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8103                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8104                             break;
8105                         case 't':
8106                             if (memEQ(posixcc, "digi", 4)) /* digit */
8107                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8108                             else if (memEQ(posixcc, "prin", 4)) /* print */
8109                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8110                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8111                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8112                             break;
8113                         }
8114                         break;
8115                     case 6:
8116                         if (memEQ(posixcc, "xdigit", 6))
8117                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8118                         break;
8119                     }
8120
8121                     if (namedclass == OOB_NAMEDCLASS)
8122                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8123                                       t - s - 1, s + 1);
8124                     assert (posixcc[skip] == ':');
8125                     assert (posixcc[skip+1] == ']');
8126                 } else if (!SIZE_ONLY) {
8127                     /* [[=foo=]] and [[.foo.]] are still future. */
8128
8129                     /* adjust RExC_parse so the warning shows after
8130                        the class closes */
8131                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8132                         RExC_parse++;
8133                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8134                 }
8135             } else {
8136                 /* Maternal grandfather:
8137                  * "[:" ending in ":" but not in ":]" */
8138                 RExC_parse = s;
8139             }
8140         }
8141     }
8142
8143     return namedclass;
8144 }
8145
8146 STATIC void
8147 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8148 {
8149     dVAR;
8150
8151     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8152
8153     if (POSIXCC(UCHARAT(RExC_parse))) {
8154         const char *s = RExC_parse;
8155         const char  c = *s++;
8156
8157         while (isALNUM(*s))
8158             s++;
8159         if (*s && c == *s && s[1] == ']') {
8160             ckWARN3reg(s+2,
8161                        "POSIX syntax [%c %c] belongs inside character classes",
8162                        c, c);
8163
8164             /* [[=foo=]] and [[.foo.]] are still future. */
8165             if (POSIXCC_NOTYET(c)) {
8166                 /* adjust RExC_parse so the error shows after
8167                    the class closes */
8168                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8169                     NOOP;
8170                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8171             }
8172         }
8173     }
8174 }
8175
8176 /* No locale test */
8177 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8178 ANYOF_##NAME:                                           \
8179         for (value = 0; value < 256; value++)           \
8180             if (TEST)                                   \
8181                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8182     yesno = '+';                                        \
8183     what = WORD;                                        \
8184     break;                                              \
8185 case ANYOF_N##NAME:                                     \
8186         for (value = 0; value < 256; value++)           \
8187             if (!TEST)                                  \
8188                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8189     yesno = '!';                                        \
8190     what = WORD;                                        \
8191     break
8192
8193 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8194  * there are two tests passed in, to use depending on that. There aren't any
8195  * cases where the label is different from the name, so no need for that
8196  * parameter */
8197 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8198 ANYOF_##NAME:                                           \
8199     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8200     else if (UNI_SEMANTICS) {                           \
8201         for (value = 0; value < 256; value++) {         \
8202             if (TEST_8) stored +=                       \
8203                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8204         }                                               \
8205     }                                                   \
8206     else {                                              \
8207         for (value = 0; value < 128; value++) {         \
8208             if (TEST_7) stored +=                       \
8209                        S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
8210         }                                               \
8211     }                                                   \
8212     yesno = '+';                                        \
8213     what = WORD;                                        \
8214     break;                                              \
8215 case ANYOF_N##NAME:                                     \
8216     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8217     else if (UNI_SEMANTICS) {                           \
8218         for (value = 0; value < 256; value++) {         \
8219             if (! TEST_8) stored +=                     \
8220                         S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8221         }                                               \
8222     }                                                   \
8223     else {                                              \
8224         for (value = 0; value < 128; value++) {         \
8225             if (! TEST_7) stored +=                     \
8226                         S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8227         }                                               \
8228         for (value = 128; value < 256; value++) {         \
8229                         S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8230         }                                               \
8231     }                                                   \
8232     yesno = '!';                                        \
8233     what = WORD;                                        \
8234     break
8235
8236 /*
8237    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8238    so that it is possible to override the option here without having to
8239    rebuild the entire core. as we are required to do if we change regcomp.h
8240    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8241 */
8242 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8243 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8244 #endif
8245
8246 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8247 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8248 #else
8249 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8250 #endif
8251
8252 STATIC U8
8253 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8254 {
8255
8256     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8257      * Locale folding is done at run-time, so this function should not be
8258      * called for nodes that are for locales.
8259      *
8260      * This function simply sets the bit corresponding to the fold of the input
8261      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8262      * 'F' is 'f'.
8263      *
8264      * It also sets any necessary flags, and returns the number of bits that
8265      * actually changed from 0 to 1 */
8266
8267     U8 stored = 0;
8268     U8 fold;
8269
8270     fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8271                            : PL_fold[value];
8272
8273     /* It assumes the bit for 'value' has already been set */
8274     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8275         ANYOF_BITMAP_SET(node, fold);
8276         stored++;
8277     }
8278
8279     /* The fold of the German sharp s is two ASCII characters, so isn't in the
8280      * bitmap and doesn't have to be in utf8, but we only process it if unicode
8281      * semantics are called for */
8282     if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8283         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8284     }
8285     else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8286              || (! UNI_SEMANTICS
8287                  && ! isASCII(value)
8288                  && PL_fold_latin1[value] != value))
8289     {   /* A character that has a fold outside of Latin1 matches outside the
8290            bitmap, but only when the target string is utf8.  Similarly when we
8291            don't have unicode semantics for the above ASCII Latin-1 characters,
8292            and they have a fold, they should match if the target is utf8, and
8293            not otherwise */
8294         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8295     }
8296
8297     return stored;
8298 }
8299
8300
8301 PERL_STATIC_INLINE U8
8302 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8303 {
8304     /* This inline function sets a bit in the bitmap if not already set, and if
8305      * appropriate, its fold, returning the number of bits that actually
8306      * changed from 0 to 1 */
8307
8308     U8 stored;
8309
8310     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8311         return 0;
8312     }
8313
8314     ANYOF_BITMAP_SET(node, value);
8315     stored = 1;
8316
8317     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8318         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8319     }
8320
8321     return stored;
8322 }
8323
8324 /*
8325    parse a class specification and produce either an ANYOF node that
8326    matches the pattern or if the pattern matches a single char only and
8327    that char is < 256 and we are case insensitive then we produce an
8328    EXACT node instead.
8329 */
8330
8331 STATIC regnode *
8332 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8333 {
8334     dVAR;
8335     register UV nextvalue;
8336     register IV prevvalue = OOB_UNICODE;
8337     register IV range = 0;
8338     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8339     register regnode *ret;
8340     STRLEN numlen;
8341     IV namedclass;
8342     char *rangebegin = NULL;
8343     bool need_class = 0;
8344     SV *listsv = NULL;
8345     UV n;
8346     AV* unicode_alternate  = NULL;
8347 #ifdef EBCDIC
8348     UV literal_endpoint = 0;
8349 #endif
8350     UV stored = 0;  /* how many chars stored in the bitmap */
8351
8352     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8353         case we need to change the emitted regop to an EXACT. */
8354     const char * orig_parse = RExC_parse;
8355     GET_RE_DEBUG_FLAGS_DECL;
8356
8357     PERL_ARGS_ASSERT_REGCLASS;
8358 #ifndef DEBUGGING
8359     PERL_UNUSED_ARG(depth);
8360 #endif
8361
8362     DEBUG_PARSE("clas");
8363
8364     /* Assume we are going to generate an ANYOF node. */
8365     ret = reganode(pRExC_state, ANYOF, 0);
8366
8367     if (!SIZE_ONLY)
8368         ANYOF_FLAGS(ret) = 0;
8369
8370     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8371         RExC_naughty++;
8372         RExC_parse++;
8373         if (!SIZE_ONLY)
8374             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8375     }
8376
8377     if (SIZE_ONLY) {
8378         RExC_size += ANYOF_SKIP;
8379 #ifdef ANYOF_ADD_LOC_SKIP
8380         if (LOC) {
8381             RExC_size += ANYOF_ADD_LOC_SKIP;
8382         }
8383 #endif
8384         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8385     }
8386     else {
8387         RExC_emit += ANYOF_SKIP;
8388         if (LOC) {
8389             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8390 #ifdef ANYOF_ADD_LOC_SKIP
8391             RExC_emit += ANYOF_ADD_LOC_SKIP;
8392 #endif
8393         }
8394         ANYOF_BITMAP_ZERO(ret);
8395         listsv = newSVpvs("# comment\n");
8396     }
8397
8398     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8399
8400     if (!SIZE_ONLY && POSIXCC(nextvalue))
8401         checkposixcc(pRExC_state);
8402
8403     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8404     if (UCHARAT(RExC_parse) == ']')
8405         goto charclassloop;
8406
8407 parseit:
8408     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8409
8410     charclassloop:
8411
8412         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8413
8414         if (!range)
8415             rangebegin = RExC_parse;
8416         if (UTF) {
8417             value = utf8n_to_uvchr((U8*)RExC_parse,
8418                                    RExC_end - RExC_parse,
8419                                    &numlen, UTF8_ALLOW_DEFAULT);
8420             RExC_parse += numlen;
8421         }
8422         else
8423             value = UCHARAT(RExC_parse++);
8424
8425         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8426         if (value == '[' && POSIXCC(nextvalue))
8427             namedclass = regpposixcc(pRExC_state, value);
8428         else if (value == '\\') {
8429             if (UTF) {
8430                 value = utf8n_to_uvchr((U8*)RExC_parse,
8431                                    RExC_end - RExC_parse,
8432                                    &numlen, UTF8_ALLOW_DEFAULT);
8433                 RExC_parse += numlen;
8434             }
8435             else
8436                 value = UCHARAT(RExC_parse++);
8437             /* Some compilers cannot handle switching on 64-bit integer
8438              * values, therefore value cannot be an UV.  Yes, this will
8439              * be a problem later if we want switch on Unicode.
8440              * A similar issue a little bit later when switching on
8441              * namedclass. --jhi */
8442             switch ((I32)value) {
8443             case 'w':   namedclass = ANYOF_ALNUM;       break;
8444             case 'W':   namedclass = ANYOF_NALNUM;      break;
8445             case 's':   namedclass = ANYOF_SPACE;       break;
8446             case 'S':   namedclass = ANYOF_NSPACE;      break;
8447             case 'd':   namedclass = ANYOF_DIGIT;       break;
8448             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8449             case 'v':   namedclass = ANYOF_VERTWS;      break;
8450             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8451             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8452             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8453             case 'N':  /* Handle \N{NAME} in class */
8454                 {
8455                     /* We only pay attention to the first char of
8456                     multichar strings being returned. I kinda wonder
8457                     if this makes sense as it does change the behaviour
8458                     from earlier versions, OTOH that behaviour was broken
8459                     as well. */
8460                     UV v; /* value is register so we cant & it /grrr */
8461                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8462                         goto parseit;
8463                     }
8464                     value= v;
8465                 }
8466                 break;
8467             case 'p':
8468             case 'P':
8469                 {
8470                 char *e;
8471                 if (RExC_parse >= RExC_end)
8472                     vFAIL2("Empty \\%c{}", (U8)value);
8473                 if (*RExC_parse == '{') {
8474                     const U8 c = (U8)value;
8475                     e = strchr(RExC_parse++, '}');
8476                     if (!e)
8477                         vFAIL2("Missing right brace on \\%c{}", c);
8478                     while (isSPACE(UCHARAT(RExC_parse)))
8479                         RExC_parse++;
8480                     if (e == RExC_parse)
8481                         vFAIL2("Empty \\%c{}", c);
8482                     n = e - RExC_parse;
8483                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8484                         n--;
8485                 }
8486                 else {
8487                     e = RExC_parse;
8488                     n = 1;
8489                 }
8490                 if (!SIZE_ONLY) {
8491                     if (UCHARAT(RExC_parse) == '^') {
8492                          RExC_parse++;
8493                          n--;
8494                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8495                          while (isSPACE(UCHARAT(RExC_parse))) {
8496                               RExC_parse++;
8497                               n--;
8498                          }
8499                     }
8500                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8501                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8502                 }
8503                 RExC_parse = e + 1;
8504
8505                 /* The \p could match something in the Latin1 range, hence
8506                  * something that isn't utf8 */
8507                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8508                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8509                 }
8510                 break;
8511             case 'n':   value = '\n';                   break;
8512             case 'r':   value = '\r';                   break;
8513             case 't':   value = '\t';                   break;
8514             case 'f':   value = '\f';                   break;
8515             case 'b':   value = '\b';                   break;
8516             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8517             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8518             case 'o':
8519                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8520                 {
8521                     const char* error_msg;
8522                     bool valid = grok_bslash_o(RExC_parse,
8523                                                &value,
8524                                                &numlen,
8525                                                &error_msg,
8526                                                SIZE_ONLY);
8527                     RExC_parse += numlen;
8528                     if (! valid) {
8529                         vFAIL(error_msg);
8530                     }
8531                 }
8532                 if (PL_encoding && value < 0x100) {
8533                     goto recode_encoding;
8534                 }
8535                 break;
8536             case 'x':
8537                 if (*RExC_parse == '{') {
8538                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8539                         | PERL_SCAN_DISALLOW_PREFIX;
8540                     char * const e = strchr(RExC_parse++, '}');
8541                     if (!e)
8542                         vFAIL("Missing right brace on \\x{}");
8543
8544                     numlen = e - RExC_parse;
8545                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8546                     RExC_parse = e + 1;
8547                 }
8548                 else {
8549                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8550                     numlen = 2;
8551                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8552                     RExC_parse += numlen;
8553                 }
8554                 if (PL_encoding && value < 0x100)
8555                     goto recode_encoding;
8556                 break;
8557             case 'c':
8558                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8559                 break;
8560             case '0': case '1': case '2': case '3': case '4':
8561             case '5': case '6': case '7':
8562                 {
8563                     /* Take 1-3 octal digits */
8564                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8565                     numlen = 3;
8566                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8567                     RExC_parse += numlen;
8568                     if (PL_encoding && value < 0x100)
8569                         goto recode_encoding;
8570                     break;
8571                 }
8572             recode_encoding:
8573                 {
8574                     SV* enc = PL_encoding;
8575                     value = reg_recode((const char)(U8)value, &enc);
8576                     if (!enc && SIZE_ONLY)
8577                         ckWARNreg(RExC_parse,
8578                                   "Invalid escape in the specified encoding");
8579                     break;
8580                 }
8581             default:
8582                 /* Allow \_ to not give an error */
8583                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8584                     ckWARN2reg(RExC_parse,
8585                                "Unrecognized escape \\%c in character class passed through",
8586                                (int)value);
8587                 }
8588                 break;
8589             }
8590         } /* end of \blah */
8591 #ifdef EBCDIC
8592         else
8593             literal_endpoint++;
8594 #endif
8595
8596         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8597
8598             /* What matches in a locale is not known until runtime, so need to
8599              * (one time per class) allocate extra space to pass to regexec.
8600              * The space will contain a bit for each named class that is to be
8601              * matched against.  This isn't needed for \p{} and pseudo-classes,
8602              * as they are not affected by locale, and hence are dealt with
8603              * separately */
8604             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8605                 need_class = 1;
8606                 if (SIZE_ONLY) {
8607 #ifdef ANYOF_CLASS_ADD_SKIP
8608                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8609 #endif
8610                 }
8611                 else {
8612 #ifdef ANYOF_CLASS_ADD_SKIP
8613                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8614 #endif
8615                     ANYOF_CLASS_ZERO(ret);
8616                 }
8617                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8618             }
8619
8620             /* a bad range like a-\d, a-[:digit:] ? */
8621             if (range) {
8622                 if (!SIZE_ONLY) {
8623                     const int w =
8624                         RExC_parse >= rangebegin ?
8625                         RExC_parse - rangebegin : 0;
8626                     ckWARN4reg(RExC_parse,
8627                                "False [] range \"%*.*s\"",
8628                                w, w, rangebegin);
8629
8630                     if (prevvalue < 256) {
8631                         stored +=
8632                          S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8633                         stored +=
8634                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8635                     }
8636                     else {
8637                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8638                         Perl_sv_catpvf(aTHX_ listsv,
8639                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8640                     }
8641                 }
8642
8643                 range = 0; /* this was not a true range */
8644             }
8645
8646
8647
8648             if (!SIZE_ONLY) {
8649                 const char *what = NULL;
8650                 char yesno = 0;
8651
8652                 /* Possible truncation here but in some 64-bit environments
8653                  * the compiler gets heartburn about switch on 64-bit values.
8654                  * A similar issue a little earlier when switching on value.
8655                  * --jhi */
8656                 switch ((I32)namedclass) {
8657
8658                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8659                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8660                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8661                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8662                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8663                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8664                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8665                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8666                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8667                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8668 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8669                 /* \s, \w match all unicode if utf8. */
8670                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8671                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8672 #else
8673                 /* \s, \w match ascii and locale only */
8674                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8675                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8676 #endif
8677                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8678                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8679                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8680                 case ANYOF_ASCII:
8681                     if (LOC)
8682                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8683                     else {
8684                         for (value = 0; value < 128; value++)
8685                             stored +=
8686                               S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
8687                     }
8688                     yesno = '+';
8689                     what = NULL;        /* Doesn't match outside ascii, so
8690                                            don't want to add +utf8:: */
8691                     break;
8692                 case ANYOF_NASCII:
8693                     if (LOC)
8694                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8695                     else {
8696                         for (value = 128; value < 256; value++)
8697                             stored +=
8698                               S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
8699                     }
8700                     yesno = '!';
8701                     what = "ASCII";
8702                     break;
8703                 case ANYOF_DIGIT:
8704                     if (LOC)
8705                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8706                     else {
8707                         /* consecutive digits assumed */
8708                         for (value = '0'; value <= '9'; value++)
8709                             stored +=
8710                               S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8711                     }
8712                     yesno = '+';
8713                     what = POSIX_CC_UNI_NAME("Digit");
8714                     break;
8715                 case ANYOF_NDIGIT:
8716                     if (LOC)
8717                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8718                     else {
8719                         /* consecutive digits assumed */
8720                         for (value = 0; value < '0'; value++)
8721                             stored +=
8722                               S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8723                         for (value = '9' + 1; value < 256; value++)
8724                             stored +=
8725                               S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8726                     }
8727                     yesno = '!';
8728                     what = POSIX_CC_UNI_NAME("Digit");
8729                     break;
8730                 case ANYOF_MAX:
8731                     /* this is to handle \p and \P */
8732                     break;
8733                 default:
8734                     vFAIL("Invalid [::] class");
8735                     break;
8736                 }
8737                 if (what) {
8738                     /* Strings such as "+utf8::isWord\n" */
8739                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8740                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8741                 }
8742
8743                 continue;
8744             }
8745         } /* end of namedclass \blah */
8746
8747         if (range) {
8748             if (prevvalue > (IV)value) /* b-a */ {
8749                 const int w = RExC_parse - rangebegin;
8750                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8751                 range = 0; /* not a valid range */
8752             }
8753         }
8754         else {
8755             prevvalue = value; /* save the beginning of the range */
8756             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8757                 RExC_parse[1] != ']') {
8758                 RExC_parse++;
8759
8760                 /* a bad range like \w-, [:word:]- ? */
8761                 if (namedclass > OOB_NAMEDCLASS) {
8762                     if (ckWARN(WARN_REGEXP)) {
8763                         const int w =
8764                             RExC_parse >= rangebegin ?
8765                             RExC_parse - rangebegin : 0;
8766                         vWARN4(RExC_parse,
8767                                "False [] range \"%*.*s\"",
8768                                w, w, rangebegin);
8769                     }
8770                     if (!SIZE_ONLY)
8771                         stored +=
8772                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8773                 } else
8774                     range = 1;  /* yeah, it's a range! */
8775                 continue;       /* but do it the next time */
8776             }
8777         }
8778
8779         /* now is the next time */
8780         if (!SIZE_ONLY) {
8781             if (prevvalue < 256) {
8782                 const IV ceilvalue = value < 256 ? value : 255;
8783                 IV i;
8784 #ifdef EBCDIC
8785                 /* In EBCDIC [\x89-\x91] should include
8786                  * the \x8e but [i-j] should not. */
8787                 if (literal_endpoint == 2 &&
8788                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8789                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8790                 {
8791                     if (isLOWER(prevvalue)) {
8792                         for (i = prevvalue; i <= ceilvalue; i++)
8793                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8794                                 stored +=
8795                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8796                             }
8797                     } else {
8798                         for (i = prevvalue; i <= ceilvalue; i++)
8799                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8800                                 stored +=
8801                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8802                             }
8803                     }
8804                 }
8805                 else
8806 #endif
8807                       for (i = prevvalue; i <= ceilvalue; i++) {
8808                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8809                       }
8810           }
8811           if (value > 255 || UTF) {
8812                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8813                 const UV natvalue      = NATIVE_TO_UNI(value);
8814
8815                 /* If the code point requires utf8 to represent, and we are not
8816                  * folding, it can't match unless the target is in utf8.  Only
8817                  * a few code points above 255 fold to below it, so XXX an
8818                  * optimization would be to know which ones and set the flag
8819                  * appropriately. */
8820                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8821                                     ? ANYOF_NONBITMAP
8822                                     : ANYOF_UTF8;
8823                 if (prevnatvalue < natvalue) { /* what about > ? */
8824                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8825                                    prevnatvalue, natvalue);
8826                 }
8827                 else if (prevnatvalue == natvalue) {
8828                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8829                     if (FOLD) {
8830                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8831                          STRLEN foldlen;
8832                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8833
8834 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8835                          if (RExC_precomp[0] == ':' &&
8836                              RExC_precomp[1] == '[' &&
8837                              (f == 0xDF || f == 0x92)) {
8838                              f = NATIVE_TO_UNI(f);
8839                         }
8840 #endif
8841                          /* If folding and foldable and a single
8842                           * character, insert also the folded version
8843                           * to the charclass. */
8844                          if (f != value) {
8845 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8846                              if ((RExC_precomp[0] == ':' &&
8847                                   RExC_precomp[1] == '[' &&
8848                                   (f == 0xA2 &&
8849                                    (value == 0xFB05 || value == 0xFB06))) ?
8850                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8851                                  foldlen == (STRLEN)UNISKIP(f) )
8852 #else
8853                               if (foldlen == (STRLEN)UNISKIP(f))
8854 #endif
8855                                   Perl_sv_catpvf(aTHX_ listsv,
8856                                                  "%04"UVxf"\n", f);
8857                               else {
8858                                   /* Any multicharacter foldings
8859                                    * require the following transform:
8860                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8861                                    * where E folds into "pq" and F folds
8862                                    * into "rst", all other characters
8863                                    * fold to single characters.  We save
8864                                    * away these multicharacter foldings,
8865                                    * to be later saved as part of the
8866                                    * additional "s" data. */
8867                                   SV *sv;
8868
8869                                   if (!unicode_alternate)
8870                                       unicode_alternate = newAV();
8871                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8872                                                      TRUE);
8873                                   av_push(unicode_alternate, sv);
8874                               }
8875                          }
8876
8877                          /* If folding and the value is one of the Greek
8878                           * sigmas insert a few more sigmas to make the
8879                           * folding rules of the sigmas to work right.
8880                           * Note that not all the possible combinations
8881                           * are handled here: some of them are handled
8882                           * by the standard folding rules, and some of
8883                           * them (literal or EXACTF cases) are handled
8884                           * during runtime in regexec.c:S_find_byclass(). */
8885                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8886                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8887                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8888                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8889                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8890                          }
8891                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8892                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8893                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8894                     }
8895                 }
8896             }
8897 #ifdef EBCDIC
8898             literal_endpoint = 0;
8899 #endif
8900         }
8901
8902         range = 0; /* this range (if it was one) is done now */
8903     }
8904
8905
8906
8907     if (SIZE_ONLY)
8908         return ret;
8909     /****** !SIZE_ONLY AFTER HERE *********/
8910
8911     /* Folding in the bitmap is taken care of above, but not for locale, for
8912      * which we have to wait to see what folding is in effect at runtime, and
8913      * for things not in the bitmap */
8914     if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8915         ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8916     }
8917
8918     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
8919      * optimize locale.  Doing so perhaps could be done as long as there is
8920      * nothing like \w in it; some thought also would have to be given to the
8921      * interaction with above 0x100 chars */
8922     if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8923         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8924             ANYOF_BITMAP(ret)[value] ^= 0xFF;
8925         stored = 256 - stored;
8926
8927         /* The inversion means that everything above 255 is matched */
8928         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8929     }
8930
8931     /* A single character class can be "optimized" into an EXACTish node.
8932      * Note that since we don't currently count how many characters there are
8933      * outside the bitmap, we are XXX missing optimization possibilities for
8934      * them.  This optimization can't happen unless this is a truly single
8935      * character class, which means that it can't be an inversion into a
8936      * many-character class, and there must be no possibility of there being
8937      * things outside the bitmap.  'stored' (only) for locales doesn't include
8938      * \w, etc, so have to make a special test that they aren't present */
8939     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8940         && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8941                               || (! ANYOF_CLASS_TEST_ANY_SET(ret))))))
8942     {
8943         /* Note that the information needed to decide to do this optimization
8944          * is not currently available until the 2nd pass, and that the actually
8945          * used EXACT node takes less space than the calculated ANYOF node, and
8946          * hence the amount of space calculated in the first pass is larger
8947          * than actually used, so this optimization doesn't gain us any space.
8948          * But an EXACT node is faster than an ANYOF node, and can be combined
8949          * with any adjacent EXACT nodes later by the optimizer for further
8950          * gains. */
8951
8952         const char * cur_parse= RExC_parse;
8953         RExC_emit = (regnode *)orig_emit;
8954         RExC_parse = (char *)orig_parse;
8955
8956         /* (A locale node can have 1 point and be folded; all the other folds
8957          * will include the fold, hence will have 2 points, so we won't get
8958          * here with ANYOF_FOLD set unless it is also locale) */
8959         ret = reg_node(pRExC_state, (U8) (! (ANYOF_FLAGS(ret) & ANYOF_FOLD))
8960                                          ? EXACT
8961                                          : EXACTFL
8962                     );
8963         RExC_parse = (char *)cur_parse;
8964         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
8965             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
8966             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
8967             STR_LEN(ret)= 2;
8968             RExC_emit += STR_SZ(2);
8969         }
8970         else {
8971             *STRING(ret)= (char)value;
8972             STR_LEN(ret)= 1;
8973             RExC_emit += STR_SZ(1);
8974         }
8975         SvREFCNT_dec(listsv);
8976         return ret;
8977
8978         /* (A 2-character class of the very special form like [aA] could be
8979          * optimized into an EXACTFish node, but only for non-locales, and for
8980          * characters which only have the two folds; so things like 'fF' and
8981          * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'.
8982          * Since we don't have that information currently conveniently
8983          * available, skip the optimization) */
8984     }
8985
8986     {
8987         AV * const av = newAV();
8988         SV *rv;
8989         /* The 0th element stores the character class description
8990          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8991          * to initialize the appropriate swash (which gets stored in
8992          * the 1st element), and also useful for dumping the regnode.
8993          * The 2nd element stores the multicharacter foldings,
8994          * used later (regexec.c:S_reginclass()). */
8995         av_store(av, 0, listsv);
8996         av_store(av, 1, NULL);
8997         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8998         rv = newRV_noinc(MUTABLE_SV(av));
8999         n = add_data(pRExC_state, 1, "s");
9000         RExC_rxi->data->data[n] = (void*)rv;
9001         ARG_SET(ret, n);
9002     }
9003     return ret;
9004 }
9005 #undef _C_C_T_
9006
9007
9008 /* reg_skipcomment()
9009
9010    Absorbs an /x style # comments from the input stream.
9011    Returns true if there is more text remaining in the stream.
9012    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9013    terminates the pattern without including a newline.
9014
9015    Note its the callers responsibility to ensure that we are
9016    actually in /x mode
9017
9018 */
9019
9020 STATIC bool
9021 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9022 {
9023     bool ended = 0;
9024
9025     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9026
9027     while (RExC_parse < RExC_end)
9028         if (*RExC_parse++ == '\n') {
9029             ended = 1;
9030             break;
9031         }
9032     if (!ended) {
9033         /* we ran off the end of the pattern without ending
9034            the comment, so we have to add an \n when wrapping */
9035         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9036         return 0;
9037     } else
9038         return 1;
9039 }
9040
9041 /* nextchar()
9042
9043    Advances the parse position, and optionally absorbs
9044    "whitespace" from the inputstream.
9045
9046    Without /x "whitespace" means (?#...) style comments only,
9047    with /x this means (?#...) and # comments and whitespace proper.
9048
9049    Returns the RExC_parse point from BEFORE the scan occurs.
9050
9051    This is the /x friendly way of saying RExC_parse++.
9052 */
9053
9054 STATIC char*
9055 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9056 {
9057     char* const retval = RExC_parse++;
9058
9059     PERL_ARGS_ASSERT_NEXTCHAR;
9060
9061     for (;;) {
9062         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9063                 RExC_parse[2] == '#') {
9064             while (*RExC_parse != ')') {
9065                 if (RExC_parse == RExC_end)
9066                     FAIL("Sequence (?#... not terminated");
9067                 RExC_parse++;
9068             }
9069             RExC_parse++;
9070             continue;
9071         }
9072         if (RExC_flags & RXf_PMf_EXTENDED) {
9073             if (isSPACE(*RExC_parse)) {
9074                 RExC_parse++;
9075                 continue;
9076             }
9077             else if (*RExC_parse == '#') {
9078                 if ( reg_skipcomment( pRExC_state ) )
9079                     continue;
9080             }
9081         }
9082         return retval;
9083     }
9084 }
9085
9086 /*
9087 - reg_node - emit a node
9088 */
9089 STATIC regnode *                        /* Location. */
9090 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9091 {
9092     dVAR;
9093     register regnode *ptr;
9094     regnode * const ret = RExC_emit;
9095     GET_RE_DEBUG_FLAGS_DECL;
9096
9097     PERL_ARGS_ASSERT_REG_NODE;
9098
9099     if (SIZE_ONLY) {
9100         SIZE_ALIGN(RExC_size);
9101         RExC_size += 1;
9102         return(ret);
9103     }
9104     if (RExC_emit >= RExC_emit_bound)
9105         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9106
9107     NODE_ALIGN_FILL(ret);
9108     ptr = ret;
9109     FILL_ADVANCE_NODE(ptr, op);
9110 #ifdef RE_TRACK_PATTERN_OFFSETS
9111     if (RExC_offsets) {         /* MJD */
9112         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9113               "reg_node", __LINE__,
9114               PL_reg_name[op],
9115               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9116                 ? "Overwriting end of array!\n" : "OK",
9117               (UV)(RExC_emit - RExC_emit_start),
9118               (UV)(RExC_parse - RExC_start),
9119               (UV)RExC_offsets[0]));
9120         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9121     }
9122 #endif
9123     RExC_emit = ptr;
9124     return(ret);
9125 }
9126
9127 /*
9128 - reganode - emit a node with an argument
9129 */
9130 STATIC regnode *                        /* Location. */
9131 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9132 {
9133     dVAR;
9134     register regnode *ptr;
9135     regnode * const ret = RExC_emit;
9136     GET_RE_DEBUG_FLAGS_DECL;
9137
9138     PERL_ARGS_ASSERT_REGANODE;
9139
9140     if (SIZE_ONLY) {
9141         SIZE_ALIGN(RExC_size);
9142         RExC_size += 2;
9143         /*
9144            We can't do this:
9145
9146            assert(2==regarglen[op]+1);
9147
9148            Anything larger than this has to allocate the extra amount.
9149            If we changed this to be:
9150
9151            RExC_size += (1 + regarglen[op]);
9152
9153            then it wouldn't matter. Its not clear what side effect
9154            might come from that so its not done so far.
9155            -- dmq
9156         */
9157         return(ret);
9158     }
9159     if (RExC_emit >= RExC_emit_bound)
9160         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9161
9162     NODE_ALIGN_FILL(ret);
9163     ptr = ret;
9164     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9165 #ifdef RE_TRACK_PATTERN_OFFSETS
9166     if (RExC_offsets) {         /* MJD */
9167         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9168               "reganode",
9169               __LINE__,
9170               PL_reg_name[op],
9171               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9172               "Overwriting end of array!\n" : "OK",
9173               (UV)(RExC_emit - RExC_emit_start),
9174               (UV)(RExC_parse - RExC_start),
9175               (UV)RExC_offsets[0]));
9176         Set_Cur_Node_Offset;
9177     }
9178 #endif
9179     RExC_emit = ptr;
9180     return(ret);
9181 }
9182
9183 /*
9184 - reguni - emit (if appropriate) a Unicode character
9185 */
9186 STATIC STRLEN
9187 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9188 {
9189     dVAR;
9190
9191     PERL_ARGS_ASSERT_REGUNI;
9192
9193     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9194 }
9195
9196 /*
9197 - reginsert - insert an operator in front of already-emitted operand
9198 *
9199 * Means relocating the operand.
9200 */
9201 STATIC void
9202 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9203 {
9204     dVAR;
9205     register regnode *src;
9206     register regnode *dst;
9207     register regnode *place;
9208     const int offset = regarglen[(U8)op];
9209     const int size = NODE_STEP_REGNODE + offset;
9210     GET_RE_DEBUG_FLAGS_DECL;
9211
9212     PERL_ARGS_ASSERT_REGINSERT;
9213     PERL_UNUSED_ARG(depth);
9214 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9215     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9216     if (SIZE_ONLY) {
9217         RExC_size += size;
9218         return;
9219     }
9220
9221     src = RExC_emit;
9222     RExC_emit += size;
9223     dst = RExC_emit;
9224     if (RExC_open_parens) {
9225         int paren;
9226         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9227         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9228             if ( RExC_open_parens[paren] >= opnd ) {
9229                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9230                 RExC_open_parens[paren] += size;
9231             } else {
9232                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9233             }
9234             if ( RExC_close_parens[paren] >= opnd ) {
9235                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9236                 RExC_close_parens[paren] += size;
9237             } else {
9238                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9239             }
9240         }
9241     }
9242
9243     while (src > opnd) {
9244         StructCopy(--src, --dst, regnode);
9245 #ifdef RE_TRACK_PATTERN_OFFSETS
9246         if (RExC_offsets) {     /* MJD 20010112 */
9247             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9248                   "reg_insert",
9249                   __LINE__,
9250                   PL_reg_name[op],
9251                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9252                     ? "Overwriting end of array!\n" : "OK",
9253                   (UV)(src - RExC_emit_start),
9254                   (UV)(dst - RExC_emit_start),
9255                   (UV)RExC_offsets[0]));
9256             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9257             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9258         }
9259 #endif
9260     }
9261
9262
9263     place = opnd;               /* Op node, where operand used to be. */
9264 #ifdef RE_TRACK_PATTERN_OFFSETS
9265     if (RExC_offsets) {         /* MJD */
9266         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9267               "reginsert",
9268               __LINE__,
9269               PL_reg_name[op],
9270               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9271               ? "Overwriting end of array!\n" : "OK",
9272               (UV)(place - RExC_emit_start),
9273               (UV)(RExC_parse - RExC_start),
9274               (UV)RExC_offsets[0]));
9275         Set_Node_Offset(place, RExC_parse);
9276         Set_Node_Length(place, 1);
9277     }
9278 #endif
9279     src = NEXTOPER(place);
9280     FILL_ADVANCE_NODE(place, op);
9281     Zero(src, offset, regnode);
9282 }
9283
9284 /*
9285 - regtail - set the next-pointer at the end of a node chain of p to val.
9286 - SEE ALSO: regtail_study
9287 */
9288 /* TODO: All three parms should be const */
9289 STATIC void
9290 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9291 {
9292     dVAR;
9293     register regnode *scan;
9294     GET_RE_DEBUG_FLAGS_DECL;
9295
9296     PERL_ARGS_ASSERT_REGTAIL;
9297 #ifndef DEBUGGING
9298     PERL_UNUSED_ARG(depth);
9299 #endif
9300
9301     if (SIZE_ONLY)
9302         return;
9303
9304     /* Find last node. */
9305     scan = p;
9306     for (;;) {
9307         regnode * const temp = regnext(scan);
9308         DEBUG_PARSE_r({
9309             SV * const mysv=sv_newmortal();
9310             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9311             regprop(RExC_rx, mysv, scan);
9312             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9313                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9314                     (temp == NULL ? "->" : ""),
9315                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9316             );
9317         });
9318         if (temp == NULL)
9319             break;
9320         scan = temp;
9321     }
9322
9323     if (reg_off_by_arg[OP(scan)]) {
9324         ARG_SET(scan, val - scan);
9325     }
9326     else {
9327         NEXT_OFF(scan) = val - scan;
9328     }
9329 }
9330
9331 #ifdef DEBUGGING
9332 /*
9333 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9334 - Look for optimizable sequences at the same time.
9335 - currently only looks for EXACT chains.
9336
9337 This is expermental code. The idea is to use this routine to perform
9338 in place optimizations on branches and groups as they are constructed,
9339 with the long term intention of removing optimization from study_chunk so
9340 that it is purely analytical.
9341
9342 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9343 to control which is which.
9344
9345 */
9346 /* TODO: All four parms should be const */
9347
9348 STATIC U8
9349 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9350 {
9351     dVAR;
9352     register regnode *scan;
9353     U8 exact = PSEUDO;
9354 #ifdef EXPERIMENTAL_INPLACESCAN
9355     I32 min = 0;
9356 #endif
9357     GET_RE_DEBUG_FLAGS_DECL;
9358
9359     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9360
9361
9362     if (SIZE_ONLY)
9363         return exact;
9364
9365     /* Find last node. */
9366
9367     scan = p;
9368     for (;;) {
9369         regnode * const temp = regnext(scan);
9370 #ifdef EXPERIMENTAL_INPLACESCAN
9371         if (PL_regkind[OP(scan)] == EXACT)
9372             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9373                 return EXACT;
9374 #endif
9375         if ( exact ) {
9376             switch (OP(scan)) {
9377                 case EXACT:
9378                 case EXACTF:
9379                 case EXACTFU:
9380                 case EXACTFL:
9381                         if( exact == PSEUDO )
9382                             exact= OP(scan);
9383                         else if ( exact != OP(scan) )
9384                             exact= 0;
9385                 case NOTHING:
9386                     break;
9387                 default:
9388                     exact= 0;
9389             }
9390         }
9391         DEBUG_PARSE_r({
9392             SV * const mysv=sv_newmortal();
9393             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9394             regprop(RExC_rx, mysv, scan);
9395             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9396                 SvPV_nolen_const(mysv),
9397                 REG_NODE_NUM(scan),
9398                 PL_reg_name[exact]);
9399         });
9400         if (temp == NULL)
9401             break;
9402         scan = temp;
9403     }
9404     DEBUG_PARSE_r({
9405         SV * const mysv_val=sv_newmortal();
9406         DEBUG_PARSE_MSG("");
9407         regprop(RExC_rx, mysv_val, val);
9408         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9409                       SvPV_nolen_const(mysv_val),
9410                       (IV)REG_NODE_NUM(val),
9411                       (IV)(val - scan)
9412         );
9413     });
9414     if (reg_off_by_arg[OP(scan)]) {
9415         ARG_SET(scan, val - scan);
9416     }
9417     else {
9418         NEXT_OFF(scan) = val - scan;
9419     }
9420
9421     return exact;
9422 }
9423 #endif
9424
9425 /*
9426  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9427  */
9428 #ifdef DEBUGGING
9429 static void
9430 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9431 {
9432     int bit;
9433     int set=0;
9434
9435     for (bit=0; bit<32; bit++) {
9436         if (flags & (1<<bit)) {
9437             if (!set++ && lead)
9438                 PerlIO_printf(Perl_debug_log, "%s",lead);
9439             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9440         }
9441     }
9442     if (lead)  {
9443         if (set)
9444             PerlIO_printf(Perl_debug_log, "\n");
9445         else
9446             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9447     }
9448 }
9449 #endif
9450
9451 void
9452 Perl_regdump(pTHX_ const regexp *r)
9453 {
9454 #ifdef DEBUGGING
9455     dVAR;
9456     SV * const sv = sv_newmortal();
9457     SV *dsv= sv_newmortal();
9458     RXi_GET_DECL(r,ri);
9459     GET_RE_DEBUG_FLAGS_DECL;
9460
9461     PERL_ARGS_ASSERT_REGDUMP;
9462
9463     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9464
9465     /* Header fields of interest. */
9466     if (r->anchored_substr) {
9467         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9468             RE_SV_DUMPLEN(r->anchored_substr), 30);
9469         PerlIO_printf(Perl_debug_log,
9470                       "anchored %s%s at %"IVdf" ",
9471                       s, RE_SV_TAIL(r->anchored_substr),
9472                       (IV)r->anchored_offset);
9473     } else if (r->anchored_utf8) {
9474         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9475             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9476         PerlIO_printf(Perl_debug_log,
9477                       "anchored utf8 %s%s at %"IVdf" ",
9478                       s, RE_SV_TAIL(r->anchored_utf8),
9479                       (IV)r->anchored_offset);
9480     }
9481     if (r->float_substr) {
9482         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9483             RE_SV_DUMPLEN(r->float_substr), 30);
9484         PerlIO_printf(Perl_debug_log,
9485                       "floating %s%s at %"IVdf"..%"UVuf" ",
9486                       s, RE_SV_TAIL(r->float_substr),
9487                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9488     } else if (r->float_utf8) {
9489         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9490             RE_SV_DUMPLEN(r->float_utf8), 30);
9491         PerlIO_printf(Perl_debug_log,
9492                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9493                       s, RE_SV_TAIL(r->float_utf8),
9494                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9495     }
9496     if (r->check_substr || r->check_utf8)
9497         PerlIO_printf(Perl_debug_log,
9498                       (const char *)
9499                       (r->check_substr == r->float_substr
9500                        && r->check_utf8 == r->float_utf8
9501                        ? "(checking floating" : "(checking anchored"));
9502     if (r->extflags & RXf_NOSCAN)
9503         PerlIO_printf(Perl_debug_log, " noscan");
9504     if (r->extflags & RXf_CHECK_ALL)
9505         PerlIO_printf(Perl_debug_log, " isall");
9506     if (r->check_substr || r->check_utf8)
9507         PerlIO_printf(Perl_debug_log, ") ");
9508
9509     if (ri->regstclass) {
9510         regprop(r, sv, ri->regstclass);
9511         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9512     }
9513     if (r->extflags & RXf_ANCH) {
9514         PerlIO_printf(Perl_debug_log, "anchored");
9515         if (r->extflags & RXf_ANCH_BOL)
9516             PerlIO_printf(Perl_debug_log, "(BOL)");
9517         if (r->extflags & RXf_ANCH_MBOL)
9518             PerlIO_printf(Perl_debug_log, "(MBOL)");
9519         if (r->extflags & RXf_ANCH_SBOL)
9520             PerlIO_printf(Perl_debug_log, "(SBOL)");
9521         if (r->extflags & RXf_ANCH_GPOS)
9522             PerlIO_printf(Perl_debug_log, "(GPOS)");
9523         PerlIO_putc(Perl_debug_log, ' ');
9524     }
9525     if (r->extflags & RXf_GPOS_SEEN)
9526         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9527     if (r->intflags & PREGf_SKIP)
9528         PerlIO_printf(Perl_debug_log, "plus ");
9529     if (r->intflags & PREGf_IMPLICIT)
9530         PerlIO_printf(Perl_debug_log, "implicit ");
9531     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9532     if (r->extflags & RXf_EVAL_SEEN)
9533         PerlIO_printf(Perl_debug_log, "with eval ");
9534     PerlIO_printf(Perl_debug_log, "\n");
9535     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9536 #else
9537     PERL_ARGS_ASSERT_REGDUMP;
9538     PERL_UNUSED_CONTEXT;
9539     PERL_UNUSED_ARG(r);
9540 #endif  /* DEBUGGING */
9541 }
9542
9543 /*
9544 - regprop - printable representation of opcode
9545 */
9546 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9547 STMT_START { \
9548         if (do_sep) {                           \
9549             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9550             if (flags & ANYOF_INVERT)           \
9551                 /*make sure the invert info is in each */ \
9552                 sv_catpvs(sv, "^");             \
9553             do_sep = 0;                         \
9554         }                                       \
9555 } STMT_END
9556
9557 void
9558 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9559 {
9560 #ifdef DEBUGGING
9561     dVAR;
9562     register int k;
9563     RXi_GET_DECL(prog,progi);
9564     GET_RE_DEBUG_FLAGS_DECL;
9565
9566     PERL_ARGS_ASSERT_REGPROP;
9567
9568     sv_setpvs(sv, "");
9569
9570     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9571         /* It would be nice to FAIL() here, but this may be called from
9572            regexec.c, and it would be hard to supply pRExC_state. */
9573         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9574     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9575
9576     k = PL_regkind[OP(o)];
9577
9578     if (k == EXACT) {
9579         sv_catpvs(sv, " ");
9580         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9581          * is a crude hack but it may be the best for now since
9582          * we have no flag "this EXACTish node was UTF-8"
9583          * --jhi */
9584         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9585                   PERL_PV_ESCAPE_UNI_DETECT |
9586                   PERL_PV_PRETTY_ELLIPSES   |
9587                   PERL_PV_PRETTY_LTGT       |
9588                   PERL_PV_PRETTY_NOCLEAR
9589                   );
9590     } else if (k == TRIE) {
9591         /* print the details of the trie in dumpuntil instead, as
9592          * progi->data isn't available here */
9593         const char op = OP(o);
9594         const U32 n = ARG(o);
9595         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9596                (reg_ac_data *)progi->data->data[n] :
9597                NULL;
9598         const reg_trie_data * const trie
9599             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9600
9601         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9602         DEBUG_TRIE_COMPILE_r(
9603             Perl_sv_catpvf(aTHX_ sv,
9604                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9605                 (UV)trie->startstate,
9606                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9607                 (UV)trie->wordcount,
9608                 (UV)trie->minlen,
9609                 (UV)trie->maxlen,
9610                 (UV)TRIE_CHARCOUNT(trie),
9611                 (UV)trie->uniquecharcount
9612             )
9613         );
9614         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9615             int i;
9616             int rangestart = -1;
9617             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9618             sv_catpvs(sv, "[");
9619             for (i = 0; i <= 256; i++) {
9620                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9621                     if (rangestart == -1)
9622                         rangestart = i;
9623                 } else if (rangestart != -1) {
9624                     if (i <= rangestart + 3)
9625                         for (; rangestart < i; rangestart++)
9626                             put_byte(sv, rangestart);
9627                     else {
9628                         put_byte(sv, rangestart);
9629                         sv_catpvs(sv, "-");
9630                         put_byte(sv, i - 1);
9631                     }
9632                     rangestart = -1;
9633                 }
9634             }
9635             sv_catpvs(sv, "]");
9636         }
9637
9638     } else if (k == CURLY) {
9639         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9640             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9641         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9642     }
9643     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9644         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9645     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9646         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9647         if ( RXp_PAREN_NAMES(prog) ) {
9648             if ( k != REF || (OP(o) < NREF)) {
9649                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9650                 SV **name= av_fetch(list, ARG(o), 0 );
9651                 if (name)
9652                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9653             }
9654             else {
9655                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9656                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9657                 I32 *nums=(I32*)SvPVX(sv_dat);
9658                 SV **name= av_fetch(list, nums[0], 0 );
9659                 I32 n;
9660                 if (name) {
9661                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9662                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9663                                     (n ? "," : ""), (IV)nums[n]);
9664                     }
9665                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9666                 }
9667             }
9668         }
9669     } else if (k == GOSUB)
9670         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9671     else if (k == VERB) {
9672         if (!o->flags)
9673             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9674                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9675     } else if (k == LOGICAL)
9676         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9677     else if (k == FOLDCHAR)
9678         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9679     else if (k == ANYOF) {
9680         int i, rangestart = -1;
9681         const U8 flags = ANYOF_FLAGS(o);
9682         int do_sep = 0;
9683
9684         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9685         static const char * const anyofs[] = {
9686             "\\w",
9687             "\\W",
9688             "\\s",
9689             "\\S",
9690             "\\d",
9691             "\\D",
9692             "[:alnum:]",
9693             "[:^alnum:]",
9694             "[:alpha:]",
9695             "[:^alpha:]",
9696             "[:ascii:]",
9697             "[:^ascii:]",
9698             "[:cntrl:]",
9699             "[:^cntrl:]",
9700             "[:graph:]",
9701             "[:^graph:]",
9702             "[:lower:]",
9703             "[:^lower:]",
9704             "[:print:]",
9705             "[:^print:]",
9706             "[:punct:]",
9707             "[:^punct:]",
9708             "[:upper:]",
9709             "[:^upper:]",
9710             "[:xdigit:]",
9711             "[:^xdigit:]",
9712             "[:space:]",
9713             "[:^space:]",
9714             "[:blank:]",
9715             "[:^blank:]"
9716         };
9717
9718         if (flags & ANYOF_LOCALE)
9719             sv_catpvs(sv, "{loc}");
9720         if (flags & ANYOF_FOLD)
9721             sv_catpvs(sv, "{i}");
9722         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9723         if (flags & ANYOF_INVERT)
9724             sv_catpvs(sv, "^");
9725
9726         /* output what the standard cp 0-255 bitmap matches */
9727         for (i = 0; i <= 256; i++) {
9728             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9729                 if (rangestart == -1)
9730                     rangestart = i;
9731             } else if (rangestart != -1) {
9732                 if (i <= rangestart + 3)
9733                     for (; rangestart < i; rangestart++)
9734                         put_byte(sv, rangestart);
9735                 else {
9736                     put_byte(sv, rangestart);
9737                     sv_catpvs(sv, "-");
9738                     put_byte(sv, i - 1);
9739                 }
9740                 do_sep = 1;
9741                 rangestart = -1;
9742             }
9743         }
9744
9745         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9746         /* output any special charclass tests (used entirely under use locale) */
9747         if (ANYOF_CLASS_TEST_ANY_SET(o))
9748             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9749                 if (ANYOF_CLASS_TEST(o,i)) {
9750                     sv_catpv(sv, anyofs[i]);
9751                     do_sep = 1;
9752                 }
9753
9754         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9755
9756         /* output information about the unicode matching */
9757         if (flags & ANYOF_UNICODE_ALL)
9758             sv_catpvs(sv, "{unicode_all}");
9759         else if (flags & ANYOF_UTF8)
9760             sv_catpvs(sv, "{unicode}");
9761         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9762             sv_catpvs(sv, "{outside bitmap}");
9763
9764         {
9765             SV *lv;
9766             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9767
9768             if (lv) {
9769                 if (sw) {
9770                     U8 s[UTF8_MAXBYTES_CASE+1];
9771
9772                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9773                         uvchr_to_utf8(s, i);
9774
9775                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9776                             if (rangestart == -1)
9777                                 rangestart = i;
9778                         } else if (rangestart != -1) {
9779                             if (i <= rangestart + 3)
9780                                 for (; rangestart < i; rangestart++) {
9781                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9782                                     U8 *p;
9783                                     for(p = s; p < e; p++)
9784                                         put_byte(sv, *p);
9785                                 }
9786                             else {
9787                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9788                                 U8 *p;
9789                                 for (p = s; p < e; p++)
9790                                     put_byte(sv, *p);
9791                                 sv_catpvs(sv, "-");
9792                                 e = uvchr_to_utf8(s, i-1);
9793                                 for (p = s; p < e; p++)
9794                                     put_byte(sv, *p);
9795                                 }
9796                                 rangestart = -1;
9797                             }
9798                         }
9799
9800                     sv_catpvs(sv, "..."); /* et cetera */
9801                 }
9802
9803                 {
9804                     char *s = savesvpv(lv);
9805                     char * const origs = s;
9806
9807                     while (*s && *s != '\n')
9808                         s++;
9809
9810                     if (*s == '\n') {
9811                         const char * const t = ++s;
9812
9813                         while (*s) {
9814                             if (*s == '\n')
9815                                 *s = ' ';
9816                             s++;
9817                         }
9818                         if (s[-1] == ' ')
9819                             s[-1] = 0;
9820
9821                         sv_catpv(sv, t);
9822                     }
9823
9824                     Safefree(origs);
9825                 }
9826             }
9827         }
9828
9829         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9830     }
9831     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9832         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9833 #else
9834     PERL_UNUSED_CONTEXT;
9835     PERL_UNUSED_ARG(sv);
9836     PERL_UNUSED_ARG(o);
9837     PERL_UNUSED_ARG(prog);
9838 #endif  /* DEBUGGING */
9839 }
9840
9841 SV *
9842 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9843 {                               /* Assume that RE_INTUIT is set */
9844     dVAR;
9845     struct regexp *const prog = (struct regexp *)SvANY(r);
9846     GET_RE_DEBUG_FLAGS_DECL;
9847
9848     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9849     PERL_UNUSED_CONTEXT;
9850
9851     DEBUG_COMPILE_r(
9852         {
9853             const char * const s = SvPV_nolen_const(prog->check_substr
9854                       ? prog->check_substr : prog->check_utf8);
9855
9856             if (!PL_colorset) reginitcolors();
9857             PerlIO_printf(Perl_debug_log,
9858                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9859                       PL_colors[4],
9860                       prog->check_substr ? "" : "utf8 ",
9861                       PL_colors[5],PL_colors[0],
9862                       s,
9863                       PL_colors[1],
9864                       (strlen(s) > 60 ? "..." : ""));
9865         } );
9866
9867     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9868 }
9869
9870 /*
9871    pregfree()
9872
9873    handles refcounting and freeing the perl core regexp structure. When
9874    it is necessary to actually free the structure the first thing it
9875    does is call the 'free' method of the regexp_engine associated to
9876    the regexp, allowing the handling of the void *pprivate; member
9877    first. (This routine is not overridable by extensions, which is why
9878    the extensions free is called first.)
9879
9880    See regdupe and regdupe_internal if you change anything here.
9881 */
9882 #ifndef PERL_IN_XSUB_RE
9883 void
9884 Perl_pregfree(pTHX_ REGEXP *r)
9885 {
9886     SvREFCNT_dec(r);
9887 }
9888
9889 void
9890 Perl_pregfree2(pTHX_ REGEXP *rx)
9891 {
9892     dVAR;
9893     struct regexp *const r = (struct regexp *)SvANY(rx);
9894     GET_RE_DEBUG_FLAGS_DECL;
9895
9896     PERL_ARGS_ASSERT_PREGFREE2;
9897
9898     if (r->mother_re) {
9899         ReREFCNT_dec(r->mother_re);
9900     } else {
9901         CALLREGFREE_PVT(rx); /* free the private data */
9902         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9903     }
9904     if (r->substrs) {
9905         SvREFCNT_dec(r->anchored_substr);
9906         SvREFCNT_dec(r->anchored_utf8);
9907         SvREFCNT_dec(r->float_substr);
9908         SvREFCNT_dec(r->float_utf8);
9909         Safefree(r->substrs);
9910     }
9911     RX_MATCH_COPY_FREE(rx);
9912 #ifdef PERL_OLD_COPY_ON_WRITE
9913     SvREFCNT_dec(r->saved_copy);
9914 #endif
9915     Safefree(r->offs);
9916 }
9917
9918 /*  reg_temp_copy()
9919
9920     This is a hacky workaround to the structural issue of match results
9921     being stored in the regexp structure which is in turn stored in
9922     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9923     could be PL_curpm in multiple contexts, and could require multiple
9924     result sets being associated with the pattern simultaneously, such
9925     as when doing a recursive match with (??{$qr})
9926
9927     The solution is to make a lightweight copy of the regexp structure
9928     when a qr// is returned from the code executed by (??{$qr}) this
9929     lightweight copy doesnt actually own any of its data except for
9930     the starp/end and the actual regexp structure itself.
9931
9932 */
9933
9934
9935 REGEXP *
9936 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9937 {
9938     struct regexp *ret;
9939     struct regexp *const r = (struct regexp *)SvANY(rx);
9940     register const I32 npar = r->nparens+1;
9941
9942     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9943
9944     if (!ret_x)
9945         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9946     ret = (struct regexp *)SvANY(ret_x);
9947
9948     (void)ReREFCNT_inc(rx);
9949     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9950        by pointing directly at the buffer, but flagging that the allocated
9951        space in the copy is zero. As we've just done a struct copy, it's now
9952        a case of zero-ing that, rather than copying the current length.  */
9953     SvPV_set(ret_x, RX_WRAPPED(rx));
9954     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9955     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9956            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9957     SvLEN_set(ret_x, 0);
9958     SvSTASH_set(ret_x, NULL);
9959     SvMAGIC_set(ret_x, NULL);
9960     Newx(ret->offs, npar, regexp_paren_pair);
9961     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9962     if (r->substrs) {
9963         Newx(ret->substrs, 1, struct reg_substr_data);
9964         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9965
9966         SvREFCNT_inc_void(ret->anchored_substr);
9967         SvREFCNT_inc_void(ret->anchored_utf8);
9968         SvREFCNT_inc_void(ret->float_substr);
9969         SvREFCNT_inc_void(ret->float_utf8);
9970
9971         /* check_substr and check_utf8, if non-NULL, point to either their
9972            anchored or float namesakes, and don't hold a second reference.  */
9973     }
9974     RX_MATCH_COPIED_off(ret_x);
9975 #ifdef PERL_OLD_COPY_ON_WRITE
9976     ret->saved_copy = NULL;
9977 #endif
9978     ret->mother_re = rx;
9979
9980     return ret_x;
9981 }
9982 #endif
9983
9984 /* regfree_internal()
9985
9986    Free the private data in a regexp. This is overloadable by
9987    extensions. Perl takes care of the regexp structure in pregfree(),
9988    this covers the *pprivate pointer which technically perl doesn't
9989    know about, however of course we have to handle the
9990    regexp_internal structure when no extension is in use.
9991
9992    Note this is called before freeing anything in the regexp
9993    structure.
9994  */
9995
9996 void
9997 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9998 {
9999     dVAR;
10000     struct regexp *const r = (struct regexp *)SvANY(rx);
10001     RXi_GET_DECL(r,ri);
10002     GET_RE_DEBUG_FLAGS_DECL;
10003
10004     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10005
10006     DEBUG_COMPILE_r({
10007         if (!PL_colorset)
10008             reginitcolors();
10009         {
10010             SV *dsv= sv_newmortal();
10011             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10012                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10013             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10014                 PL_colors[4],PL_colors[5],s);
10015         }
10016     });
10017 #ifdef RE_TRACK_PATTERN_OFFSETS
10018     if (ri->u.offsets)
10019         Safefree(ri->u.offsets);             /* 20010421 MJD */
10020 #endif
10021     if (ri->data) {
10022         int n = ri->data->count;
10023         PAD* new_comppad = NULL;
10024         PAD* old_comppad;
10025         PADOFFSET refcnt;
10026
10027         while (--n >= 0) {
10028           /* If you add a ->what type here, update the comment in regcomp.h */
10029             switch (ri->data->what[n]) {
10030             case 'a':
10031             case 's':
10032             case 'S':
10033             case 'u':
10034                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10035                 break;
10036             case 'f':
10037                 Safefree(ri->data->data[n]);
10038                 break;
10039             case 'p':
10040                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10041                 break;
10042             case 'o':
10043                 if (new_comppad == NULL)
10044                     Perl_croak(aTHX_ "panic: pregfree comppad");
10045                 PAD_SAVE_LOCAL(old_comppad,
10046                     /* Watch out for global destruction's random ordering. */
10047                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10048                 );
10049                 OP_REFCNT_LOCK;
10050                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10051                 OP_REFCNT_UNLOCK;
10052                 if (!refcnt)
10053                     op_free((OP_4tree*)ri->data->data[n]);
10054
10055                 PAD_RESTORE_LOCAL(old_comppad);
10056                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10057                 new_comppad = NULL;
10058                 break;
10059             case 'n':
10060                 break;
10061             case 'T':
10062                 { /* Aho Corasick add-on structure for a trie node.
10063                      Used in stclass optimization only */
10064                     U32 refcount;
10065                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10066                     OP_REFCNT_LOCK;
10067                     refcount = --aho->refcount;
10068                     OP_REFCNT_UNLOCK;
10069                     if ( !refcount ) {
10070                         PerlMemShared_free(aho->states);
10071                         PerlMemShared_free(aho->fail);
10072                          /* do this last!!!! */
10073                         PerlMemShared_free(ri->data->data[n]);
10074                         PerlMemShared_free(ri->regstclass);
10075                     }
10076                 }
10077                 break;
10078             case 't':
10079                 {
10080                     /* trie structure. */
10081                     U32 refcount;
10082                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10083                     OP_REFCNT_LOCK;
10084                     refcount = --trie->refcount;
10085                     OP_REFCNT_UNLOCK;
10086                     if ( !refcount ) {
10087                         PerlMemShared_free(trie->charmap);
10088                         PerlMemShared_free(trie->states);
10089                         PerlMemShared_free(trie->trans);
10090                         if (trie->bitmap)
10091                             PerlMemShared_free(trie->bitmap);
10092                         if (trie->jump)
10093                             PerlMemShared_free(trie->jump);
10094                         PerlMemShared_free(trie->wordinfo);
10095                         /* do this last!!!! */
10096                         PerlMemShared_free(ri->data->data[n]);
10097                     }
10098                 }
10099                 break;
10100             default:
10101                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10102             }
10103         }
10104         Safefree(ri->data->what);
10105         Safefree(ri->data);
10106     }
10107
10108     Safefree(ri);
10109 }
10110
10111 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10112 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10113 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10114
10115 /*
10116    re_dup - duplicate a regexp.
10117
10118    This routine is expected to clone a given regexp structure. It is only
10119    compiled under USE_ITHREADS.
10120
10121    After all of the core data stored in struct regexp is duplicated
10122    the regexp_engine.dupe method is used to copy any private data
10123    stored in the *pprivate pointer. This allows extensions to handle
10124    any duplication it needs to do.
10125
10126    See pregfree() and regfree_internal() if you change anything here.
10127 */
10128 #if defined(USE_ITHREADS)
10129 #ifndef PERL_IN_XSUB_RE
10130 void
10131 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10132 {
10133     dVAR;
10134     I32 npar;
10135     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10136     struct regexp *ret = (struct regexp *)SvANY(dstr);
10137
10138     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10139
10140     npar = r->nparens+1;
10141     Newx(ret->offs, npar, regexp_paren_pair);
10142     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10143     if(ret->swap) {
10144         /* no need to copy these */
10145         Newx(ret->swap, npar, regexp_paren_pair);
10146     }
10147
10148     if (ret->substrs) {
10149         /* Do it this way to avoid reading from *r after the StructCopy().
10150            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10151            cache, it doesn't matter.  */
10152         const bool anchored = r->check_substr
10153             ? r->check_substr == r->anchored_substr
10154             : r->check_utf8 == r->anchored_utf8;
10155         Newx(ret->substrs, 1, struct reg_substr_data);
10156         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10157
10158         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10159         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10160         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10161         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10162
10163         /* check_substr and check_utf8, if non-NULL, point to either their
10164            anchored or float namesakes, and don't hold a second reference.  */
10165
10166         if (ret->check_substr) {
10167             if (anchored) {
10168                 assert(r->check_utf8 == r->anchored_utf8);
10169                 ret->check_substr = ret->anchored_substr;
10170                 ret->check_utf8 = ret->anchored_utf8;
10171             } else {
10172                 assert(r->check_substr == r->float_substr);
10173                 assert(r->check_utf8 == r->float_utf8);
10174                 ret->check_substr = ret->float_substr;
10175                 ret->check_utf8 = ret->float_utf8;
10176             }
10177         } else if (ret->check_utf8) {
10178             if (anchored) {
10179                 ret->check_utf8 = ret->anchored_utf8;
10180             } else {
10181                 ret->check_utf8 = ret->float_utf8;
10182             }
10183         }
10184     }
10185
10186     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10187
10188     if (ret->pprivate)
10189         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10190
10191     if (RX_MATCH_COPIED(dstr))
10192         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10193     else
10194         ret->subbeg = NULL;
10195 #ifdef PERL_OLD_COPY_ON_WRITE
10196     ret->saved_copy = NULL;
10197 #endif
10198
10199     if (ret->mother_re) {
10200         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10201             /* Our storage points directly to our mother regexp, but that's
10202                1: a buffer in a different thread
10203                2: something we no longer hold a reference on
10204                so we need to copy it locally.  */
10205             /* Note we need to sue SvCUR() on our mother_re, because it, in
10206                turn, may well be pointing to its own mother_re.  */
10207             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10208                                    SvCUR(ret->mother_re)+1));
10209             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10210         }
10211         ret->mother_re      = NULL;
10212     }
10213     ret->gofs = 0;
10214 }
10215 #endif /* PERL_IN_XSUB_RE */
10216
10217 /*
10218    regdupe_internal()
10219
10220    This is the internal complement to regdupe() which is used to copy
10221    the structure pointed to by the *pprivate pointer in the regexp.
10222    This is the core version of the extension overridable cloning hook.
10223    The regexp structure being duplicated will be copied by perl prior
10224    to this and will be provided as the regexp *r argument, however
10225    with the /old/ structures pprivate pointer value. Thus this routine
10226    may override any copying normally done by perl.
10227
10228    It returns a pointer to the new regexp_internal structure.
10229 */
10230
10231 void *
10232 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10233 {
10234     dVAR;
10235     struct regexp *const r = (struct regexp *)SvANY(rx);
10236     regexp_internal *reti;
10237     int len, npar;
10238     RXi_GET_DECL(r,ri);
10239
10240     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10241
10242     npar = r->nparens+1;
10243     len = ProgLen(ri);
10244
10245     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10246     Copy(ri->program, reti->program, len+1, regnode);
10247
10248
10249     reti->regstclass = NULL;
10250
10251     if (ri->data) {
10252         struct reg_data *d;
10253         const int count = ri->data->count;
10254         int i;
10255
10256         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10257                 char, struct reg_data);
10258         Newx(d->what, count, U8);
10259
10260         d->count = count;
10261         for (i = 0; i < count; i++) {
10262             d->what[i] = ri->data->what[i];
10263             switch (d->what[i]) {
10264                 /* legal options are one of: sSfpontTua
10265                    see also regcomp.h and pregfree() */
10266             case 'a': /* actually an AV, but the dup function is identical.  */
10267             case 's':
10268             case 'S':
10269             case 'p': /* actually an AV, but the dup function is identical.  */
10270             case 'u': /* actually an HV, but the dup function is identical.  */
10271                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10272                 break;
10273             case 'f':
10274                 /* This is cheating. */
10275                 Newx(d->data[i], 1, struct regnode_charclass_class);
10276                 StructCopy(ri->data->data[i], d->data[i],
10277                             struct regnode_charclass_class);
10278                 reti->regstclass = (regnode*)d->data[i];
10279                 break;
10280             case 'o':
10281                 /* Compiled op trees are readonly and in shared memory,
10282                    and can thus be shared without duplication. */
10283                 OP_REFCNT_LOCK;
10284                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10285                 OP_REFCNT_UNLOCK;
10286                 break;
10287             case 'T':
10288                 /* Trie stclasses are readonly and can thus be shared
10289                  * without duplication. We free the stclass in pregfree
10290                  * when the corresponding reg_ac_data struct is freed.
10291                  */
10292                 reti->regstclass= ri->regstclass;
10293                 /* Fall through */
10294             case 't':
10295                 OP_REFCNT_LOCK;
10296                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10297                 OP_REFCNT_UNLOCK;
10298                 /* Fall through */
10299             case 'n':
10300                 d->data[i] = ri->data->data[i];
10301                 break;
10302             default:
10303                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10304             }
10305         }
10306
10307         reti->data = d;
10308     }
10309     else
10310         reti->data = NULL;
10311
10312     reti->name_list_idx = ri->name_list_idx;
10313
10314 #ifdef RE_TRACK_PATTERN_OFFSETS
10315     if (ri->u.offsets) {
10316         Newx(reti->u.offsets, 2*len+1, U32);
10317         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10318     }
10319 #else
10320     SetProgLen(reti,len);
10321 #endif
10322
10323     return (void*)reti;
10324 }
10325
10326 #endif    /* USE_ITHREADS */
10327
10328 #ifndef PERL_IN_XSUB_RE
10329
10330 /*
10331  - regnext - dig the "next" pointer out of a node
10332  */
10333 regnode *
10334 Perl_regnext(pTHX_ register regnode *p)
10335 {
10336     dVAR;
10337     register I32 offset;
10338
10339     if (!p)
10340         return(NULL);
10341
10342     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10343         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10344     }
10345
10346     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10347     if (offset == 0)
10348         return(NULL);
10349
10350     return(p+offset);
10351 }
10352 #endif
10353
10354 STATIC void
10355 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10356 {
10357     va_list args;
10358     STRLEN l1 = strlen(pat1);
10359     STRLEN l2 = strlen(pat2);
10360     char buf[512];
10361     SV *msv;
10362     const char *message;
10363
10364     PERL_ARGS_ASSERT_RE_CROAK2;
10365
10366     if (l1 > 510)
10367         l1 = 510;
10368     if (l1 + l2 > 510)
10369         l2 = 510 - l1;
10370     Copy(pat1, buf, l1 , char);
10371     Copy(pat2, buf + l1, l2 , char);
10372     buf[l1 + l2] = '\n';
10373     buf[l1 + l2 + 1] = '\0';
10374 #ifdef I_STDARG
10375     /* ANSI variant takes additional second argument */
10376     va_start(args, pat2);
10377 #else
10378     va_start(args);
10379 #endif
10380     msv = vmess(buf, &args);
10381     va_end(args);
10382     message = SvPV_const(msv,l1);
10383     if (l1 > 512)
10384         l1 = 512;
10385     Copy(message, buf, l1 , char);
10386     buf[l1-1] = '\0';                   /* Overwrite \n */
10387     Perl_croak(aTHX_ "%s", buf);
10388 }
10389
10390 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10391
10392 #ifndef PERL_IN_XSUB_RE
10393 void
10394 Perl_save_re_context(pTHX)
10395 {
10396     dVAR;
10397
10398     struct re_save_state *state;
10399
10400     SAVEVPTR(PL_curcop);
10401     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10402
10403     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10404     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10405     SSPUSHUV(SAVEt_RE_STATE);
10406
10407     Copy(&PL_reg_state, state, 1, struct re_save_state);
10408
10409     PL_reg_start_tmp = 0;
10410     PL_reg_start_tmpl = 0;
10411     PL_reg_oldsaved = NULL;
10412     PL_reg_oldsavedlen = 0;
10413     PL_reg_maxiter = 0;
10414     PL_reg_leftiter = 0;
10415     PL_reg_poscache = NULL;
10416     PL_reg_poscache_size = 0;
10417 #ifdef PERL_OLD_COPY_ON_WRITE
10418     PL_nrs = NULL;
10419 #endif
10420
10421     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10422     if (PL_curpm) {
10423         const REGEXP * const rx = PM_GETRE(PL_curpm);
10424         if (rx) {
10425             U32 i;
10426             for (i = 1; i <= RX_NPARENS(rx); i++) {
10427                 char digits[TYPE_CHARS(long)];
10428                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10429                 GV *const *const gvp
10430                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10431
10432                 if (gvp) {
10433                     GV * const gv = *gvp;
10434                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10435                         save_scalar(gv);
10436                 }
10437             }
10438         }
10439     }
10440 }
10441 #endif
10442
10443 static void
10444 clear_re(pTHX_ void *r)
10445 {
10446     dVAR;
10447     ReREFCNT_dec((REGEXP *)r);
10448 }
10449
10450 #ifdef DEBUGGING
10451
10452 STATIC void
10453 S_put_byte(pTHX_ SV *sv, int c)
10454 {
10455     PERL_ARGS_ASSERT_PUT_BYTE;
10456
10457     /* Our definition of isPRINT() ignores locales, so only bytes that are
10458        not part of UTF-8 are considered printable. I assume that the same
10459        holds for UTF-EBCDIC.
10460        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10461        which Wikipedia says:
10462
10463        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10464        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10465        identical, to the ASCII delete (DEL) or rubout control character.
10466        ) So the old condition can be simplified to !isPRINT(c)  */
10467     if (!isPRINT(c)) {
10468         if (c < 256) {
10469             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10470         }
10471         else {
10472             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10473         }
10474     }
10475     else {
10476         const char string = c;
10477         if (c == '-' || c == ']' || c == '\\' || c == '^')
10478             sv_catpvs(sv, "\\");
10479         sv_catpvn(sv, &string, 1);
10480     }
10481 }
10482
10483
10484 #define CLEAR_OPTSTART \
10485     if (optstart) STMT_START { \
10486             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10487             optstart=NULL; \
10488     } STMT_END
10489
10490 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10491
10492 STATIC const regnode *
10493 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10494             const regnode *last, const regnode *plast,
10495             SV* sv, I32 indent, U32 depth)
10496 {
10497     dVAR;
10498     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10499     register const regnode *next;
10500     const regnode *optstart= NULL;
10501
10502     RXi_GET_DECL(r,ri);
10503     GET_RE_DEBUG_FLAGS_DECL;
10504
10505     PERL_ARGS_ASSERT_DUMPUNTIL;
10506
10507 #ifdef DEBUG_DUMPUNTIL
10508     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10509         last ? last-start : 0,plast ? plast-start : 0);
10510 #endif
10511
10512     if (plast && plast < last)
10513         last= plast;
10514
10515     while (PL_regkind[op] != END && (!last || node < last)) {
10516         /* While that wasn't END last time... */
10517         NODE_ALIGN(node);
10518         op = OP(node);
10519         if (op == CLOSE || op == WHILEM)
10520             indent--;
10521         next = regnext((regnode *)node);
10522
10523         /* Where, what. */
10524         if (OP(node) == OPTIMIZED) {
10525             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10526                 optstart = node;
10527             else
10528                 goto after_print;
10529         } else
10530             CLEAR_OPTSTART;
10531
10532         regprop(r, sv, node);
10533         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10534                       (int)(2*indent + 1), "", SvPVX_const(sv));
10535
10536         if (OP(node) != OPTIMIZED) {
10537             if (next == NULL)           /* Next ptr. */
10538                 PerlIO_printf(Perl_debug_log, " (0)");
10539             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10540                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10541             else
10542                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10543             (void)PerlIO_putc(Perl_debug_log, '\n');
10544         }
10545
10546       after_print:
10547         if (PL_regkind[(U8)op] == BRANCHJ) {
10548             assert(next);
10549             {
10550                 register const regnode *nnode = (OP(next) == LONGJMP
10551                                              ? regnext((regnode *)next)
10552                                              : next);
10553                 if (last && nnode > last)
10554                     nnode = last;
10555                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10556             }
10557         }
10558         else if (PL_regkind[(U8)op] == BRANCH) {
10559             assert(next);
10560             DUMPUNTIL(NEXTOPER(node), next);
10561         }
10562         else if ( PL_regkind[(U8)op]  == TRIE ) {
10563             const regnode *this_trie = node;
10564             const char op = OP(node);
10565             const U32 n = ARG(node);
10566             const reg_ac_data * const ac = op>=AHOCORASICK ?
10567                (reg_ac_data *)ri->data->data[n] :
10568                NULL;
10569             const reg_trie_data * const trie =
10570                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10571 #ifdef DEBUGGING
10572             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10573 #endif
10574             const regnode *nextbranch= NULL;
10575             I32 word_idx;
10576             sv_setpvs(sv, "");
10577             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10578                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10579
10580                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10581                    (int)(2*(indent+3)), "",
10582                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10583                             PL_colors[0], PL_colors[1],
10584                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10585                             PERL_PV_PRETTY_ELLIPSES    |
10586                             PERL_PV_PRETTY_LTGT
10587                             )
10588                             : "???"
10589                 );
10590                 if (trie->jump) {
10591                     U16 dist= trie->jump[word_idx+1];
10592                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10593                                   (UV)((dist ? this_trie + dist : next) - start));
10594                     if (dist) {
10595                         if (!nextbranch)
10596                             nextbranch= this_trie + trie->jump[0];
10597                         DUMPUNTIL(this_trie + dist, nextbranch);
10598                     }
10599                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10600                         nextbranch= regnext((regnode *)nextbranch);
10601                 } else {
10602                     PerlIO_printf(Perl_debug_log, "\n");
10603                 }
10604             }
10605             if (last && next > last)
10606                 node= last;
10607             else
10608                 node= next;
10609         }
10610         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10611             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10612                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10613         }
10614         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10615             assert(next);
10616             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10617         }
10618         else if ( op == PLUS || op == STAR) {
10619             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10620         }
10621         else if (op == ANYOF) {
10622             /* arglen 1 + class block */
10623             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10624                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10625             node = NEXTOPER(node);
10626         }
10627         else if (PL_regkind[(U8)op] == EXACT) {
10628             /* Literal string, where present. */
10629             node += NODE_SZ_STR(node) - 1;
10630             node = NEXTOPER(node);
10631         }
10632         else {
10633             node = NEXTOPER(node);
10634             node += regarglen[(U8)op];
10635         }
10636         if (op == CURLYX || op == OPEN)
10637             indent++;
10638     }
10639     CLEAR_OPTSTART;
10640 #ifdef DEBUG_DUMPUNTIL
10641     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10642 #endif
10643     return node;
10644 }
10645
10646 #endif  /* DEBUGGING */
10647
10648 /*
10649  * Local variables:
10650  * c-indentation-style: bsd
10651  * c-basic-offset: 4
10652  * indent-tabs-mode: t
10653  * End:
10654  *
10655  * ex: set ts=8 sts=4 sw=4 noet:
10656  */