regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 235 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 236
 237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 238
 239 /* whether trie related optimizations are enabled */
 240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 241 #define TRIE_STUDY_OPT
 242 #define FULL_TRIE_STUDY
 243 #define TRIE_STCLASS
 244 #endif
 245
 246
 247
 248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 249 #define PBITVAL(paren) (1 << ((paren) & 7))
 250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 253
 254 /* If not already in utf8, do a longjmp back to the beginning */
 255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 256 #define REQUIRE_UTF8    STMT_START {                                       \
 257                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 258                         } STMT_END
 259
 260 /* This converts the named class defined in regcomp.h to its equivalent class
 261  * number defined in handy.h. */
 262 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 263 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 264
 265 /* About scan_data_t.
 266
 267   During optimisation we recurse through the regexp program performing
 268   various inplace (keyhole style) optimisations. In addition study_chunk
 269   and scan_commit populate this data structure with information about
 270   what strings MUST appear in the pattern. We look for the longest
 271   string that must appear at a fixed location, and we look for the
 272   longest string that may appear at a floating location. So for instance
 273   in the pattern:
 274
 275     /FOO[xX]A.*B[xX]BAR/
 276
 277   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 278   strings (because they follow a .* construct). study_chunk will identify
 279   both FOO and BAR as being the longest fixed and floating strings respectively.
 280
 281   The strings can be composites, for instance
 282
 283      /(f)(o)(o)/
 284
 285   will result in a composite fixed substring 'foo'.
 286
 287   For each string some basic information is maintained:
 288
 289   - offset or min_offset
 290     This is the position the string must appear at, or not before.
 291     It also implicitly (when combined with minlenp) tells us how many
 292     characters must match before the string we are searching for.
 293     Likewise when combined with minlenp and the length of the string it
 294     tells us how many characters must appear after the string we have
 295     found.
 296
 297   - max_offset
 298     Only used for floating strings. This is the rightmost point that
 299     the string can appear at. If set to I32 max it indicates that the
 300     string can occur infinitely far to the right.
 301
 302   - minlenp
 303     A pointer to the minimum number of characters of the pattern that the
 304     string was found inside. This is important as in the case of positive
 305     lookahead or positive lookbehind we can have multiple patterns
 306     involved. Consider
 307
 308     /(?=FOO).*F/
 309
 310     The minimum length of the pattern overall is 3, the minimum length
 311     of the lookahead part is 3, but the minimum length of the part that
 312     will actually match is 1. So 'FOO's minimum length is 3, but the
 313     minimum length for the F is 1. This is important as the minimum length
 314     is used to determine offsets in front of and behind the string being
 315     looked for.  Since strings can be composites this is the length of the
 316     pattern at the time it was committed with a scan_commit. Note that
 317     the length is calculated by study_chunk, so that the minimum lengths
 318     are not known until the full pattern has been compiled, thus the
 319     pointer to the value.
 320
 321   - lookbehind
 322
 323     In the case of lookbehind the string being searched for can be
 324     offset past the start point of the final matching string.
 325     If this value was just blithely removed from the min_offset it would
 326     invalidate some of the calculations for how many chars must match
 327     before or after (as they are derived from min_offset and minlen and
 328     the length of the string being searched for).
 329     When the final pattern is compiled and the data is moved from the
 330     scan_data_t structure into the regexp structure the information
 331     about lookbehind is factored in, with the information that would
 332     have been lost precalculated in the end_shift field for the
 333     associated string.
 334
 335   The fields pos_min and pos_delta are used to store the minimum offset
 336   and the delta to the maximum offset at the current point in the pattern.
 337
 338 */
 339
 340 typedef struct scan_data_t {
 341     /*I32 len_min;      unused */
 342     /*I32 len_delta;    unused */
 343     I32 pos_min;
 344     I32 pos_delta;
 345     SV *last_found;
 346     I32 last_end;           /* min value, <0 unless valid. */
 347     I32 last_start_min;
 348     I32 last_start_max;
 349     SV **longest;           /* Either &l_fixed, or &l_float. */
 350     SV *longest_fixed;      /* longest fixed string found in pattern */
 351     I32 offset_fixed;       /* offset where it starts */
 352     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 353     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 354     SV *longest_float;      /* longest floating string found in pattern */
 355     I32 offset_float_min;   /* earliest point in string it can appear */
 356     I32 offset_float_max;   /* latest point in string it can appear */
 357     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_float;   /* is the position of the string modified by LB */
 359     I32 flags;
 360     I32 whilem_c;
 361     I32 *last_closep;
 362     struct regnode_charclass_class *start_class;
 363 } scan_data_t;
 364
 365 /*
 366  * Forward declarations for pregcomp()'s friends.
 367  */
 368
 369 static const scan_data_t zero_scan_data =
 370   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 371
 372 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 373 #define SF_BEFORE_SEOL          0x0001
 374 #define SF_BEFORE_MEOL          0x0002
 375 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 376 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 377
 378 #ifdef NO_UNARY_PLUS
 379 #  define SF_FIX_SHIFT_EOL      (0+2)
 380 #  define SF_FL_SHIFT_EOL               (0+4)
 381 #else
 382 #  define SF_FIX_SHIFT_EOL      (+2)
 383 #  define SF_FL_SHIFT_EOL               (+4)
 384 #endif
 385
 386 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 387 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 388
 389 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 390 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 391 #define SF_IS_INF               0x0040
 392 #define SF_HAS_PAR              0x0080
 393 #define SF_IN_PAR               0x0100
 394 #define SF_HAS_EVAL             0x0200
 395 #define SCF_DO_SUBSTR           0x0400
 396 #define SCF_DO_STCLASS_AND      0x0800
 397 #define SCF_DO_STCLASS_OR       0x1000
 398 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 399 #define SCF_WHILEM_VISITED_POS  0x2000
 400
 401 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 402 #define SCF_SEEN_ACCEPT         0x8000
 403
 404 #define UTF cBOOL(RExC_utf8)
 405
 406 /* The enums for all these are ordered so things work out correctly */
 407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 414
 415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 416
 417 #define OOB_NAMEDCLASS          -1
 418
 419 /* There is no code point that is out-of-bounds, so this is problematic.  But
 420  * its only current use is to initialize a variable that is always set before
 421  * looked at. */
 422 #define OOB_UNICODE             0xDEADBEEF
 423
 424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 426
 427
 428 /* length of regex to show in messages that don't mark a position within */
 429 #define RegexLengthToShowInErrorMessages 127
 430
 431 /*
 432  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 433  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 434  * op/pragma/warn/regcomp.
 435  */
 436 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 437 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 438
 439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 443  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 444  * "...".
 445  */
 446 #define _FAIL(code) STMT_START {                                        \
 447     const char *ellipses = "";                                          \
 448     IV len = RExC_end - RExC_precomp;                                   \
 449                                                                         \
 450     if (!SIZE_ONLY)                                                     \
 451         SAVEFREESV(RExC_rx_sv);                                         \
 452     if (len > RegexLengthToShowInErrorMessages) {                       \
 453         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 454         len = RegexLengthToShowInErrorMessages - 10;                    \
 455         ellipses = "...";                                               \
 456     }                                                                   \
 457     code;                                                               \
 458 } STMT_END
 459
 460 #define FAIL(msg) _FAIL(                            \
 461     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 462             msg, (int)len, RExC_precomp, ellipses))
 463
 464 #define FAIL2(msg,arg) _FAIL(                       \
 465     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 466             arg, (int)len, RExC_precomp, ellipses))
 467
 468 /*
 469  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 470  */
 471 #define Simple_vFAIL(m) STMT_START {                                    \
 472     const IV offset = RExC_parse - RExC_precomp;                        \
 473     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 474             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 475 } STMT_END
 476
 477 /*
 478  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 479  */
 480 #define vFAIL(m) STMT_START {                           \
 481     if (!SIZE_ONLY)                                     \
 482         SAVEFREESV(RExC_rx_sv);                         \
 483     Simple_vFAIL(m);                                    \
 484 } STMT_END
 485
 486 /*
 487  * Like Simple_vFAIL(), but accepts two arguments.
 488  */
 489 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 490     const IV offset = RExC_parse - RExC_precomp;                        \
 491     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 492             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 493 } STMT_END
 494
 495 /*
 496  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 497  */
 498 #define vFAIL2(m,a1) STMT_START {                       \
 499     if (!SIZE_ONLY)                                     \
 500         SAVEFREESV(RExC_rx_sv);                         \
 501     Simple_vFAIL2(m, a1);                               \
 502 } STMT_END
 503
 504
 505 /*
 506  * Like Simple_vFAIL(), but accepts three arguments.
 507  */
 508 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 509     const IV offset = RExC_parse - RExC_precomp;                \
 510     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 512 } STMT_END
 513
 514 /*
 515  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 516  */
 517 #define vFAIL3(m,a1,a2) STMT_START {                    \
 518     if (!SIZE_ONLY)                                     \
 519         SAVEFREESV(RExC_rx_sv);                         \
 520     Simple_vFAIL3(m, a1, a2);                           \
 521 } STMT_END
 522
 523 /*
 524  * Like Simple_vFAIL(), but accepts four arguments.
 525  */
 526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 527     const IV offset = RExC_parse - RExC_precomp;                \
 528     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 529             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 530 } STMT_END
 531
 532 #define ckWARNreg(loc,m) STMT_START {                                   \
 533     const IV offset = loc - RExC_precomp;                               \
 534     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 535             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 536 } STMT_END
 537
 538 #define ckWARNregdep(loc,m) STMT_START {                                \
 539     const IV offset = loc - RExC_precomp;                               \
 540     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 541             m REPORT_LOCATION,                                          \
 542             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 543 } STMT_END
 544
 545 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 548             m REPORT_LOCATION,                                          \
 549             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 550 } STMT_END
 551
 552 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 553     const IV offset = loc - RExC_precomp;                               \
 554     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 555             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 556 } STMT_END
 557
 558 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 559     const IV offset = loc - RExC_precomp;                               \
 560     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 561             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 562 } STMT_END
 563
 564 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 565     const IV offset = loc - RExC_precomp;                               \
 566     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 567             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 568 } STMT_END
 569
 570 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 571     const IV offset = loc - RExC_precomp;                               \
 572     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 573             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 574 } STMT_END
 575
 576 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 577     const IV offset = loc - RExC_precomp;                               \
 578     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 579             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 580 } STMT_END
 581
 582 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 583     const IV offset = loc - RExC_precomp;                               \
 584     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 585             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 586 } STMT_END
 587
 588
 589 /* Allow for side effects in s */
 590 #define REGC(c,s) STMT_START {                  \
 591     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 592 } STMT_END
 593
 594 /* Macros for recording node offsets.   20001227 mjd@plover.com
 595  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 596  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 597  * Element 0 holds the number n.
 598  * Position is 1 indexed.
 599  */
 600 #ifndef RE_TRACK_PATTERN_OFFSETS
 601 #define Set_Node_Offset_To_R(node,byte)
 602 #define Set_Node_Offset(node,byte)
 603 #define Set_Cur_Node_Offset
 604 #define Set_Node_Length_To_R(node,len)
 605 #define Set_Node_Length(node,len)
 606 #define Set_Node_Cur_Length(node)
 607 #define Node_Offset(n)
 608 #define Node_Length(n)
 609 #define Set_Node_Offset_Length(node,offset,len)
 610 #define ProgLen(ri) ri->u.proglen
 611 #define SetProgLen(ri,x) ri->u.proglen = x
 612 #else
 613 #define ProgLen(ri) ri->u.offsets[0]
 614 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 615 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 616     if (! SIZE_ONLY) {                                                  \
 617         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 618                     __LINE__, (int)(node), (int)(byte)));               \
 619         if((node) < 0) {                                                \
 620             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 621         } else {                                                        \
 622             RExC_offsets[2*(node)-1] = (byte);                          \
 623         }                                                               \
 624     }                                                                   \
 625 } STMT_END
 626
 627 #define Set_Node_Offset(node,byte) \
 628     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 629 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 630
 631 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 632     if (! SIZE_ONLY) {                                                  \
 633         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 634                 __LINE__, (int)(node), (int)(len)));                    \
 635         if((node) < 0) {                                                \
 636             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 637         } else {                                                        \
 638             RExC_offsets[2*(node)] = (len);                             \
 639         }                                                               \
 640     }                                                                   \
 641 } STMT_END
 642
 643 #define Set_Node_Length(node,len) \
 644     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 645 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 646 #define Set_Node_Cur_Length(node) \
 647     Set_Node_Length(node, RExC_parse - parse_start)
 648
 649 /* Get offsets and lengths */
 650 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 651 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 652
 653 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 654     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 655     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 656 } STMT_END
 657 #endif
 658
 659 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 660 #define EXPERIMENTAL_INPLACESCAN
 661 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 662
 663 #define DEBUG_STUDYDATA(str,data,depth)                              \
 664 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 665     PerlIO_printf(Perl_debug_log,                                    \
 666         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 667         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 668         (int)(depth)*2, "",                                          \
 669         (IV)((data)->pos_min),                                       \
 670         (IV)((data)->pos_delta),                                     \
 671         (UV)((data)->flags),                                         \
 672         (IV)((data)->whilem_c),                                      \
 673         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 674         is_inf ? "INF " : ""                                         \
 675     );                                                               \
 676     if ((data)->last_found)                                          \
 677         PerlIO_printf(Perl_debug_log,                                \
 678             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 679             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 680             SvPVX_const((data)->last_found),                         \
 681             (IV)((data)->last_end),                                  \
 682             (IV)((data)->last_start_min),                            \
 683             (IV)((data)->last_start_max),                            \
 684             ((data)->longest &&                                      \
 685              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 686             SvPVX_const((data)->longest_fixed),                      \
 687             (IV)((data)->offset_fixed),                              \
 688             ((data)->longest &&                                      \
 689              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 690             SvPVX_const((data)->longest_float),                      \
 691             (IV)((data)->offset_float_min),                          \
 692             (IV)((data)->offset_float_max)                           \
 693         );                                                           \
 694     PerlIO_printf(Perl_debug_log,"\n");                              \
 695 });
 696
 697 /* Mark that we cannot extend a found fixed substring at this point.
 698    Update the longest found anchored substring and the longest found
 699    floating substrings if needed. */
 700
 701 STATIC void
 702 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 703 {
 704     const STRLEN l = CHR_SVLEN(data->last_found);
 705     const STRLEN old_l = CHR_SVLEN(*data->longest);
 706     GET_RE_DEBUG_FLAGS_DECL;
 707
 708     PERL_ARGS_ASSERT_SCAN_COMMIT;
 709
 710     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 711         SvSetMagicSV(*data->longest, data->last_found);
 712         if (*data->longest == data->longest_fixed) {
 713             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 714             if (data->flags & SF_BEFORE_EOL)
 715                 data->flags
 716                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 717             else
 718                 data->flags &= ~SF_FIX_BEFORE_EOL;
 719             data->minlen_fixed=minlenp;
 720             data->lookbehind_fixed=0;
 721         }
 722         else { /* *data->longest == data->longest_float */
 723             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 724             data->offset_float_max = (l
 725                                       ? data->last_start_max
 726                                       : data->pos_min + data->pos_delta);
 727             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 728                 data->offset_float_max = I32_MAX;
 729             if (data->flags & SF_BEFORE_EOL)
 730                 data->flags
 731                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 732             else
 733                 data->flags &= ~SF_FL_BEFORE_EOL;
 734             data->minlen_float=minlenp;
 735             data->lookbehind_float=0;
 736         }
 737     }
 738     SvCUR_set(data->last_found, 0);
 739     {
 740         SV * const sv = data->last_found;
 741         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 742             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 743             if (mg)
 744                 mg->mg_len = 0;
 745         }
 746     }
 747     data->last_end = -1;
 748     data->flags &= ~SF_BEFORE_EOL;
 749     DEBUG_STUDYDATA("commit: ",data,0);
 750 }
 751
 752 /* These macros set, clear and test whether the synthetic start class ('ssc',
 753  * given by the parameter) matches an empty string (EOS).  This uses the
 754  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 755  * stands alone, so there is never a next_off, so this field is otherwise
 756  * unused.  The EOS information is used only for compilation, but theoretically
 757  * it could be passed on to the execution code.  This could be used to store
 758  * more than one bit of information, but only this one is currently used. */
 759 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 760 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 761 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 762
 763 /* Can match anything (initialization) */
 764 STATIC void
 765 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 766 {
 767     PERL_ARGS_ASSERT_CL_ANYTHING;
 768
 769     ANYOF_BITMAP_SETALL(cl);
 770     cl->flags = ANYOF_UNICODE_ALL;
 771     SET_SSC_EOS(cl);
 772
 773     /* If any portion of the regex is to operate under locale rules,
 774      * initialization includes it.  The reason this isn't done for all regexes
 775      * is that the optimizer was written under the assumption that locale was
 776      * all-or-nothing.  Given the complexity and lack of documentation in the
 777      * optimizer, and that there are inadequate test cases for locale, so many
 778      * parts of it may not work properly, it is safest to avoid locale unless
 779      * necessary. */
 780     if (RExC_contains_locale) {
 781         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 782         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 783     }
 784     else {
 785         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 786     }
 787 }
 788
 789 /* Can match anything (initialization) */
 790 STATIC int
 791 S_cl_is_anything(const struct regnode_charclass_class *cl)
 792 {
 793     int value;
 794
 795     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 796
 797     for (value = 0; value < ANYOF_MAX; value += 2)
 798         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 799             return 1;
 800     if (!(cl->flags & ANYOF_UNICODE_ALL))
 801         return 0;
 802     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 803         return 0;
 804     return 1;
 805 }
 806
 807 /* Can match anything (initialization) */
 808 STATIC void
 809 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 810 {
 811     PERL_ARGS_ASSERT_CL_INIT;
 812
 813     Zero(cl, 1, struct regnode_charclass_class);
 814     cl->type = ANYOF;
 815     cl_anything(pRExC_state, cl);
 816     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 817 }
 818
 819 /* These two functions currently do the exact same thing */
 820 #define cl_init_zero            S_cl_init
 821
 822 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 823  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 824  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 825 STATIC void
 826 S_cl_and(struct regnode_charclass_class *cl,
 827         const struct regnode_charclass_class *and_with)
 828 {
 829     PERL_ARGS_ASSERT_CL_AND;
 830
 831     assert(PL_regkind[and_with->type] == ANYOF);
 832
 833     /* I (khw) am not sure all these restrictions are necessary XXX */
 834     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 835         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 836         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 837         && !(and_with->flags & ANYOF_LOC_FOLD)
 838         && !(cl->flags & ANYOF_LOC_FOLD)) {
 839         int i;
 840
 841         if (and_with->flags & ANYOF_INVERT)
 842             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 843                 cl->bitmap[i] &= ~and_with->bitmap[i];
 844         else
 845             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 846                 cl->bitmap[i] &= and_with->bitmap[i];
 847     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 848
 849     if (and_with->flags & ANYOF_INVERT) {
 850
 851         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 852          * aren't affected by the inversion.  Those that are affected are
 853          * handled individually below */
 854         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 855         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 856         cl->flags |= affected_flags;
 857
 858         /* We currently don't know how to deal with things that aren't in the
 859          * bitmap, but we know that the intersection is no greater than what
 860          * is already in cl, so let there be false positives that get sorted
 861          * out after the synthetic start class succeeds, and the node is
 862          * matched for real. */
 863
 864         /* The inversion of these two flags indicate that the resulting
 865          * intersection doesn't have them */
 866         if (and_with->flags & ANYOF_UNICODE_ALL) {
 867             cl->flags &= ~ANYOF_UNICODE_ALL;
 868         }
 869         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 870             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 871         }
 872     }
 873     else {   /* and'd node is not inverted */
 874         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 875
 876         if (! ANYOF_NONBITMAP(and_with)) {
 877
 878             /* Here 'and_with' doesn't match anything outside the bitmap
 879              * (except possibly ANYOF_UNICODE_ALL), which means the
 880              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 881              * which case we don't know what the intersection is, but it's no
 882              * greater than what cl already has, so can just leave it alone,
 883              * with possible false positives */
 884             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 885                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 886                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 887             }
 888         }
 889         else if (! ANYOF_NONBITMAP(cl)) {
 890
 891             /* Here, 'and_with' does match something outside the bitmap, and cl
 892              * doesn't have a list of things to match outside the bitmap.  If
 893              * cl can match all code points above 255, the intersection will
 894              * be those above-255 code points that 'and_with' matches.  If cl
 895              * can't match all Unicode code points, it means that it can't
 896              * match anything outside the bitmap (since the 'if' that got us
 897              * into this block tested for that), so we leave the bitmap empty.
 898              */
 899             if (cl->flags & ANYOF_UNICODE_ALL) {
 900                 ARG_SET(cl, ARG(and_with));
 901
 902                 /* and_with's ARG may match things that don't require UTF8.
 903                  * And now cl's will too, in spite of this being an 'and'.  See
 904                  * the comments below about the kludge */
 905                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 906             }
 907         }
 908         else {
 909             /* Here, both 'and_with' and cl match something outside the
 910              * bitmap.  Currently we do not do the intersection, so just match
 911              * whatever cl had at the beginning.  */
 912         }
 913
 914
 915         /* Take the intersection of the two sets of flags.  However, the
 916          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 917          * kludge around the fact that this flag is not treated like the others
 918          * which are initialized in cl_anything().  The way the optimizer works
 919          * is that the synthetic start class (SSC) is initialized to match
 920          * anything, and then the first time a real node is encountered, its
 921          * values are AND'd with the SSC's with the result being the values of
 922          * the real node.  However, there are paths through the optimizer where
 923          * the AND never gets called, so those initialized bits are set
 924          * inappropriately, which is not usually a big deal, as they just cause
 925          * false positives in the SSC, which will just mean a probably
 926          * imperceptible slow down in execution.  However this bit has a
 927          * higher false positive consequence in that it can cause utf8.pm,
 928          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 929          * bigger slowdown and also causes significant extra memory to be used.
 930          * In order to prevent this, the code now takes a different tack.  The
 931          * bit isn't set unless some part of the regular expression needs it,
 932          * but once set it won't get cleared.  This means that these extra
 933          * modules won't get loaded unless there was some path through the
 934          * pattern that would have required them anyway, and  so any false
 935          * positives that occur by not ANDing them out when they could be
 936          * aren't as severe as they would be if we treated this bit like all
 937          * the others */
 938         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 939                                       & ANYOF_NONBITMAP_NON_UTF8;
 940         cl->flags &= and_with->flags;
 941         cl->flags |= outside_bitmap_but_not_utf8;
 942     }
 943 }
 944
 945 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 946  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 947  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 948 STATIC void
 949 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 950 {
 951     PERL_ARGS_ASSERT_CL_OR;
 952
 953     if (or_with->flags & ANYOF_INVERT) {
 954
 955         /* Here, the or'd node is to be inverted.  This means we take the
 956          * complement of everything not in the bitmap, but currently we don't
 957          * know what that is, so give up and match anything */
 958         if (ANYOF_NONBITMAP(or_with)) {
 959             cl_anything(pRExC_state, cl);
 960         }
 961         /* We do not use
 962          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 963          *   <= (B1 | !B2) | (CL1 | !CL2)
 964          * which is wasteful if CL2 is small, but we ignore CL2:
 965          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 966          * XXXX Can we handle case-fold?  Unclear:
 967          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 968          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 969          */
 970         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 971              && !(or_with->flags & ANYOF_LOC_FOLD)
 972              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 973             int i;
 974
 975             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 976                 cl->bitmap[i] |= ~or_with->bitmap[i];
 977         } /* XXXX: logic is complicated otherwise */
 978         else {
 979             cl_anything(pRExC_state, cl);
 980         }
 981
 982         /* And, we can just take the union of the flags that aren't affected
 983          * by the inversion */
 984         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 985
 986         /* For the remaining flags:
 987             ANYOF_UNICODE_ALL and inverted means to not match anything above
 988                     255, which means that the union with cl should just be
 989                     what cl has in it, so can ignore this flag
 990             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 991                     is 127-255 to match them, but then invert that, so the
 992                     union with cl should just be what cl has in it, so can
 993                     ignore this flag
 994          */
 995     } else {    /* 'or_with' is not inverted */
 996         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 997         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 998              && (!(or_with->flags & ANYOF_LOC_FOLD)
 999                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1000             int i;
1001
1002             /* OR char bitmap and class bitmap separately */
1003             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1004                 cl->bitmap[i] |= or_with->bitmap[i];
1005             ANYOF_CLASS_OR(or_with, cl);
1006         }
1007         else { /* XXXX: logic is complicated, leave it along for a moment. */
1008             cl_anything(pRExC_state, cl);
1009         }
1010
1011         if (ANYOF_NONBITMAP(or_with)) {
1012
1013             /* Use the added node's outside-the-bit-map match if there isn't a
1014              * conflict.  If there is a conflict (both nodes match something
1015              * outside the bitmap, but what they match outside is not the same
1016              * pointer, and hence not easily compared until XXX we extend
1017              * inversion lists this far), give up and allow the start class to
1018              * match everything outside the bitmap.  If that stuff is all above
1019              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1020             if (! ANYOF_NONBITMAP(cl)) {
1021                 ARG_SET(cl, ARG(or_with));
1022             }
1023             else if (ARG(cl) != ARG(or_with)) {
1024
1025                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1026                     cl_anything(pRExC_state, cl);
1027                 }
1028                 else {
1029                     cl->flags |= ANYOF_UNICODE_ALL;
1030                 }
1031             }
1032         }
1033
1034         /* Take the union */
1035         cl->flags |= or_with->flags;
1036     }
1037 }
1038
1039 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1040 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1041 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1042 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1043
1044
1045 #ifdef DEBUGGING
1046 /*
1047    dump_trie(trie,widecharmap,revcharmap)
1048    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1049    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1050
1051    These routines dump out a trie in a somewhat readable format.
1052    The _interim_ variants are used for debugging the interim
1053    tables that are used to generate the final compressed
1054    representation which is what dump_trie expects.
1055
1056    Part of the reason for their existence is to provide a form
1057    of documentation as to how the different representations function.
1058
1059 */
1060
1061 /*
1062   Dumps the final compressed table form of the trie to Perl_debug_log.
1063   Used for debugging make_trie().
1064 */
1065
1066 STATIC void
1067 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1068             AV *revcharmap, U32 depth)
1069 {
1070     U32 state;
1071     SV *sv=sv_newmortal();
1072     int colwidth= widecharmap ? 6 : 4;
1073     U16 word;
1074     GET_RE_DEBUG_FLAGS_DECL;
1075
1076     PERL_ARGS_ASSERT_DUMP_TRIE;
1077
1078     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1079         (int)depth * 2 + 2,"",
1080         "Match","Base","Ofs" );
1081
1082     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1083         SV ** const tmp = av_fetch( revcharmap, state, 0);
1084         if ( tmp ) {
1085             PerlIO_printf( Perl_debug_log, "%*s",
1086                 colwidth,
1087                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1088                             PL_colors[0], PL_colors[1],
1089                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1090                             PERL_PV_ESCAPE_FIRSTCHAR
1091                 )
1092             );
1093         }
1094     }
1095     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1096         (int)depth * 2 + 2,"");
1097
1098     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1099         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1100     PerlIO_printf( Perl_debug_log, "\n");
1101
1102     for( state = 1 ; state < trie->statecount ; state++ ) {
1103         const U32 base = trie->states[ state ].trans.base;
1104
1105         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1106
1107         if ( trie->states[ state ].wordnum ) {
1108             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1109         } else {
1110             PerlIO_printf( Perl_debug_log, "%6s", "" );
1111         }
1112
1113         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1114
1115         if ( base ) {
1116             U32 ofs = 0;
1117
1118             while( ( base + ofs  < trie->uniquecharcount ) ||
1119                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1120                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1121                     ofs++;
1122
1123             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1124
1125             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1126                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1127                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1128                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1129                 {
1130                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1131                     colwidth,
1132                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1133                 } else {
1134                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1135                 }
1136             }
1137
1138             PerlIO_printf( Perl_debug_log, "]");
1139
1140         }
1141         PerlIO_printf( Perl_debug_log, "\n" );
1142     }
1143     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1144     for (word=1; word <= trie->wordcount; word++) {
1145         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1146             (int)word, (int)(trie->wordinfo[word].prev),
1147             (int)(trie->wordinfo[word].len));
1148     }
1149     PerlIO_printf(Perl_debug_log, "\n" );
1150 }
1151 /*
1152   Dumps a fully constructed but uncompressed trie in list form.
1153   List tries normally only are used for construction when the number of
1154   possible chars (trie->uniquecharcount) is very high.
1155   Used for debugging make_trie().
1156 */
1157 STATIC void
1158 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1159                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1160                          U32 depth)
1161 {
1162     U32 state;
1163     SV *sv=sv_newmortal();
1164     int colwidth= widecharmap ? 6 : 4;
1165     GET_RE_DEBUG_FLAGS_DECL;
1166
1167     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1168
1169     /* print out the table precompression.  */
1170     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1171         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1172         "------:-----+-----------------\n" );
1173
1174     for( state=1 ; state < next_alloc ; state ++ ) {
1175         U16 charid;
1176
1177         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1178             (int)depth * 2 + 2,"", (UV)state  );
1179         if ( ! trie->states[ state ].wordnum ) {
1180             PerlIO_printf( Perl_debug_log, "%5s| ","");
1181         } else {
1182             PerlIO_printf( Perl_debug_log, "W%4x| ",
1183                 trie->states[ state ].wordnum
1184             );
1185         }
1186         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1187             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1188             if ( tmp ) {
1189                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1190                     colwidth,
1191                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1192                             PL_colors[0], PL_colors[1],
1193                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1194                             PERL_PV_ESCAPE_FIRSTCHAR
1195                     ) ,
1196                     TRIE_LIST_ITEM(state,charid).forid,
1197                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1198                 );
1199                 if (!(charid % 10))
1200                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1201                         (int)((depth * 2) + 14), "");
1202             }
1203         }
1204         PerlIO_printf( Perl_debug_log, "\n");
1205     }
1206 }
1207
1208 /*
1209   Dumps a fully constructed but uncompressed trie in table form.
1210   This is the normal DFA style state transition table, with a few
1211   twists to facilitate compression later.
1212   Used for debugging make_trie().
1213 */
1214 STATIC void
1215 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1216                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1217                           U32 depth)
1218 {
1219     U32 state;
1220     U16 charid;
1221     SV *sv=sv_newmortal();
1222     int colwidth= widecharmap ? 6 : 4;
1223     GET_RE_DEBUG_FLAGS_DECL;
1224
1225     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1226
1227     /*
1228        print out the table precompression so that we can do a visual check
1229        that they are identical.
1230      */
1231
1232     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1233
1234     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1235         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1236         if ( tmp ) {
1237             PerlIO_printf( Perl_debug_log, "%*s",
1238                 colwidth,
1239                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1240                             PL_colors[0], PL_colors[1],
1241                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1242                             PERL_PV_ESCAPE_FIRSTCHAR
1243                 )
1244             );
1245         }
1246     }
1247
1248     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1249
1250     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1251         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1252     }
1253
1254     PerlIO_printf( Perl_debug_log, "\n" );
1255
1256     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1257
1258         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1259             (int)depth * 2 + 2,"",
1260             (UV)TRIE_NODENUM( state ) );
1261
1262         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1263             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1264             if (v)
1265                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1266             else
1267                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1268         }
1269         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1270             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1271         } else {
1272             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1273             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1274         }
1275     }
1276 }
1277
1278 #endif
1279
1280
1281 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1282   startbranch: the first branch in the whole branch sequence
1283   first      : start branch of sequence of branch-exact nodes.
1284                May be the same as startbranch
1285   last       : Thing following the last branch.
1286                May be the same as tail.
1287   tail       : item following the branch sequence
1288   count      : words in the sequence
1289   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1290   depth      : indent depth
1291
1292 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1293
1294 A trie is an N'ary tree where the branches are determined by digital
1295 decomposition of the key. IE, at the root node you look up the 1st character and
1296 follow that branch repeat until you find the end of the branches. Nodes can be
1297 marked as "accepting" meaning they represent a complete word. Eg:
1298
1299   /he|she|his|hers/
1300
1301 would convert into the following structure. Numbers represent states, letters
1302 following numbers represent valid transitions on the letter from that state, if
1303 the number is in square brackets it represents an accepting state, otherwise it
1304 will be in parenthesis.
1305
1306       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1307       |    |
1308       |   (2)
1309       |    |
1310      (1)   +-i->(6)-+-s->[7]
1311       |
1312       +-s->(3)-+-h->(4)-+-e->[5]
1313
1314       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1315
1316 This shows that when matching against the string 'hers' we will begin at state 1
1317 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1318 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1319 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1320 single traverse. We store a mapping from accepting to state to which word was
1321 matched, and then when we have multiple possibilities we try to complete the
1322 rest of the regex in the order in which they occured in the alternation.
1323
1324 The only prior NFA like behaviour that would be changed by the TRIE support is
1325 the silent ignoring of duplicate alternations which are of the form:
1326
1327  / (DUPE|DUPE) X? (?{ ... }) Y /x
1328
1329 Thus EVAL blocks following a trie may be called a different number of times with
1330 and without the optimisation. With the optimisations dupes will be silently
1331 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1332 the following demonstrates:
1333
1334  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1335
1336 which prints out 'word' three times, but
1337
1338  'words'=~/(word|word|word)(?{ print $1 })S/
1339
1340 which doesnt print it out at all. This is due to other optimisations kicking in.
1341
1342 Example of what happens on a structural level:
1343
1344 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1345
1346    1: CURLYM[1] {1,32767}(18)
1347    5:   BRANCH(8)
1348    6:     EXACT <ac>(16)
1349    8:   BRANCH(11)
1350    9:     EXACT <ad>(16)
1351   11:   BRANCH(14)
1352   12:     EXACT <ab>(16)
1353   16:   SUCCEED(0)
1354   17:   NOTHING(18)
1355   18: END(0)
1356
1357 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1358 and should turn into:
1359
1360    1: CURLYM[1] {1,32767}(18)
1361    5:   TRIE(16)
1362         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1363           <ac>
1364           <ad>
1365           <ab>
1366   16:   SUCCEED(0)
1367   17:   NOTHING(18)
1368   18: END(0)
1369
1370 Cases where tail != last would be like /(?foo|bar)baz/:
1371
1372    1: BRANCH(4)
1373    2:   EXACT <foo>(8)
1374    4: BRANCH(7)
1375    5:   EXACT <bar>(8)
1376    7: TAIL(8)
1377    8: EXACT <baz>(10)
1378   10: END(0)
1379
1380 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1381 and would end up looking like:
1382
1383     1: TRIE(8)
1384       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1385         <foo>
1386         <bar>
1387    7: TAIL(8)
1388    8: EXACT <baz>(10)
1389   10: END(0)
1390
1391     d = uvuni_to_utf8_flags(d, uv, 0);
1392
1393 is the recommended Unicode-aware way of saying
1394
1395     *(d++) = uv;
1396 */
1397
1398 #define TRIE_STORE_REVCHAR(val)                                            \
1399     STMT_START {                                                           \
1400         if (UTF) {                                                         \
1401             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1402             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1403             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1404             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1405             SvPOK_on(zlopp);                                               \
1406             SvUTF8_on(zlopp);                                              \
1407             av_push(revcharmap, zlopp);                                    \
1408         } else {                                                           \
1409             char ooooff = (char)val;                                           \
1410             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1411         }                                                                  \
1412         } STMT_END
1413
1414 #define TRIE_READ_CHAR STMT_START {                                                     \
1415     wordlen++;                                                                          \
1416     if ( UTF ) {                                                                        \
1417         /* if it is UTF then it is either already folded, or does not need folding */   \
1418         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1419     }                                                                                   \
1420     else if (folder == PL_fold_latin1) {                                                \
1421         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1422         if ( foldlen > 0 ) {                                                            \
1423            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1424            foldlen -= len;                                                              \
1425            scan += len;                                                                 \
1426            len = 0;                                                                     \
1427         } else {                                                                        \
1428             len = 1;                                                                    \
1429             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1430             skiplen = UNISKIP(uvc);                                                     \
1431             foldlen -= skiplen;                                                         \
1432             scan = foldbuf + skiplen;                                                   \
1433         }                                                                               \
1434     } else {                                                                            \
1435         /* raw data, will be folded later if needed */                                  \
1436         uvc = (U32)*uc;                                                                 \
1437         len = 1;                                                                        \
1438     }                                                                                   \
1439 } STMT_END
1440
1441
1442
1443 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1444     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1445         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1446         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1447     }                                                           \
1448     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1449     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1450     TRIE_LIST_CUR( state )++;                                   \
1451 } STMT_END
1452
1453 #define TRIE_LIST_NEW(state) STMT_START {                       \
1454     Newxz( trie->states[ state ].trans.list,               \
1455         4, reg_trie_trans_le );                                 \
1456      TRIE_LIST_CUR( state ) = 1;                                \
1457      TRIE_LIST_LEN( state ) = 4;                                \
1458 } STMT_END
1459
1460 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1461     U16 dupe= trie->states[ state ].wordnum;                    \
1462     regnode * const noper_next = regnext( noper );              \
1463                                                                 \
1464     DEBUG_r({                                                   \
1465         /* store the word for dumping */                        \
1466         SV* tmp;                                                \
1467         if (OP(noper) != NOTHING)                               \
1468             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1469         else                                                    \
1470             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1471         av_push( trie_words, tmp );                             \
1472     });                                                         \
1473                                                                 \
1474     curword++;                                                  \
1475     trie->wordinfo[curword].prev   = 0;                         \
1476     trie->wordinfo[curword].len    = wordlen;                   \
1477     trie->wordinfo[curword].accept = state;                     \
1478                                                                 \
1479     if ( noper_next < tail ) {                                  \
1480         if (!trie->jump)                                        \
1481             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1482         trie->jump[curword] = (U16)(noper_next - convert);      \
1483         if (!jumper)                                            \
1484             jumper = noper_next;                                \
1485         if (!nextbranch)                                        \
1486             nextbranch= regnext(cur);                           \
1487     }                                                           \
1488                                                                 \
1489     if ( dupe ) {                                               \
1490         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1491         /* chain, so that when the bits of chain are later    */\
1492         /* linked together, the dups appear in the chain      */\
1493         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1494         trie->wordinfo[dupe].prev = curword;                    \
1495     } else {                                                    \
1496         /* we haven't inserted this word yet.                */ \
1497         trie->states[ state ].wordnum = curword;                \
1498     }                                                           \
1499 } STMT_END
1500
1501
1502 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1503      ( ( base + charid >=  ucharcount                                   \
1504          && base + charid < ubound                                      \
1505          && state == trie->trans[ base - ucharcount + charid ].check    \
1506          && trie->trans[ base - ucharcount + charid ].next )            \
1507            ? trie->trans[ base - ucharcount + charid ].next             \
1508            : ( state==1 ? special : 0 )                                 \
1509       )
1510
1511 #define MADE_TRIE       1
1512 #define MADE_JUMP_TRIE  2
1513 #define MADE_EXACT_TRIE 4
1514
1515 STATIC I32
1516 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1517 {
1518     dVAR;
1519     /* first pass, loop through and scan words */
1520     reg_trie_data *trie;
1521     HV *widecharmap = NULL;
1522     AV *revcharmap = newAV();
1523     regnode *cur;
1524     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1525     STRLEN len = 0;
1526     UV uvc = 0;
1527     U16 curword = 0;
1528     U32 next_alloc = 0;
1529     regnode *jumper = NULL;
1530     regnode *nextbranch = NULL;
1531     regnode *convert = NULL;
1532     U32 *prev_states; /* temp array mapping each state to previous one */
1533     /* we just use folder as a flag in utf8 */
1534     const U8 * folder = NULL;
1535
1536 #ifdef DEBUGGING
1537     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1538     AV *trie_words = NULL;
1539     /* along with revcharmap, this only used during construction but both are
1540      * useful during debugging so we store them in the struct when debugging.
1541      */
1542 #else
1543     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1544     STRLEN trie_charcount=0;
1545 #endif
1546     SV *re_trie_maxbuff;
1547     GET_RE_DEBUG_FLAGS_DECL;
1548
1549     PERL_ARGS_ASSERT_MAKE_TRIE;
1550 #ifndef DEBUGGING
1551     PERL_UNUSED_ARG(depth);
1552 #endif
1553
1554     switch (flags) {
1555         case EXACT: break;
1556         case EXACTFA:
1557         case EXACTFU_SS:
1558         case EXACTFU_TRICKYFOLD:
1559         case EXACTFU: folder = PL_fold_latin1; break;
1560         case EXACTF:  folder = PL_fold; break;
1561         case EXACTFL: folder = PL_fold_locale; break;
1562         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1563     }
1564
1565     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1566     trie->refcount = 1;
1567     trie->startstate = 1;
1568     trie->wordcount = word_count;
1569     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1570     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1571     if (flags == EXACT)
1572         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1573     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1574                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1575
1576     DEBUG_r({
1577         trie_words = newAV();
1578     });
1579
1580     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1581     if (!SvIOK(re_trie_maxbuff)) {
1582         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1583     }
1584     DEBUG_TRIE_COMPILE_r({
1585                 PerlIO_printf( Perl_debug_log,
1586                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1587                   (int)depth * 2 + 2, "",
1588                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1589                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1590                   (int)depth);
1591     });
1592
1593    /* Find the node we are going to overwrite */
1594     if ( first == startbranch && OP( last ) != BRANCH ) {
1595         /* whole branch chain */
1596         convert = first;
1597     } else {
1598         /* branch sub-chain */
1599         convert = NEXTOPER( first );
1600     }
1601
1602     /*  -- First loop and Setup --
1603
1604        We first traverse the branches and scan each word to determine if it
1605        contains widechars, and how many unique chars there are, this is
1606        important as we have to build a table with at least as many columns as we
1607        have unique chars.
1608
1609        We use an array of integers to represent the character codes 0..255
1610        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1611        native representation of the character value as the key and IV's for the
1612        coded index.
1613
1614        *TODO* If we keep track of how many times each character is used we can
1615        remap the columns so that the table compression later on is more
1616        efficient in terms of memory by ensuring the most common value is in the
1617        middle and the least common are on the outside.  IMO this would be better
1618        than a most to least common mapping as theres a decent chance the most
1619        common letter will share a node with the least common, meaning the node
1620        will not be compressible. With a middle is most common approach the worst
1621        case is when we have the least common nodes twice.
1622
1623      */
1624
1625     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626         regnode *noper = NEXTOPER( cur );
1627         const U8 *uc = (U8*)STRING( noper );
1628         const U8 *e  = uc + STR_LEN( noper );
1629         STRLEN foldlen = 0;
1630         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1631         STRLEN skiplen = 0;
1632         const U8 *scan = (U8*)NULL;
1633         U32 wordlen      = 0;         /* required init */
1634         STRLEN chars = 0;
1635         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1636
1637         if (OP(noper) == NOTHING) {
1638             regnode *noper_next= regnext(noper);
1639             if (noper_next != tail && OP(noper_next) == flags) {
1640                 noper = noper_next;
1641                 uc= (U8*)STRING(noper);
1642                 e= uc + STR_LEN(noper);
1643                 trie->minlen= STR_LEN(noper);
1644             } else {
1645                 trie->minlen= 0;
1646                 continue;
1647             }
1648         }
1649
1650         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1651             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1652                                           regardless of encoding */
1653             if (OP( noper ) == EXACTFU_SS) {
1654                 /* false positives are ok, so just set this */
1655                 TRIE_BITMAP_SET(trie,0xDF);
1656             }
1657         }
1658         for ( ; uc < e ; uc += len ) {
1659             TRIE_CHARCOUNT(trie)++;
1660             TRIE_READ_CHAR;
1661             chars++;
1662             if ( uvc < 256 ) {
1663                 if ( folder ) {
1664                     U8 folded= folder[ (U8) uvc ];
1665                     if ( !trie->charmap[ folded ] ) {
1666                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1667                         TRIE_STORE_REVCHAR( folded );
1668                     }
1669                 }
1670                 if ( !trie->charmap[ uvc ] ) {
1671                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1672                     TRIE_STORE_REVCHAR( uvc );
1673                 }
1674                 if ( set_bit ) {
1675                     /* store the codepoint in the bitmap, and its folded
1676                      * equivalent. */
1677                     TRIE_BITMAP_SET(trie, uvc);
1678
1679                     /* store the folded codepoint */
1680                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1681
1682                     if ( !UTF ) {
1683                         /* store first byte of utf8 representation of
1684                            variant codepoints */
1685                         if (! UNI_IS_INVARIANT(uvc)) {
1686                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1687                         }
1688                     }
1689                     set_bit = 0; /* We've done our bit :-) */
1690                 }
1691             } else {
1692                 SV** svpp;
1693                 if ( !widecharmap )
1694                     widecharmap = newHV();
1695
1696                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1697
1698                 if ( !svpp )
1699                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1700
1701                 if ( !SvTRUE( *svpp ) ) {
1702                     sv_setiv( *svpp, ++trie->uniquecharcount );
1703                     TRIE_STORE_REVCHAR(uvc);
1704                 }
1705             }
1706         }
1707         if( cur == first ) {
1708             trie->minlen = chars;
1709             trie->maxlen = chars;
1710         } else if (chars < trie->minlen) {
1711             trie->minlen = chars;
1712         } else if (chars > trie->maxlen) {
1713             trie->maxlen = chars;
1714         }
1715         if (OP( noper ) == EXACTFU_SS) {
1716             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1717             if (trie->minlen > 1)
1718                 trie->minlen= 1;
1719         }
1720         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1721             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1722              *                - We assume that any such sequence might match a 2 byte string */
1723             if (trie->minlen > 2 )
1724                 trie->minlen= 2;
1725         }
1726
1727     } /* end first pass */
1728     DEBUG_TRIE_COMPILE_r(
1729         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1730                 (int)depth * 2 + 2,"",
1731                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1732                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1733                 (int)trie->minlen, (int)trie->maxlen )
1734     );
1735
1736     /*
1737         We now know what we are dealing with in terms of unique chars and
1738         string sizes so we can calculate how much memory a naive
1739         representation using a flat table  will take. If it's over a reasonable
1740         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1741         conservative but potentially much slower representation using an array
1742         of lists.
1743
1744         At the end we convert both representations into the same compressed
1745         form that will be used in regexec.c for matching with. The latter
1746         is a form that cannot be used to construct with but has memory
1747         properties similar to the list form and access properties similar
1748         to the table form making it both suitable for fast searches and
1749         small enough that its feasable to store for the duration of a program.
1750
1751         See the comment in the code where the compressed table is produced
1752         inplace from the flat tabe representation for an explanation of how
1753         the compression works.
1754
1755     */
1756
1757
1758     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1759     prev_states[1] = 0;
1760
1761     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1762         /*
1763             Second Pass -- Array Of Lists Representation
1764
1765             Each state will be represented by a list of charid:state records
1766             (reg_trie_trans_le) the first such element holds the CUR and LEN
1767             points of the allocated array. (See defines above).
1768
1769             We build the initial structure using the lists, and then convert
1770             it into the compressed table form which allows faster lookups
1771             (but cant be modified once converted).
1772         */
1773
1774         STRLEN transcount = 1;
1775
1776         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1777             "%*sCompiling trie using list compiler\n",
1778             (int)depth * 2 + 2, ""));
1779
1780         trie->states = (reg_trie_state *)
1781             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1782                                   sizeof(reg_trie_state) );
1783         TRIE_LIST_NEW(1);
1784         next_alloc = 2;
1785
1786         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1787
1788             regnode *noper   = NEXTOPER( cur );
1789             U8 *uc           = (U8*)STRING( noper );
1790             const U8 *e      = uc + STR_LEN( noper );
1791             U32 state        = 1;         /* required init */
1792             U16 charid       = 0;         /* sanity init */
1793             U8 *scan         = (U8*)NULL; /* sanity init */
1794             STRLEN foldlen   = 0;         /* required init */
1795             U32 wordlen      = 0;         /* required init */
1796             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1797             STRLEN skiplen   = 0;
1798
1799             if (OP(noper) == NOTHING) {
1800                 regnode *noper_next= regnext(noper);
1801                 if (noper_next != tail && OP(noper_next) == flags) {
1802                     noper = noper_next;
1803                     uc= (U8*)STRING(noper);
1804                     e= uc + STR_LEN(noper);
1805                 }
1806             }
1807
1808             if (OP(noper) != NOTHING) {
1809                 for ( ; uc < e ; uc += len ) {
1810
1811                     TRIE_READ_CHAR;
1812
1813                     if ( uvc < 256 ) {
1814                         charid = trie->charmap[ uvc ];
1815                     } else {
1816                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1817                         if ( !svpp ) {
1818                             charid = 0;
1819                         } else {
1820                             charid=(U16)SvIV( *svpp );
1821                         }
1822                     }
1823                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1824                     if ( charid ) {
1825
1826                         U16 check;
1827                         U32 newstate = 0;
1828
1829                         charid--;
1830                         if ( !trie->states[ state ].trans.list ) {
1831                             TRIE_LIST_NEW( state );
1832                         }
1833                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1834                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1835                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1836                                 break;
1837                             }
1838                         }
1839                         if ( ! newstate ) {
1840                             newstate = next_alloc++;
1841                             prev_states[newstate] = state;
1842                             TRIE_LIST_PUSH( state, charid, newstate );
1843                             transcount++;
1844                         }
1845                         state = newstate;
1846                     } else {
1847                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1848                     }
1849                 }
1850             }
1851             TRIE_HANDLE_WORD(state);
1852
1853         } /* end second pass */
1854
1855         /* next alloc is the NEXT state to be allocated */
1856         trie->statecount = next_alloc;
1857         trie->states = (reg_trie_state *)
1858             PerlMemShared_realloc( trie->states,
1859                                    next_alloc
1860                                    * sizeof(reg_trie_state) );
1861
1862         /* and now dump it out before we compress it */
1863         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1864                                                          revcharmap, next_alloc,
1865                                                          depth+1)
1866         );
1867
1868         trie->trans = (reg_trie_trans *)
1869             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1870         {
1871             U32 state;
1872             U32 tp = 0;
1873             U32 zp = 0;
1874
1875
1876             for( state=1 ; state < next_alloc ; state ++ ) {
1877                 U32 base=0;
1878
1879                 /*
1880                 DEBUG_TRIE_COMPILE_MORE_r(
1881                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1882                 );
1883                 */
1884
1885                 if (trie->states[state].trans.list) {
1886                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1887                     U16 maxid=minid;
1888                     U16 idx;
1889
1890                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1891                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1892                         if ( forid < minid ) {
1893                             minid=forid;
1894                         } else if ( forid > maxid ) {
1895                             maxid=forid;
1896                         }
1897                     }
1898                     if ( transcount < tp + maxid - minid + 1) {
1899                         transcount *= 2;
1900                         trie->trans = (reg_trie_trans *)
1901                             PerlMemShared_realloc( trie->trans,
1902                                                      transcount
1903                                                      * sizeof(reg_trie_trans) );
1904                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1905                     }
1906                     base = trie->uniquecharcount + tp - minid;
1907                     if ( maxid == minid ) {
1908                         U32 set = 0;
1909                         for ( ; zp < tp ; zp++ ) {
1910                             if ( ! trie->trans[ zp ].next ) {
1911                                 base = trie->uniquecharcount + zp - minid;
1912                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1913                                 trie->trans[ zp ].check = state;
1914                                 set = 1;
1915                                 break;
1916                             }
1917                         }
1918                         if ( !set ) {
1919                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1920                             trie->trans[ tp ].check = state;
1921                             tp++;
1922                             zp = tp;
1923                         }
1924                     } else {
1925                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1926                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1927                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1928                             trie->trans[ tid ].check = state;
1929                         }
1930                         tp += ( maxid - minid + 1 );
1931                     }
1932                     Safefree(trie->states[ state ].trans.list);
1933                 }
1934                 /*
1935                 DEBUG_TRIE_COMPILE_MORE_r(
1936                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1937                 );
1938                 */
1939                 trie->states[ state ].trans.base=base;
1940             }
1941             trie->lasttrans = tp + 1;
1942         }
1943     } else {
1944         /*
1945            Second Pass -- Flat Table Representation.
1946
1947            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1948            We know that we will need Charcount+1 trans at most to store the data
1949            (one row per char at worst case) So we preallocate both structures
1950            assuming worst case.
1951
1952            We then construct the trie using only the .next slots of the entry
1953            structs.
1954
1955            We use the .check field of the first entry of the node temporarily to
1956            make compression both faster and easier by keeping track of how many non
1957            zero fields are in the node.
1958
1959            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1960            transition.
1961
1962            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1963            number representing the first entry of the node, and state as a
1964            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1965            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1966            are 2 entrys per node. eg:
1967
1968              A B       A B
1969           1. 2 4    1. 3 7
1970           2. 0 3    3. 0 5
1971           3. 0 0    5. 0 0
1972           4. 0 0    7. 0 0
1973
1974            The table is internally in the right hand, idx form. However as we also
1975            have to deal with the states array which is indexed by nodenum we have to
1976            use TRIE_NODENUM() to convert.
1977
1978         */
1979         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1980             "%*sCompiling trie using table compiler\n",
1981             (int)depth * 2 + 2, ""));
1982
1983         trie->trans = (reg_trie_trans *)
1984             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1985                                   * trie->uniquecharcount + 1,
1986                                   sizeof(reg_trie_trans) );
1987         trie->states = (reg_trie_state *)
1988             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1989                                   sizeof(reg_trie_state) );
1990         next_alloc = trie->uniquecharcount + 1;
1991
1992
1993         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1994
1995             regnode *noper   = NEXTOPER( cur );
1996             const U8 *uc     = (U8*)STRING( noper );
1997             const U8 *e      = uc + STR_LEN( noper );
1998
1999             U32 state        = 1;         /* required init */
2000
2001             U16 charid       = 0;         /* sanity init */
2002             U32 accept_state = 0;         /* sanity init */
2003             U8 *scan         = (U8*)NULL; /* sanity init */
2004
2005             STRLEN foldlen   = 0;         /* required init */
2006             U32 wordlen      = 0;         /* required init */
2007             STRLEN skiplen   = 0;
2008             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2009
2010             if (OP(noper) == NOTHING) {
2011                 regnode *noper_next= regnext(noper);
2012                 if (noper_next != tail && OP(noper_next) == flags) {
2013                     noper = noper_next;
2014                     uc= (U8*)STRING(noper);
2015                     e= uc + STR_LEN(noper);
2016                 }
2017             }
2018
2019             if ( OP(noper) != NOTHING ) {
2020                 for ( ; uc < e ; uc += len ) {
2021
2022                     TRIE_READ_CHAR;
2023
2024                     if ( uvc < 256 ) {
2025                         charid = trie->charmap[ uvc ];
2026                     } else {
2027                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2028                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2029                     }
2030                     if ( charid ) {
2031                         charid--;
2032                         if ( !trie->trans[ state + charid ].next ) {
2033                             trie->trans[ state + charid ].next = next_alloc;
2034                             trie->trans[ state ].check++;
2035                             prev_states[TRIE_NODENUM(next_alloc)]
2036                                     = TRIE_NODENUM(state);
2037                             next_alloc += trie->uniquecharcount;
2038                         }
2039                         state = trie->trans[ state + charid ].next;
2040                     } else {
2041                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2042                     }
2043                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2044                 }
2045             }
2046             accept_state = TRIE_NODENUM( state );
2047             TRIE_HANDLE_WORD(accept_state);
2048
2049         } /* end second pass */
2050
2051         /* and now dump it out before we compress it */
2052         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2053                                                           revcharmap,
2054                                                           next_alloc, depth+1));
2055
2056         {
2057         /*
2058            * Inplace compress the table.*
2059
2060            For sparse data sets the table constructed by the trie algorithm will
2061            be mostly 0/FAIL transitions or to put it another way mostly empty.
2062            (Note that leaf nodes will not contain any transitions.)
2063
2064            This algorithm compresses the tables by eliminating most such
2065            transitions, at the cost of a modest bit of extra work during lookup:
2066
2067            - Each states[] entry contains a .base field which indicates the
2068            index in the state[] array wheres its transition data is stored.
2069
2070            - If .base is 0 there are no valid transitions from that node.
2071
2072            - If .base is nonzero then charid is added to it to find an entry in
2073            the trans array.
2074
2075            -If trans[states[state].base+charid].check!=state then the
2076            transition is taken to be a 0/Fail transition. Thus if there are fail
2077            transitions at the front of the node then the .base offset will point
2078            somewhere inside the previous nodes data (or maybe even into a node
2079            even earlier), but the .check field determines if the transition is
2080            valid.
2081
2082            XXX - wrong maybe?
2083            The following process inplace converts the table to the compressed
2084            table: We first do not compress the root node 1,and mark all its
2085            .check pointers as 1 and set its .base pointer as 1 as well. This
2086            allows us to do a DFA construction from the compressed table later,
2087            and ensures that any .base pointers we calculate later are greater
2088            than 0.
2089
2090            - We set 'pos' to indicate the first entry of the second node.
2091
2092            - We then iterate over the columns of the node, finding the first and
2093            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2094            and set the .check pointers accordingly, and advance pos
2095            appropriately and repreat for the next node. Note that when we copy
2096            the next pointers we have to convert them from the original
2097            NODEIDX form to NODENUM form as the former is not valid post
2098            compression.
2099
2100            - If a node has no transitions used we mark its base as 0 and do not
2101            advance the pos pointer.
2102
2103            - If a node only has one transition we use a second pointer into the
2104            structure to fill in allocated fail transitions from other states.
2105            This pointer is independent of the main pointer and scans forward
2106            looking for null transitions that are allocated to a state. When it
2107            finds one it writes the single transition into the "hole".  If the
2108            pointer doesnt find one the single transition is appended as normal.
2109
2110            - Once compressed we can Renew/realloc the structures to release the
2111            excess space.
2112
2113            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2114            specifically Fig 3.47 and the associated pseudocode.
2115
2116            demq
2117         */
2118         const U32 laststate = TRIE_NODENUM( next_alloc );
2119         U32 state, charid;
2120         U32 pos = 0, zp=0;
2121         trie->statecount = laststate;
2122
2123         for ( state = 1 ; state < laststate ; state++ ) {
2124             U8 flag = 0;
2125             const U32 stateidx = TRIE_NODEIDX( state );
2126             const U32 o_used = trie->trans[ stateidx ].check;
2127             U32 used = trie->trans[ stateidx ].check;
2128             trie->trans[ stateidx ].check = 0;
2129
2130             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2131                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2132                     if ( trie->trans[ stateidx + charid ].next ) {
2133                         if (o_used == 1) {
2134                             for ( ; zp < pos ; zp++ ) {
2135                                 if ( ! trie->trans[ zp ].next ) {
2136                                     break;
2137                                 }
2138                             }
2139                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2140                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2141                             trie->trans[ zp ].check = state;
2142                             if ( ++zp > pos ) pos = zp;
2143                             break;
2144                         }
2145                         used--;
2146                     }
2147                     if ( !flag ) {
2148                         flag = 1;
2149                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2150                     }
2151                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2152                     trie->trans[ pos ].check = state;
2153                     pos++;
2154                 }
2155             }
2156         }
2157         trie->lasttrans = pos + 1;
2158         trie->states = (reg_trie_state *)
2159             PerlMemShared_realloc( trie->states, laststate
2160                                    * sizeof(reg_trie_state) );
2161         DEBUG_TRIE_COMPILE_MORE_r(
2162                 PerlIO_printf( Perl_debug_log,
2163                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2164                     (int)depth * 2 + 2,"",
2165                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2166                     (IV)next_alloc,
2167                     (IV)pos,
2168                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2169             );
2170
2171         } /* end table compress */
2172     }
2173     DEBUG_TRIE_COMPILE_MORE_r(
2174             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2175                 (int)depth * 2 + 2, "",
2176                 (UV)trie->statecount,
2177                 (UV)trie->lasttrans)
2178     );
2179     /* resize the trans array to remove unused space */
2180     trie->trans = (reg_trie_trans *)
2181         PerlMemShared_realloc( trie->trans, trie->lasttrans
2182                                * sizeof(reg_trie_trans) );
2183
2184     {   /* Modify the program and insert the new TRIE node */
2185         U8 nodetype =(U8)(flags & 0xFF);
2186         char *str=NULL;
2187
2188 #ifdef DEBUGGING
2189         regnode *optimize = NULL;
2190 #ifdef RE_TRACK_PATTERN_OFFSETS
2191
2192         U32 mjd_offset = 0;
2193         U32 mjd_nodelen = 0;
2194 #endif /* RE_TRACK_PATTERN_OFFSETS */
2195 #endif /* DEBUGGING */
2196         /*
2197            This means we convert either the first branch or the first Exact,
2198            depending on whether the thing following (in 'last') is a branch
2199            or not and whther first is the startbranch (ie is it a sub part of
2200            the alternation or is it the whole thing.)
2201            Assuming its a sub part we convert the EXACT otherwise we convert
2202            the whole branch sequence, including the first.
2203          */
2204         /* Find the node we are going to overwrite */
2205         if ( first != startbranch || OP( last ) == BRANCH ) {
2206             /* branch sub-chain */
2207             NEXT_OFF( first ) = (U16)(last - first);
2208 #ifdef RE_TRACK_PATTERN_OFFSETS
2209             DEBUG_r({
2210                 mjd_offset= Node_Offset((convert));
2211                 mjd_nodelen= Node_Length((convert));
2212             });
2213 #endif
2214             /* whole branch chain */
2215         }
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2217         else {
2218             DEBUG_r({
2219                 const  regnode *nop = NEXTOPER( convert );
2220                 mjd_offset= Node_Offset((nop));
2221                 mjd_nodelen= Node_Length((nop));
2222             });
2223         }
2224         DEBUG_OPTIMISE_r(
2225             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2226                 (int)depth * 2 + 2, "",
2227                 (UV)mjd_offset, (UV)mjd_nodelen)
2228         );
2229 #endif
2230         /* But first we check to see if there is a common prefix we can
2231            split out as an EXACT and put in front of the TRIE node.  */
2232         trie->startstate= 1;
2233         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2234             U32 state;
2235             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2236                 U32 ofs = 0;
2237                 I32 idx = -1;
2238                 U32 count = 0;
2239                 const U32 base = trie->states[ state ].trans.base;
2240
2241                 if ( trie->states[state].wordnum )
2242                         count = 1;
2243
2244                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2245                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2246                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2247                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2248                     {
2249                         if ( ++count > 1 ) {
2250                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2251                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2252                             if ( state == 1 ) break;
2253                             if ( count == 2 ) {
2254                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2255                                 DEBUG_OPTIMISE_r(
2256                                     PerlIO_printf(Perl_debug_log,
2257                                         "%*sNew Start State=%"UVuf" Class: [",
2258                                         (int)depth * 2 + 2, "",
2259                                         (UV)state));
2260                                 if (idx >= 0) {
2261                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2262                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2263
2264                                     TRIE_BITMAP_SET(trie,*ch);
2265                                     if ( folder )
2266                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2267                                     DEBUG_OPTIMISE_r(
2268                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2269                                     );
2270                                 }
2271                             }
2272                             TRIE_BITMAP_SET(trie,*ch);
2273                             if ( folder )
2274                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2275                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2276                         }
2277                         idx = ofs;
2278                     }
2279                 }
2280                 if ( count == 1 ) {
2281                     SV **tmp = av_fetch( revcharmap, idx, 0);
2282                     STRLEN len;
2283                     char *ch = SvPV( *tmp, len );
2284                     DEBUG_OPTIMISE_r({
2285                         SV *sv=sv_newmortal();
2286                         PerlIO_printf( Perl_debug_log,
2287                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2288                             (int)depth * 2 + 2, "",
2289                             (UV)state, (UV)idx,
2290                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2291                                 PL_colors[0], PL_colors[1],
2292                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2293                                 PERL_PV_ESCAPE_FIRSTCHAR
2294                             )
2295                         );
2296                     });
2297                     if ( state==1 ) {
2298                         OP( convert ) = nodetype;
2299                         str=STRING(convert);
2300                         STR_LEN(convert)=0;
2301                     }
2302                     STR_LEN(convert) += len;
2303                     while (len--)
2304                         *str++ = *ch++;
2305                 } else {
2306 #ifdef DEBUGGING
2307                     if (state>1)
2308                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2309 #endif
2310                     break;
2311                 }
2312             }
2313             trie->prefixlen = (state-1);
2314             if (str) {
2315                 regnode *n = convert+NODE_SZ_STR(convert);
2316                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2317                 trie->startstate = state;
2318                 trie->minlen -= (state - 1);
2319                 trie->maxlen -= (state - 1);
2320 #ifdef DEBUGGING
2321                /* At least the UNICOS C compiler choked on this
2322                 * being argument to DEBUG_r(), so let's just have
2323                 * it right here. */
2324                if (
2325 #ifdef PERL_EXT_RE_BUILD
2326                    1
2327 #else
2328                    DEBUG_r_TEST
2329 #endif
2330                    ) {
2331                    regnode *fix = convert;
2332                    U32 word = trie->wordcount;
2333                    mjd_nodelen++;
2334                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2335                    while( ++fix < n ) {
2336                        Set_Node_Offset_Length(fix, 0, 0);
2337                    }
2338                    while (word--) {
2339                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2340                        if (tmp) {
2341                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2342                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2343                            else
2344                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2345                        }
2346                    }
2347                }
2348 #endif
2349                 if (trie->maxlen) {
2350                     convert = n;
2351                 } else {
2352                     NEXT_OFF(convert) = (U16)(tail - convert);
2353                     DEBUG_r(optimize= n);
2354                 }
2355             }
2356         }
2357         if (!jumper)
2358             jumper = last;
2359         if ( trie->maxlen ) {
2360             NEXT_OFF( convert ) = (U16)(tail - convert);
2361             ARG_SET( convert, data_slot );
2362             /* Store the offset to the first unabsorbed branch in
2363                jump[0], which is otherwise unused by the jump logic.
2364                We use this when dumping a trie and during optimisation. */
2365             if (trie->jump)
2366                 trie->jump[0] = (U16)(nextbranch - convert);
2367
2368             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2369              *   and there is a bitmap
2370              *   and the first "jump target" node we found leaves enough room
2371              * then convert the TRIE node into a TRIEC node, with the bitmap
2372              * embedded inline in the opcode - this is hypothetically faster.
2373              */
2374             if ( !trie->states[trie->startstate].wordnum
2375                  && trie->bitmap
2376                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2377             {
2378                 OP( convert ) = TRIEC;
2379                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2380                 PerlMemShared_free(trie->bitmap);
2381                 trie->bitmap= NULL;
2382             } else
2383                 OP( convert ) = TRIE;
2384
2385             /* store the type in the flags */
2386             convert->flags = nodetype;
2387             DEBUG_r({
2388             optimize = convert
2389                       + NODE_STEP_REGNODE
2390                       + regarglen[ OP( convert ) ];
2391             });
2392             /* XXX We really should free up the resource in trie now,
2393                    as we won't use them - (which resources?) dmq */
2394         }
2395         /* needed for dumping*/
2396         DEBUG_r(if (optimize) {
2397             regnode *opt = convert;
2398
2399             while ( ++opt < optimize) {
2400                 Set_Node_Offset_Length(opt,0,0);
2401             }
2402             /*
2403                 Try to clean up some of the debris left after the
2404                 optimisation.
2405              */
2406             while( optimize < jumper ) {
2407                 mjd_nodelen += Node_Length((optimize));
2408                 OP( optimize ) = OPTIMIZED;
2409                 Set_Node_Offset_Length(optimize,0,0);
2410                 optimize++;
2411             }
2412             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2413         });
2414     } /* end node insert */
2415
2416     /*  Finish populating the prev field of the wordinfo array.  Walk back
2417      *  from each accept state until we find another accept state, and if
2418      *  so, point the first word's .prev field at the second word. If the
2419      *  second already has a .prev field set, stop now. This will be the
2420      *  case either if we've already processed that word's accept state,
2421      *  or that state had multiple words, and the overspill words were
2422      *  already linked up earlier.
2423      */
2424     {
2425         U16 word;
2426         U32 state;
2427         U16 prev;
2428
2429         for (word=1; word <= trie->wordcount; word++) {
2430             prev = 0;
2431             if (trie->wordinfo[word].prev)
2432                 continue;
2433             state = trie->wordinfo[word].accept;
2434             while (state) {
2435                 state = prev_states[state];
2436                 if (!state)
2437                     break;
2438                 prev = trie->states[state].wordnum;
2439                 if (prev)
2440                     break;
2441             }
2442             trie->wordinfo[word].prev = prev;
2443         }
2444         Safefree(prev_states);
2445     }
2446
2447
2448     /* and now dump out the compressed format */
2449     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2450
2451     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2452 #ifdef DEBUGGING
2453     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2454     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2455 #else
2456     SvREFCNT_dec_NN(revcharmap);
2457 #endif
2458     return trie->jump
2459            ? MADE_JUMP_TRIE
2460            : trie->startstate>1
2461              ? MADE_EXACT_TRIE
2462              : MADE_TRIE;
2463 }
2464
2465 STATIC void
2466 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2467 {
2468 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2469
2470    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2471    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2472    ISBN 0-201-10088-6
2473
2474    We find the fail state for each state in the trie, this state is the longest proper
2475    suffix of the current state's 'word' that is also a proper prefix of another word in our
2476    trie. State 1 represents the word '' and is thus the default fail state. This allows
2477    the DFA not to have to restart after its tried and failed a word at a given point, it
2478    simply continues as though it had been matching the other word in the first place.
2479    Consider
2480       'abcdgu'=~/abcdefg|cdgu/
2481    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2482    fail, which would bring us to the state representing 'd' in the second word where we would
2483    try 'g' and succeed, proceeding to match 'cdgu'.
2484  */
2485  /* add a fail transition */
2486     const U32 trie_offset = ARG(source);
2487     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2488     U32 *q;
2489     const U32 ucharcount = trie->uniquecharcount;
2490     const U32 numstates = trie->statecount;
2491     const U32 ubound = trie->lasttrans + ucharcount;
2492     U32 q_read = 0;
2493     U32 q_write = 0;
2494     U32 charid;
2495     U32 base = trie->states[ 1 ].trans.base;
2496     U32 *fail;
2497     reg_ac_data *aho;
2498     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2499     GET_RE_DEBUG_FLAGS_DECL;
2500
2501     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2502 #ifndef DEBUGGING
2503     PERL_UNUSED_ARG(depth);
2504 #endif
2505
2506
2507     ARG_SET( stclass, data_slot );
2508     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2509     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2510     aho->trie=trie_offset;
2511     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2512     Copy( trie->states, aho->states, numstates, reg_trie_state );
2513     Newxz( q, numstates, U32);
2514     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2515     aho->refcount = 1;
2516     fail = aho->fail;
2517     /* initialize fail[0..1] to be 1 so that we always have
2518        a valid final fail state */
2519     fail[ 0 ] = fail[ 1 ] = 1;
2520
2521     for ( charid = 0; charid < ucharcount ; charid++ ) {
2522         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2523         if ( newstate ) {
2524             q[ q_write ] = newstate;
2525             /* set to point at the root */
2526             fail[ q[ q_write++ ] ]=1;
2527         }
2528     }
2529     while ( q_read < q_write) {
2530         const U32 cur = q[ q_read++ % numstates ];
2531         base = trie->states[ cur ].trans.base;
2532
2533         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2534             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2535             if (ch_state) {
2536                 U32 fail_state = cur;
2537                 U32 fail_base;
2538                 do {
2539                     fail_state = fail[ fail_state ];
2540                     fail_base = aho->states[ fail_state ].trans.base;
2541                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2542
2543                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2544                 fail[ ch_state ] = fail_state;
2545                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2546                 {
2547                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2548                 }
2549                 q[ q_write++ % numstates] = ch_state;
2550             }
2551         }
2552     }
2553     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2554        when we fail in state 1, this allows us to use the
2555        charclass scan to find a valid start char. This is based on the principle
2556        that theres a good chance the string being searched contains lots of stuff
2557        that cant be a start char.
2558      */
2559     fail[ 0 ] = fail[ 1 ] = 0;
2560     DEBUG_TRIE_COMPILE_r({
2561         PerlIO_printf(Perl_debug_log,
2562                       "%*sStclass Failtable (%"UVuf" states): 0",
2563                       (int)(depth * 2), "", (UV)numstates
2564         );
2565         for( q_read=1; q_read<numstates; q_read++ ) {
2566             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2567         }
2568         PerlIO_printf(Perl_debug_log, "\n");
2569     });
2570     Safefree(q);
2571     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2572 }
2573
2574
2575 /*
2576  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2577  * These need to be revisited when a newer toolchain becomes available.
2578  */
2579 #if defined(__sparc64__) && defined(__GNUC__)
2580 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2581 #       undef  SPARC64_GCC_WORKAROUND
2582 #       define SPARC64_GCC_WORKAROUND 1
2583 #   endif
2584 #endif
2585
2586 #define DEBUG_PEEP(str,scan,depth) \
2587     DEBUG_OPTIMISE_r({if (scan){ \
2588        SV * const mysv=sv_newmortal(); \
2589        regnode *Next = regnext(scan); \
2590        regprop(RExC_rx, mysv, scan); \
2591        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2592        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2593        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2594    }});
2595
2596
2597 /* The below joins as many adjacent EXACTish nodes as possible into a single
2598  * one.  The regop may be changed if the node(s) contain certain sequences that
2599  * require special handling.  The joining is only done if:
2600  * 1) there is room in the current conglomerated node to entirely contain the
2601  *    next one.
2602  * 2) they are the exact same node type
2603  *
2604  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2605  * these get optimized out
2606  *
2607  * If a node is to match under /i (folded), the number of characters it matches
2608  * can be different than its character length if it contains a multi-character
2609  * fold.  *min_subtract is set to the total delta of the input nodes.
2610  *
2611  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2612  * and contains LATIN SMALL LETTER SHARP S
2613  *
2614  * This is as good a place as any to discuss the design of handling these
2615  * multi-character fold sequences.  It's been wrong in Perl for a very long
2616  * time.  There are three code points in Unicode whose multi-character folds
2617  * were long ago discovered to mess things up.  The previous designs for
2618  * dealing with these involved assigning a special node for them.  This
2619  * approach doesn't work, as evidenced by this example:
2620  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2621  * Both these fold to "sss", but if the pattern is parsed to create a node that
2622  * would match just the \xDF, it won't be able to handle the case where a
2623  * successful match would have to cross the node's boundary.  The new approach
2624  * that hopefully generally solves the problem generates an EXACTFU_SS node
2625  * that is "sss".
2626  *
2627  * It turns out that there are problems with all multi-character folds, and not
2628  * just these three.  Now the code is general, for all such cases, but the
2629  * three still have some special handling.  The approach taken is:
2630  * 1)   This routine examines each EXACTFish node that could contain multi-
2631  *      character fold sequences.  It returns in *min_subtract how much to
2632  *      subtract from the the actual length of the string to get a real minimum
2633  *      match length; it is 0 if there are no multi-char folds.  This delta is
2634  *      used by the caller to adjust the min length of the match, and the delta
2635  *      between min and max, so that the optimizer doesn't reject these
2636  *      possibilities based on size constraints.
2637  * 2)   Certain of these sequences require special handling by the trie code,
2638  *      so, if found, this code changes the joined node type to special ops:
2639  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2640  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2641  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2642  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2643  *      there is a possible fold length change.  That means that a regular
2644  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2645  *      with length changes, and so can be processed faster.  regexec.c takes
2646  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2647  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2648  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2649  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2650  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2651  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2652  *      possibilities for the non-UTF8 patterns are quite simple, except for
2653  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2654  *      members of a fold-pair, and arrays are set up for all of them so that
2655  *      the other member of the pair can be found quickly.  Code elsewhere in
2656  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2657  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2658  *      described in the next item.
2659  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2660  *      'ss' or not is not knowable at compile time.  It will match iff the
2661  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2662  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2663  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2664  *      described in item 3).  An assumption that the optimizer part of
2665  *      regexec.c (probably unwittingly) makes is that a character in the
2666  *      pattern corresponds to at most a single character in the target string.
2667  *      (And I do mean character, and not byte here, unlike other parts of the
2668  *      documentation that have never been updated to account for multibyte
2669  *      Unicode.)  This assumption is wrong only in this case, as all other
2670  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2671  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2672  *      reluctant to try to change this assumption, so instead the code punts.
2673  *      This routine examines EXACTF nodes for the sharp s, and returns a
2674  *      boolean indicating whether or not the node is an EXACTF node that
2675  *      contains a sharp s.  When it is true, the caller sets a flag that later
2676  *      causes the optimizer in this file to not set values for the floating
2677  *      and fixed string lengths, and thus avoids the optimizer code in
2678  *      regexec.c that makes the invalid assumption.  Thus, there is no
2679  *      optimization based on string lengths for EXACTF nodes that contain the
2680  *      sharp s.  This only happens for /id rules (which means the pattern
2681  *      isn't in UTF-8).
2682  */
2683
2684 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2685     if (PL_regkind[OP(scan)] == EXACT) \
2686         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2687
2688 STATIC U32
2689 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2690     /* Merge several consecutive EXACTish nodes into one. */
2691     regnode *n = regnext(scan);
2692     U32 stringok = 1;
2693     regnode *next = scan + NODE_SZ_STR(scan);
2694     U32 merged = 0;
2695     U32 stopnow = 0;
2696 #ifdef DEBUGGING
2697     regnode *stop = scan;
2698     GET_RE_DEBUG_FLAGS_DECL;
2699 #else
2700     PERL_UNUSED_ARG(depth);
2701 #endif
2702
2703     PERL_ARGS_ASSERT_JOIN_EXACT;
2704 #ifndef EXPERIMENTAL_INPLACESCAN
2705     PERL_UNUSED_ARG(flags);
2706     PERL_UNUSED_ARG(val);
2707 #endif
2708     DEBUG_PEEP("join",scan,depth);
2709
2710     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2711      * EXACT ones that are mergeable to the current one. */
2712     while (n
2713            && (PL_regkind[OP(n)] == NOTHING
2714                || (stringok && OP(n) == OP(scan)))
2715            && NEXT_OFF(n)
2716            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2717     {
2718
2719         if (OP(n) == TAIL || n > next)
2720             stringok = 0;
2721         if (PL_regkind[OP(n)] == NOTHING) {
2722             DEBUG_PEEP("skip:",n,depth);
2723             NEXT_OFF(scan) += NEXT_OFF(n);
2724             next = n + NODE_STEP_REGNODE;
2725 #ifdef DEBUGGING
2726             if (stringok)
2727                 stop = n;
2728 #endif
2729             n = regnext(n);
2730         }
2731         else if (stringok) {
2732             const unsigned int oldl = STR_LEN(scan);
2733             regnode * const nnext = regnext(n);
2734
2735             /* XXX I (khw) kind of doubt that this works on platforms where
2736              * U8_MAX is above 255 because of lots of other assumptions */
2737             /* Don't join if the sum can't fit into a single node */
2738             if (oldl + STR_LEN(n) > U8_MAX)
2739                 break;
2740
2741             DEBUG_PEEP("merg",n,depth);
2742             merged++;
2743
2744             NEXT_OFF(scan) += NEXT_OFF(n);
2745             STR_LEN(scan) += STR_LEN(n);
2746             next = n + NODE_SZ_STR(n);
2747             /* Now we can overwrite *n : */
2748             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2749 #ifdef DEBUGGING
2750             stop = next - 1;
2751 #endif
2752             n = nnext;
2753             if (stopnow) break;
2754         }
2755
2756 #ifdef EXPERIMENTAL_INPLACESCAN
2757         if (flags && !NEXT_OFF(n)) {
2758             DEBUG_PEEP("atch", val, depth);
2759             if (reg_off_by_arg[OP(n)]) {
2760                 ARG_SET(n, val - n);
2761             }
2762             else {
2763                 NEXT_OFF(n) = val - n;
2764             }
2765             stopnow = 1;
2766         }
2767 #endif
2768     }
2769
2770     *min_subtract = 0;
2771     *has_exactf_sharp_s = FALSE;
2772
2773     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2774      * can now analyze for sequences of problematic code points.  (Prior to
2775      * this final joining, sequences could have been split over boundaries, and
2776      * hence missed).  The sequences only happen in folding, hence for any
2777      * non-EXACT EXACTish node */
2778     if (OP(scan) != EXACT) {
2779         const U8 * const s0 = (U8*) STRING(scan);
2780         const U8 * s = s0;
2781         const U8 * const s_end = s0 + STR_LEN(scan);
2782
2783         /* One pass is made over the node's string looking for all the
2784          * possibilities.  to avoid some tests in the loop, there are two main
2785          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2786          * non-UTF-8 */
2787         if (UTF) {
2788
2789             /* Examine the string for a multi-character fold sequence.  UTF-8
2790              * patterns have all characters pre-folded by the time this code is
2791              * executed */
2792             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2793                                      length sequence we are looking for is 2 */
2794             {
2795                 int count = 0;
2796                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2797                 if (! len) {    /* Not a multi-char fold: get next char */
2798                     s += UTF8SKIP(s);
2799                     continue;
2800                 }
2801
2802                 /* Nodes with 'ss' require special handling, except for EXACTFL
2803                  * and EXACTFA for which there is no multi-char fold to this */
2804                 if (len == 2 && *s == 's' && *(s+1) == 's'
2805                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2806                 {
2807                     count = 2;
2808                     OP(scan) = EXACTFU_SS;
2809                     s += 2;
2810                 }
2811                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2812                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2813                                       COMBINING_DIAERESIS_UTF8
2814                                       COMBINING_ACUTE_ACCENT_UTF8,
2815                                    6)
2816                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2817                                          COMBINING_DIAERESIS_UTF8
2818                                          COMBINING_ACUTE_ACCENT_UTF8,
2819                                      6)))
2820                 {
2821                     count = 3;
2822
2823                     /* These two folds require special handling by trie's, so
2824                      * change the node type to indicate this.  If EXACTFA and
2825                      * EXACTFL were ever to be handled by trie's, this would
2826                      * have to be changed.  If this node has already been
2827                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2828                      * (khw) think it doesn't matter in regexec.c for UTF
2829                      * patterns, but no need to change it */
2830                     if (OP(scan) == EXACTFU) {
2831                         OP(scan) = EXACTFU_TRICKYFOLD;
2832                     }
2833                     s += 6;
2834                 }
2835                 else { /* Here is a generic multi-char fold. */
2836                     const U8* multi_end  = s + len;
2837
2838                     /* Count how many characters in it.  In the case of /l and
2839                      * /aa, no folds which contain ASCII code points are
2840                      * allowed, so check for those, and skip if found.  (In
2841                      * EXACTFL, no folds are allowed to any Latin1 code point,
2842                      * not just ASCII.  But there aren't any of these
2843                      * currently, nor ever likely, so don't take the time to
2844                      * test for them.  The code that generates the
2845                      * is_MULTI_foo() macros croaks should one actually get put
2846                      * into Unicode .) */
2847                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2848                         count = utf8_length(s, multi_end);
2849                         s = multi_end;
2850                     }
2851                     else {
2852                         while (s < multi_end) {
2853                             if (isASCII(*s)) {
2854                                 s++;
2855                                 goto next_iteration;
2856                             }
2857                             else {
2858                                 s += UTF8SKIP(s);
2859                             }
2860                             count++;
2861                         }
2862                     }
2863                 }
2864
2865                 /* The delta is how long the sequence is minus 1 (1 is how long
2866                  * the character that folds to the sequence is) */
2867                 *min_subtract += count - 1;
2868             next_iteration: ;
2869             }
2870         }
2871         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2872
2873             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2874              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2875              * nodes can't have multi-char folds to this range (and there are
2876              * no existing ones in the upper latin1 range).  In the EXACTF
2877              * case we look also for the sharp s, which can be in the final
2878              * position.  Otherwise we can stop looking 1 byte earlier because
2879              * have to find at least two characters for a multi-fold */
2880             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2881
2882             /* The below is perhaps overboard, but this allows us to save a
2883              * test each time through the loop at the expense of a mask.  This
2884              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2885              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2886              * are 64.  This uses an exclusive 'or' to find that bit and then
2887              * inverts it to form a mask, with just a single 0, in the bit
2888              * position where 'S' and 's' differ. */
2889             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2890             const U8 s_masked = 's' & S_or_s_mask;
2891
2892             while (s < upper) {
2893                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2894                 if (! len) {    /* Not a multi-char fold. */
2895                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2896                     {
2897                         *has_exactf_sharp_s = TRUE;
2898                     }
2899                     s++;
2900                     continue;
2901                 }
2902
2903                 if (len == 2
2904                     && ((*s & S_or_s_mask) == s_masked)
2905                     && ((*(s+1) & S_or_s_mask) == s_masked))
2906                 {
2907
2908                     /* EXACTF nodes need to know that the minimum length
2909                      * changed so that a sharp s in the string can match this
2910                      * ss in the pattern, but they remain EXACTF nodes, as they
2911                      * won't match this unless the target string is is UTF-8,
2912                      * which we don't know until runtime */
2913                     if (OP(scan) != EXACTF) {
2914                         OP(scan) = EXACTFU_SS;
2915                     }
2916                 }
2917
2918                 *min_subtract += len - 1;
2919                 s += len;
2920             }
2921         }
2922     }
2923
2924 #ifdef DEBUGGING
2925     /* Allow dumping but overwriting the collection of skipped
2926      * ops and/or strings with fake optimized ops */
2927     n = scan + NODE_SZ_STR(scan);
2928     while (n <= stop) {
2929         OP(n) = OPTIMIZED;
2930         FLAGS(n) = 0;
2931         NEXT_OFF(n) = 0;
2932         n++;
2933     }
2934 #endif
2935     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2936     return stopnow;
2937 }
2938
2939 /* REx optimizer.  Converts nodes into quicker variants "in place".
2940    Finds fixed substrings.  */
2941
2942 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2943    to the position after last scanned or to NULL. */
2944
2945 #define INIT_AND_WITHP \
2946     assert(!and_withp); \
2947     Newx(and_withp,1,struct regnode_charclass_class); \
2948     SAVEFREEPV(and_withp)
2949
2950 /* this is a chain of data about sub patterns we are processing that
2951    need to be handled separately/specially in study_chunk. Its so
2952    we can simulate recursion without losing state.  */
2953 struct scan_frame;
2954 typedef struct scan_frame {
2955     regnode *last;  /* last node to process in this frame */
2956     regnode *next;  /* next node to process when last is reached */
2957     struct scan_frame *prev; /*previous frame*/
2958     I32 stop; /* what stopparen do we use */
2959 } scan_frame;
2960
2961
2962 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2963
2964 STATIC I32
2965 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2966                         I32 *minlenp, I32 *deltap,
2967                         regnode *last,
2968                         scan_data_t *data,
2969                         I32 stopparen,
2970                         U8* recursed,
2971                         struct regnode_charclass_class *and_withp,
2972                         U32 flags, U32 depth)
2973                         /* scanp: Start here (read-write). */
2974                         /* deltap: Write maxlen-minlen here. */
2975                         /* last: Stop before this one. */
2976                         /* data: string data about the pattern */
2977                         /* stopparen: treat close N as END */
2978                         /* recursed: which subroutines have we recursed into */
2979                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2980 {
2981     dVAR;
2982     I32 min = 0;    /* There must be at least this number of characters to match */
2983     I32 pars = 0, code;
2984     regnode *scan = *scanp, *next;
2985     I32 delta = 0;
2986     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2987     int is_inf_internal = 0;            /* The studied chunk is infinite */
2988     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2989     scan_data_t data_fake;
2990     SV *re_trie_maxbuff = NULL;
2991     regnode *first_non_open = scan;
2992     I32 stopmin = I32_MAX;
2993     scan_frame *frame = NULL;
2994     GET_RE_DEBUG_FLAGS_DECL;
2995
2996     PERL_ARGS_ASSERT_STUDY_CHUNK;
2997
2998 #ifdef DEBUGGING
2999     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3000 #endif
3001
3002     if ( depth == 0 ) {
3003         while (first_non_open && OP(first_non_open) == OPEN)
3004             first_non_open=regnext(first_non_open);
3005     }
3006
3007
3008   fake_study_recurse:
3009     while ( scan && OP(scan) != END && scan < last ){
3010         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3011                                    node length to get a real minimum (because
3012                                    the folded version may be shorter) */
3013         bool has_exactf_sharp_s = FALSE;
3014         /* Peephole optimizer: */
3015         DEBUG_STUDYDATA("Peep:", data,depth);
3016         DEBUG_PEEP("Peep",scan,depth);
3017
3018         /* Its not clear to khw or hv why this is done here, and not in the
3019          * clauses that deal with EXACT nodes.  khw's guess is that it's
3020          * because of a previous design */
3021         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3022
3023         /* Follow the next-chain of the current node and optimize
3024            away all the NOTHINGs from it.  */
3025         if (OP(scan) != CURLYX) {
3026             const int max = (reg_off_by_arg[OP(scan)]
3027                        ? I32_MAX
3028                        /* I32 may be smaller than U16 on CRAYs! */
3029                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3030             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3031             int noff;
3032             regnode *n = scan;
3033
3034             /* Skip NOTHING and LONGJMP. */
3035             while ((n = regnext(n))
3036                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3037                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3038                    && off + noff < max)
3039                 off += noff;
3040             if (reg_off_by_arg[OP(scan)])
3041                 ARG(scan) = off;
3042             else
3043                 NEXT_OFF(scan) = off;
3044         }
3045
3046
3047
3048         /* The principal pseudo-switch.  Cannot be a switch, since we
3049            look into several different things.  */
3050         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3051                    || OP(scan) == IFTHEN) {
3052             next = regnext(scan);
3053             code = OP(scan);
3054             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3055
3056             if (OP(next) == code || code == IFTHEN) {
3057                 /* NOTE - There is similar code to this block below for handling
3058                    TRIE nodes on a re-study.  If you change stuff here check there
3059                    too. */
3060                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3061                 struct regnode_charclass_class accum;
3062                 regnode * const startbranch=scan;
3063
3064                 if (flags & SCF_DO_SUBSTR)
3065                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3066                 if (flags & SCF_DO_STCLASS)
3067                     cl_init_zero(pRExC_state, &accum);
3068
3069                 while (OP(scan) == code) {
3070                     I32 deltanext, minnext, f = 0, fake;
3071                     struct regnode_charclass_class this_class;
3072
3073                     num++;
3074                     data_fake.flags = 0;
3075                     if (data) {
3076                         data_fake.whilem_c = data->whilem_c;
3077                         data_fake.last_closep = data->last_closep;
3078                     }
3079                     else
3080                         data_fake.last_closep = &fake;
3081
3082                     data_fake.pos_delta = delta;
3083                     next = regnext(scan);
3084                     scan = NEXTOPER(scan);
3085                     if (code != BRANCH)
3086                         scan = NEXTOPER(scan);
3087                     if (flags & SCF_DO_STCLASS) {
3088                         cl_init(pRExC_state, &this_class);
3089                         data_fake.start_class = &this_class;
3090                         f = SCF_DO_STCLASS_AND;
3091                     }
3092                     if (flags & SCF_WHILEM_VISITED_POS)
3093                         f |= SCF_WHILEM_VISITED_POS;
3094
3095                     /* we suppose the run is continuous, last=next...*/
3096                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3097                                           next, &data_fake,
3098                                           stopparen, recursed, NULL, f,depth+1);
3099                     if (min1 > minnext)
3100                         min1 = minnext;
3101                     if (max1 < minnext + deltanext)
3102                         max1 = minnext + deltanext;
3103                     if (deltanext == I32_MAX)
3104                         is_inf = is_inf_internal = 1;
3105                     scan = next;
3106                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3107                         pars++;
3108                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3109                         if ( stopmin > minnext)
3110                             stopmin = min + min1;
3111                         flags &= ~SCF_DO_SUBSTR;
3112                         if (data)
3113                             data->flags |= SCF_SEEN_ACCEPT;
3114                     }
3115                     if (data) {
3116                         if (data_fake.flags & SF_HAS_EVAL)
3117                             data->flags |= SF_HAS_EVAL;
3118                         data->whilem_c = data_fake.whilem_c;
3119                     }
3120                     if (flags & SCF_DO_STCLASS)
3121                         cl_or(pRExC_state, &accum, &this_class);
3122                 }
3123                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3124                     min1 = 0;
3125                 if (flags & SCF_DO_SUBSTR) {
3126                     data->pos_min += min1;
3127                     data->pos_delta += max1 - min1;
3128                     if (max1 != min1 || is_inf)
3129                         data->longest = &(data->longest_float);
3130                 }
3131                 min += min1;
3132                 delta += max1 - min1;
3133                 if (flags & SCF_DO_STCLASS_OR) {
3134                     cl_or(pRExC_state, data->start_class, &accum);
3135                     if (min1) {
3136                         cl_and(data->start_class, and_withp);
3137                         flags &= ~SCF_DO_STCLASS;
3138                     }
3139                 }
3140                 else if (flags & SCF_DO_STCLASS_AND) {
3141                     if (min1) {
3142                         cl_and(data->start_class, &accum);
3143                         flags &= ~SCF_DO_STCLASS;
3144                     }
3145                     else {
3146                         /* Switch to OR mode: cache the old value of
3147                          * data->start_class */
3148                         INIT_AND_WITHP;
3149                         StructCopy(data->start_class, and_withp,
3150                                    struct regnode_charclass_class);
3151                         flags &= ~SCF_DO_STCLASS_AND;
3152                         StructCopy(&accum, data->start_class,
3153                                    struct regnode_charclass_class);
3154                         flags |= SCF_DO_STCLASS_OR;
3155                         SET_SSC_EOS(data->start_class);
3156                     }
3157                 }
3158
3159                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3160                 /* demq.
3161
3162                    Assuming this was/is a branch we are dealing with: 'scan' now
3163                    points at the item that follows the branch sequence, whatever
3164                    it is. We now start at the beginning of the sequence and look
3165                    for subsequences of
3166
3167                    BRANCH->EXACT=>x1
3168                    BRANCH->EXACT=>x2
3169                    tail
3170
3171                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3172
3173                    If we can find such a subsequence we need to turn the first
3174                    element into a trie and then add the subsequent branch exact
3175                    strings to the trie.
3176
3177                    We have two cases
3178
3179                      1. patterns where the whole set of branches can be converted.
3180
3181                      2. patterns where only a subset can be converted.
3182
3183                    In case 1 we can replace the whole set with a single regop
3184                    for the trie. In case 2 we need to keep the start and end
3185                    branches so
3186
3187                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3188                      becomes BRANCH TRIE; BRANCH X;
3189
3190                   There is an additional case, that being where there is a
3191                   common prefix, which gets split out into an EXACT like node
3192                   preceding the TRIE node.
3193
3194                   If x(1..n)==tail then we can do a simple trie, if not we make
3195                   a "jump" trie, such that when we match the appropriate word
3196                   we "jump" to the appropriate tail node. Essentially we turn
3197                   a nested if into a case structure of sorts.
3198
3199                 */
3200
3201                     int made=0;
3202                     if (!re_trie_maxbuff) {
3203                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3204                         if (!SvIOK(re_trie_maxbuff))
3205                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3206                     }
3207                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3208                         regnode *cur;
3209                         regnode *first = (regnode *)NULL;
3210                         regnode *last = (regnode *)NULL;
3211                         regnode *tail = scan;
3212                         U8 trietype = 0;
3213                         U32 count=0;
3214
3215 #ifdef DEBUGGING
3216                         SV * const mysv = sv_newmortal();       /* for dumping */
3217 #endif
3218                         /* var tail is used because there may be a TAIL
3219                            regop in the way. Ie, the exacts will point to the
3220                            thing following the TAIL, but the last branch will
3221                            point at the TAIL. So we advance tail. If we
3222                            have nested (?:) we may have to move through several
3223                            tails.
3224                          */
3225
3226                         while ( OP( tail ) == TAIL ) {
3227                             /* this is the TAIL generated by (?:) */
3228                             tail = regnext( tail );
3229                         }
3230
3231
3232                         DEBUG_TRIE_COMPILE_r({
3233                             regprop(RExC_rx, mysv, tail );
3234                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3235                                 (int)depth * 2 + 2, "",
3236                                 "Looking for TRIE'able sequences. Tail node is: ",
3237                                 SvPV_nolen_const( mysv )
3238                             );
3239                         });
3240
3241                         /*
3242
3243                             Step through the branches
3244                                 cur represents each branch,
3245                                 noper is the first thing to be matched as part of that branch
3246                                 noper_next is the regnext() of that node.
3247
3248                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3249                             via a "jump trie" but we also support building with NOJUMPTRIE,
3250                             which restricts the trie logic to structures like /FOO|BAR/.
3251
3252                             If noper is a trieable nodetype then the branch is a possible optimization
3253                             target. If we are building under NOJUMPTRIE then we require that noper_next
3254                             is the same as scan (our current position in the regex program).
3255
3256                             Once we have two or more consecutive such branches we can create a
3257                             trie of the EXACT's contents and stitch it in place into the program.
3258
3259                             If the sequence represents all of the branches in the alternation we
3260                             replace the entire thing with a single TRIE node.
3261
3262                             Otherwise when it is a subsequence we need to stitch it in place and
3263                             replace only the relevant branches. This means the first branch has
3264                             to remain as it is used by the alternation logic, and its next pointer,
3265                             and needs to be repointed at the item on the branch chain following
3266                             the last branch we have optimized away.
3267
3268                             This could be either a BRANCH, in which case the subsequence is internal,
3269                             or it could be the item following the branch sequence in which case the
3270                             subsequence is at the end (which does not necessarily mean the first node
3271                             is the start of the alternation).
3272
3273                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3274
3275                                 optype          |  trietype
3276                                 ----------------+-----------
3277                                 NOTHING         | NOTHING
3278                                 EXACT           | EXACT
3279                                 EXACTFU         | EXACTFU
3280                                 EXACTFU_SS      | EXACTFU
3281                                 EXACTFU_TRICKYFOLD | EXACTFU
3282                                 EXACTFA         | 0
3283
3284
3285                         */
3286 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3287                        ( EXACT == (X) )   ? EXACT :        \
3288                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3289                        0 )
3290
3291                         /* dont use tail as the end marker for this traverse */
3292                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3293                             regnode * const noper = NEXTOPER( cur );
3294                             U8 noper_type = OP( noper );
3295                             U8 noper_trietype = TRIE_TYPE( noper_type );
3296 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3297                             regnode * const noper_next = regnext( noper );
3298                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3299                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3300 #endif
3301
3302                             DEBUG_TRIE_COMPILE_r({
3303                                 regprop(RExC_rx, mysv, cur);
3304                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3305                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3306
3307                                 regprop(RExC_rx, mysv, noper);
3308                                 PerlIO_printf( Perl_debug_log, " -> %s",
3309                                     SvPV_nolen_const(mysv));
3310
3311                                 if ( noper_next ) {
3312                                   regprop(RExC_rx, mysv, noper_next );
3313                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3314                                     SvPV_nolen_const(mysv));
3315                                 }
3316                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3317                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3318                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3319                                 );
3320                             });
3321
3322                             /* Is noper a trieable nodetype that can be merged with the
3323                              * current trie (if there is one)? */
3324                             if ( noper_trietype
3325                                   &&
3326                                   (
3327                                         ( noper_trietype == NOTHING)
3328                                         || ( trietype == NOTHING )
3329                                         || ( trietype == noper_trietype )
3330                                   )
3331 #ifdef NOJUMPTRIE
3332                                   && noper_next == tail
3333 #endif
3334                                   && count < U16_MAX)
3335                             {
3336                                 /* Handle mergable triable node
3337                                  * Either we are the first node in a new trieable sequence,
3338                                  * in which case we do some bookkeeping, otherwise we update
3339                                  * the end pointer. */
3340                                 if ( !first ) {
3341                                     first = cur;
3342                                     if ( noper_trietype == NOTHING ) {
3343 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3344                                         regnode * const noper_next = regnext( noper );
3345                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3346                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3347 #endif
3348
3349                                         if ( noper_next_trietype ) {
3350                                             trietype = noper_next_trietype;
3351                                         } else if (noper_next_type)  {
3352                                             /* a NOTHING regop is 1 regop wide. We need at least two
3353                                              * for a trie so we can't merge this in */
3354                                             first = NULL;
3355                                         }
3356                                     } else {
3357                                         trietype = noper_trietype;
3358                                     }
3359                                 } else {
3360                                     if ( trietype == NOTHING )
3361                                         trietype = noper_trietype;
3362                                     last = cur;
3363                                 }
3364                                 if (first)
3365                                     count++;
3366                             } /* end handle mergable triable node */
3367                             else {
3368                                 /* handle unmergable node -
3369                                  * noper may either be a triable node which can not be tried
3370                                  * together with the current trie, or a non triable node */
3371                                 if ( last ) {
3372                                     /* If last is set and trietype is not NOTHING then we have found
3373                                      * at least two triable branch sequences in a row of a similar
3374                                      * trietype so we can turn them into a trie. If/when we
3375                                      * allow NOTHING to start a trie sequence this condition will be
3376                                      * required, and it isn't expensive so we leave it in for now. */
3377                                     if ( trietype && trietype != NOTHING )
3378                                         make_trie( pRExC_state,
3379                                                 startbranch, first, cur, tail, count,
3380                                                 trietype, depth+1 );
3381                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3382                                 }
3383                                 if ( noper_trietype
3384 #ifdef NOJUMPTRIE
3385                                      && noper_next == tail
3386 #endif
3387                                 ){
3388                                     /* noper is triable, so we can start a new trie sequence */
3389                                     count = 1;
3390                                     first = cur;
3391                                     trietype = noper_trietype;
3392                                 } else if (first) {
3393                                     /* if we already saw a first but the current node is not triable then we have
3394                                      * to reset the first information. */
3395                                     count = 0;
3396                                     first = NULL;
3397                                     trietype = 0;
3398                                 }
3399                             } /* end handle unmergable node */
3400                         } /* loop over branches */
3401                         DEBUG_TRIE_COMPILE_r({
3402                             regprop(RExC_rx, mysv, cur);
3403                             PerlIO_printf( Perl_debug_log,
3404                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3405                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3406
3407                         });
3408                         if ( last && trietype ) {
3409                             if ( trietype != NOTHING ) {
3410                                 /* the last branch of the sequence was part of a trie,
3411                                  * so we have to construct it here outside of the loop
3412                                  */
3413                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3414 #ifdef TRIE_STUDY_OPT
3415                                 if ( ((made == MADE_EXACT_TRIE &&
3416                                      startbranch == first)
3417                                      || ( first_non_open == first )) &&
3418                                      depth==0 ) {
3419                                     flags |= SCF_TRIE_RESTUDY;
3420                                     if ( startbranch == first
3421                                          && scan == tail )
3422                                     {
3423                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3424                                     }
3425                                 }
3426 #endif
3427                             } else {
3428                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3429                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3430                                  */
3431                                 if ( startbranch == first ) {
3432                                     regnode *opt;
3433                                     /* the entire thing is a NOTHING sequence, something like this:
3434                                      * (?:|) So we can turn it into a plain NOTHING op. */
3435                                     DEBUG_TRIE_COMPILE_r({
3436                                         regprop(RExC_rx, mysv, cur);
3437                                         PerlIO_printf( Perl_debug_log,
3438                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3439                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3440
3441                                     });
3442                                     OP(startbranch)= NOTHING;
3443                                     NEXT_OFF(startbranch)= tail - startbranch;
3444                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3445                                         OP(opt)= OPTIMIZED;
3446                                 }
3447                             }
3448                         } /* end if ( last) */
3449                     } /* TRIE_MAXBUF is non zero */
3450
3451                 } /* do trie */
3452
3453             }
3454             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3455                 scan = NEXTOPER(NEXTOPER(scan));
3456             } else                      /* single branch is optimized. */
3457                 scan = NEXTOPER(scan);
3458             continue;
3459         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3460             scan_frame *newframe = NULL;
3461             I32 paren;
3462             regnode *start;
3463             regnode *end;
3464
3465             if (OP(scan) != SUSPEND) {
3466             /* set the pointer */
3467                 if (OP(scan) == GOSUB) {
3468                     paren = ARG(scan);
3469                     RExC_recurse[ARG2L(scan)] = scan;
3470                     start = RExC_open_parens[paren-1];
3471                     end   = RExC_close_parens[paren-1];
3472                 } else {
3473                     paren = 0;
3474                     start = RExC_rxi->program + 1;
3475                     end   = RExC_opend;
3476                 }
3477                 if (!recursed) {
3478                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3479                     SAVEFREEPV(recursed);
3480                 }
3481                 if (!PAREN_TEST(recursed,paren+1)) {
3482                     PAREN_SET(recursed,paren+1);
3483                     Newx(newframe,1,scan_frame);
3484                 } else {
3485                     if (flags & SCF_DO_SUBSTR) {
3486                         SCAN_COMMIT(pRExC_state,data,minlenp);
3487                         data->longest = &(data->longest_float);
3488                     }
3489                     is_inf = is_inf_internal = 1;
3490                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3491                         cl_anything(pRExC_state, data->start_class);
3492                     flags &= ~SCF_DO_STCLASS;
3493                 }
3494             } else {
3495                 Newx(newframe,1,scan_frame);
3496                 paren = stopparen;
3497                 start = scan+2;
3498                 end = regnext(scan);
3499             }
3500             if (newframe) {
3501                 assert(start);
3502                 assert(end);
3503                 SAVEFREEPV(newframe);
3504                 newframe->next = regnext(scan);
3505                 newframe->last = last;
3506                 newframe->stop = stopparen;
3507                 newframe->prev = frame;
3508
3509                 frame = newframe;
3510                 scan =  start;
3511                 stopparen = paren;
3512                 last = end;
3513
3514                 continue;
3515             }
3516         }
3517         else if (OP(scan) == EXACT) {
3518             I32 l = STR_LEN(scan);
3519             UV uc;
3520             if (UTF) {
3521                 const U8 * const s = (U8*)STRING(scan);
3522                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3523                 l = utf8_length(s, s + l);
3524             } else {
3525                 uc = *((U8*)STRING(scan));
3526             }
3527             min += l;
3528             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3529                 /* The code below prefers earlier match for fixed
3530                    offset, later match for variable offset.  */
3531                 if (data->last_end == -1) { /* Update the start info. */
3532                     data->last_start_min = data->pos_min;
3533                     data->last_start_max = is_inf
3534                         ? I32_MAX : data->pos_min + data->pos_delta;
3535                 }
3536                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3537                 if (UTF)
3538                     SvUTF8_on(data->last_found);
3539                 {
3540                     SV * const sv = data->last_found;
3541                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3542                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3543                     if (mg && mg->mg_len >= 0)
3544                         mg->mg_len += utf8_length((U8*)STRING(scan),
3545                                                   (U8*)STRING(scan)+STR_LEN(scan));
3546                 }
3547                 data->last_end = data->pos_min + l;
3548                 data->pos_min += l; /* As in the first entry. */
3549                 data->flags &= ~SF_BEFORE_EOL;
3550             }
3551             if (flags & SCF_DO_STCLASS_AND) {
3552                 /* Check whether it is compatible with what we know already! */
3553                 int compat = 1;
3554
3555
3556                 /* If compatible, we or it in below.  It is compatible if is
3557                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3558                  * it's for a locale.  Even if there isn't unicode semantics
3559                  * here, at runtime there may be because of matching against a
3560                  * utf8 string, so accept a possible false positive for
3561                  * latin1-range folds */
3562                 if (uc >= 0x100 ||
3563                     (!(data->start_class->flags & ANYOF_LOCALE)
3564                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3565                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3566                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3567                     )
3568                 {
3569                     compat = 0;
3570                 }
3571                 ANYOF_CLASS_ZERO(data->start_class);
3572                 ANYOF_BITMAP_ZERO(data->start_class);
3573                 if (compat)
3574                     ANYOF_BITMAP_SET(data->start_class, uc);
3575                 else if (uc >= 0x100) {
3576                     int i;
3577
3578                     /* Some Unicode code points fold to the Latin1 range; as
3579                      * XXX temporary code, instead of figuring out if this is
3580                      * one, just assume it is and set all the start class bits
3581                      * that could be some such above 255 code point's fold
3582                      * which will generate fals positives.  As the code
3583                      * elsewhere that does compute the fold settles down, it
3584                      * can be extracted out and re-used here */
3585                     for (i = 0; i < 256; i++){
3586                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3587                             ANYOF_BITMAP_SET(data->start_class, i);
3588                         }
3589                     }
3590                 }
3591                 CLEAR_SSC_EOS(data->start_class);
3592                 if (uc < 0x100)
3593                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3594             }
3595             else if (flags & SCF_DO_STCLASS_OR) {
3596                 /* false positive possible if the class is case-folded */
3597                 if (uc < 0x100)
3598                     ANYOF_BITMAP_SET(data->start_class, uc);
3599                 else
3600                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3601                 CLEAR_SSC_EOS(data->start_class);
3602                 cl_and(data->start_class, and_withp);
3603             }
3604             flags &= ~SCF_DO_STCLASS;
3605         }
3606         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3607             I32 l = STR_LEN(scan);
3608             UV uc = *((U8*)STRING(scan));
3609
3610             /* Search for fixed substrings supports EXACT only. */
3611             if (flags & SCF_DO_SUBSTR) {
3612                 assert(data);
3613                 SCAN_COMMIT(pRExC_state, data, minlenp);
3614             }
3615             if (UTF) {
3616                 const U8 * const s = (U8 *)STRING(scan);
3617                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3618                 l = utf8_length(s, s + l);
3619             }
3620             if (has_exactf_sharp_s) {
3621                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3622             }
3623             min += l - min_subtract;
3624             assert (min >= 0);
3625             delta += min_subtract;
3626             if (flags & SCF_DO_SUBSTR) {
3627                 data->pos_min += l - min_subtract;
3628                 if (data->pos_min < 0) {
3629                     data->pos_min = 0;
3630                 }
3631                 data->pos_delta += min_subtract;
3632                 if (min_subtract) {
3633                     data->longest = &(data->longest_float);
3634                 }
3635             }
3636             if (flags & SCF_DO_STCLASS_AND) {
3637                 /* Check whether it is compatible with what we know already! */
3638                 int compat = 1;
3639                 if (uc >= 0x100 ||
3640                  (!(data->start_class->flags & ANYOF_LOCALE)
3641                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3642                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3643                 {
3644                     compat = 0;
3645                 }
3646                 ANYOF_CLASS_ZERO(data->start_class);
3647                 ANYOF_BITMAP_ZERO(data->start_class);
3648                 if (compat) {
3649                     ANYOF_BITMAP_SET(data->start_class, uc);
3650                     CLEAR_SSC_EOS(data->start_class);
3651                     if (OP(scan) == EXACTFL) {
3652                         /* XXX This set is probably no longer necessary, and
3653                          * probably wrong as LOCALE now is on in the initial
3654                          * state */
3655                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3656                     }
3657                     else {
3658
3659                         /* Also set the other member of the fold pair.  In case
3660                          * that unicode semantics is called for at runtime, use
3661                          * the full latin1 fold.  (Can't do this for locale,
3662                          * because not known until runtime) */
3663                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3664
3665                         /* All other (EXACTFL handled above) folds except under
3666                          * /iaa that include s, S, and sharp_s also may include
3667                          * the others */
3668                         if (OP(scan) != EXACTFA) {
3669                             if (uc == 's' || uc == 'S') {
3670                                 ANYOF_BITMAP_SET(data->start_class,
3671                                                  LATIN_SMALL_LETTER_SHARP_S);
3672                             }
3673                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3674                                 ANYOF_BITMAP_SET(data->start_class, 's');
3675                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3676                             }
3677                         }
3678                     }
3679                 }
3680                 else if (uc >= 0x100) {
3681                     int i;
3682                     for (i = 0; i < 256; i++){
3683                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3684                             ANYOF_BITMAP_SET(data->start_class, i);
3685                         }
3686                     }
3687                 }
3688             }
3689             else if (flags & SCF_DO_STCLASS_OR) {
3690                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3691                     /* false positive possible if the class is case-folded.
3692                        Assume that the locale settings are the same... */
3693                     if (uc < 0x100) {
3694                         ANYOF_BITMAP_SET(data->start_class, uc);
3695                         if (OP(scan) != EXACTFL) {
3696
3697                             /* And set the other member of the fold pair, but
3698                              * can't do that in locale because not known until
3699                              * run-time */
3700                             ANYOF_BITMAP_SET(data->start_class,
3701                                              PL_fold_latin1[uc]);
3702
3703                             /* All folds except under /iaa that include s, S,
3704                              * and sharp_s also may include the others */
3705                             if (OP(scan) != EXACTFA) {
3706                                 if (uc == 's' || uc == 'S') {
3707                                     ANYOF_BITMAP_SET(data->start_class,
3708                                                    LATIN_SMALL_LETTER_SHARP_S);
3709                                 }
3710                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3711                                     ANYOF_BITMAP_SET(data->start_class, 's');
3712                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3713                                 }
3714                             }
3715                         }
3716                     }
3717                     CLEAR_SSC_EOS(data->start_class);
3718                 }
3719                 cl_and(data->start_class, and_withp);
3720             }
3721             flags &= ~SCF_DO_STCLASS;
3722         }
3723         else if (REGNODE_VARIES(OP(scan))) {
3724             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3725             I32 f = flags, pos_before = 0;
3726             regnode * const oscan = scan;
3727             struct regnode_charclass_class this_class;
3728             struct regnode_charclass_class *oclass = NULL;
3729             I32 next_is_eval = 0;
3730
3731             switch (PL_regkind[OP(scan)]) {
3732             case WHILEM:                /* End of (?:...)* . */
3733                 scan = NEXTOPER(scan);
3734                 goto finish;
3735             case PLUS:
3736                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3737                     next = NEXTOPER(scan);
3738                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3739                         mincount = 1;
3740                         maxcount = REG_INFTY;
3741                         next = regnext(scan);
3742                         scan = NEXTOPER(scan);
3743                         goto do_curly;
3744                     }
3745                 }
3746                 if (flags & SCF_DO_SUBSTR)
3747                     data->pos_min++;
3748                 min++;
3749                 /* Fall through. */
3750             case STAR:
3751                 if (flags & SCF_DO_STCLASS) {
3752                     mincount = 0;
3753                     maxcount = REG_INFTY;
3754                     next = regnext(scan);
3755                     scan = NEXTOPER(scan);
3756                     goto do_curly;
3757                 }
3758                 is_inf = is_inf_internal = 1;
3759                 scan = regnext(scan);
3760                 if (flags & SCF_DO_SUBSTR) {
3761                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3762                     data->longest = &(data->longest_float);
3763                 }
3764                 goto optimize_curly_tail;
3765             case CURLY:
3766                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3767                     && (scan->flags == stopparen))
3768                 {
3769                     mincount = 1;
3770                     maxcount = 1;
3771                 } else {
3772                     mincount = ARG1(scan);
3773                     maxcount = ARG2(scan);
3774                 }
3775                 next = regnext(scan);
3776                 if (OP(scan) == CURLYX) {
3777                     I32 lp = (data ? *(data->last_closep) : 0);
3778                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3779                 }
3780                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3781                 next_is_eval = (OP(scan) == EVAL);
3782               do_curly:
3783                 if (flags & SCF_DO_SUBSTR) {
3784                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3785                     pos_before = data->pos_min;
3786                 }
3787                 if (data) {
3788                     fl = data->flags;
3789                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3790                     if (is_inf)
3791                         data->flags |= SF_IS_INF;
3792                 }
3793                 if (flags & SCF_DO_STCLASS) {
3794                     cl_init(pRExC_state, &this_class);
3795                     oclass = data->start_class;
3796                     data->start_class = &this_class;
3797                     f |= SCF_DO_STCLASS_AND;
3798                     f &= ~SCF_DO_STCLASS_OR;
3799                 }
3800                 /* Exclude from super-linear cache processing any {n,m}
3801                    regops for which the combination of input pos and regex
3802                    pos is not enough information to determine if a match
3803                    will be possible.
3804
3805                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3806                    regex pos at the \s*, the prospects for a match depend not
3807                    only on the input position but also on how many (bar\s*)
3808                    repeats into the {4,8} we are. */
3809                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3810                     f &= ~SCF_WHILEM_VISITED_POS;
3811
3812                 /* This will finish on WHILEM, setting scan, or on NULL: */
3813                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3814                                       last, data, stopparen, recursed, NULL,
3815                                       (mincount == 0
3816                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3817
3818                 if (flags & SCF_DO_STCLASS)
3819                     data->start_class = oclass;
3820                 if (mincount == 0 || minnext == 0) {
3821                     if (flags & SCF_DO_STCLASS_OR) {
3822                         cl_or(pRExC_state, data->start_class, &this_class);
3823                     }
3824                     else if (flags & SCF_DO_STCLASS_AND) {
3825                         /* Switch to OR mode: cache the old value of
3826                          * data->start_class */
3827                         INIT_AND_WITHP;
3828                         StructCopy(data->start_class, and_withp,
3829                                    struct regnode_charclass_class);
3830                         flags &= ~SCF_DO_STCLASS_AND;
3831                         StructCopy(&this_class, data->start_class,
3832                                    struct regnode_charclass_class);
3833                         flags |= SCF_DO_STCLASS_OR;
3834                         SET_SSC_EOS(data->start_class);
3835                     }
3836                 } else {                /* Non-zero len */
3837                     if (flags & SCF_DO_STCLASS_OR) {
3838                         cl_or(pRExC_state, data->start_class, &this_class);
3839                         cl_and(data->start_class, and_withp);
3840                     }
3841                     else if (flags & SCF_DO_STCLASS_AND)
3842                         cl_and(data->start_class, &this_class);
3843                     flags &= ~SCF_DO_STCLASS;
3844                 }
3845                 if (!scan)              /* It was not CURLYX, but CURLY. */
3846                     scan = next;
3847                 if ( /* ? quantifier ok, except for (?{ ... }) */
3848                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3849                     && (minnext == 0) && (deltanext == 0)
3850                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3851                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3852                 {
3853                     /* Fatal warnings may leak the regexp without this: */
3854                     SAVEFREESV(RExC_rx_sv);
3855                     ckWARNreg(RExC_parse,
3856                               "Quantifier unexpected on zero-length expression");
3857                     (void)ReREFCNT_inc(RExC_rx_sv);
3858                 }
3859
3860                 min += minnext * mincount;
3861                 is_inf_internal |= ((maxcount == REG_INFTY
3862                                      && (minnext + deltanext) > 0)
3863                                     || deltanext == I32_MAX);
3864                 is_inf |= is_inf_internal;
3865                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3866
3867                 /* Try powerful optimization CURLYX => CURLYN. */
3868                 if (  OP(oscan) == CURLYX && data
3869                       && data->flags & SF_IN_PAR
3870                       && !(data->flags & SF_HAS_EVAL)
3871                       && !deltanext && minnext == 1 ) {
3872                     /* Try to optimize to CURLYN.  */
3873                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3874                     regnode * const nxt1 = nxt;
3875 #ifdef DEBUGGING
3876                     regnode *nxt2;
3877 #endif
3878
3879                     /* Skip open. */
3880                     nxt = regnext(nxt);
3881                     if (!REGNODE_SIMPLE(OP(nxt))
3882                         && !(PL_regkind[OP(nxt)] == EXACT
3883                              && STR_LEN(nxt) == 1))
3884                         goto nogo;
3885 #ifdef DEBUGGING
3886                     nxt2 = nxt;
3887 #endif
3888                     nxt = regnext(nxt);
3889                     if (OP(nxt) != CLOSE)
3890                         goto nogo;
3891                     if (RExC_open_parens) {
3892                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3893                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3894                     }
3895                     /* Now we know that nxt2 is the only contents: */
3896                     oscan->flags = (U8)ARG(nxt);
3897                     OP(oscan) = CURLYN;
3898                     OP(nxt1) = NOTHING; /* was OPEN. */
3899
3900 #ifdef DEBUGGING
3901                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3902                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3903                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3904                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3905                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3906                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3907 #endif
3908                 }
3909               nogo:
3910
3911                 /* Try optimization CURLYX => CURLYM. */
3912                 if (  OP(oscan) == CURLYX && data
3913                       && !(data->flags & SF_HAS_PAR)
3914                       && !(data->flags & SF_HAS_EVAL)
3915                       && !deltanext     /* atom is fixed width */
3916                       && minnext != 0   /* CURLYM can't handle zero width */
3917                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3918                 ) {
3919                     /* XXXX How to optimize if data == 0? */
3920                     /* Optimize to a simpler form.  */
3921                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3922                     regnode *nxt2;
3923
3924                     OP(oscan) = CURLYM;
3925                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3926                             && (OP(nxt2) != WHILEM))
3927                         nxt = nxt2;
3928                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3929                     /* Need to optimize away parenths. */
3930                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3931                         /* Set the parenth number.  */
3932                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3933
3934                         oscan->flags = (U8)ARG(nxt);
3935                         if (RExC_open_parens) {
3936                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3937                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3938                         }
3939                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3940                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3941
3942 #ifdef DEBUGGING
3943                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3944                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3945                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3946                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3947 #endif
3948 #if 0
3949                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3950                             regnode *nnxt = regnext(nxt1);
3951                             if (nnxt == nxt) {
3952                                 if (reg_off_by_arg[OP(nxt1)])
3953                                     ARG_SET(nxt1, nxt2 - nxt1);
3954                                 else if (nxt2 - nxt1 < U16_MAX)
3955                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3956                                 else
3957                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3958                             }
3959                             nxt1 = nnxt;
3960                         }
3961 #endif
3962                         /* Optimize again: */
3963                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3964                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3965                     }
3966                     else
3967                         oscan->flags = 0;
3968                 }
3969                 else if ((OP(oscan) == CURLYX)
3970                          && (flags & SCF_WHILEM_VISITED_POS)
3971                          /* See the comment on a similar expression above.
3972                             However, this time it's not a subexpression
3973                             we care about, but the expression itself. */
3974                          && (maxcount == REG_INFTY)
3975                          && data && ++data->whilem_c < 16) {
3976                     /* This stays as CURLYX, we can put the count/of pair. */
3977                     /* Find WHILEM (as in regexec.c) */
3978                     regnode *nxt = oscan + NEXT_OFF(oscan);
3979
3980                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3981                         nxt += ARG(nxt);
3982                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3983                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3984                 }
3985                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3986                     pars++;
3987                 if (flags & SCF_DO_SUBSTR) {
3988                     SV *last_str = NULL;
3989                     int counted = mincount != 0;
3990
3991                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3992 #if defined(SPARC64_GCC_WORKAROUND)
3993                         I32 b = 0;
3994                         STRLEN l = 0;
3995                         const char *s = NULL;
3996                         I32 old = 0;
3997
3998                         if (pos_before >= data->last_start_min)
3999                             b = pos_before;
4000                         else
4001                             b = data->last_start_min;
4002
4003                         l = 0;
4004                         s = SvPV_const(data->last_found, l);
4005                         old = b - data->last_start_min;
4006
4007 #else
4008                         I32 b = pos_before >= data->last_start_min
4009                             ? pos_before : data->last_start_min;
4010                         STRLEN l;
4011                         const char * const s = SvPV_const(data->last_found, l);
4012                         I32 old = b - data->last_start_min;
4013 #endif
4014
4015                         if (UTF)
4016                             old = utf8_hop((U8*)s, old) - (U8*)s;
4017                         l -= old;
4018                         /* Get the added string: */
4019                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4020                         if (deltanext == 0 && pos_before == b) {
4021                             /* What was added is a constant string */
4022                             if (mincount > 1) {
4023                                 SvGROW(last_str, (mincount * l) + 1);
4024                                 repeatcpy(SvPVX(last_str) + l,
4025                                           SvPVX_const(last_str), l, mincount - 1);
4026                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4027                                 /* Add additional parts. */
4028                                 SvCUR_set(data->last_found,
4029                                           SvCUR(data->last_found) - l);
4030                                 sv_catsv(data->last_found, last_str);
4031                                 {
4032                                     SV * sv = data->last_found;
4033                                     MAGIC *mg =
4034                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4035                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4036                                     if (mg && mg->mg_len >= 0)
4037                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4038                                 }
4039                                 data->last_end += l * (mincount - 1);
4040                             }
4041                         } else {
4042                             /* start offset must point into the last copy */
4043                             data->last_start_min += minnext * (mincount - 1);
4044                             data->last_start_max += is_inf ? I32_MAX
4045                                 : (maxcount - 1) * (minnext + data->pos_delta);
4046                         }
4047                     }
4048                     /* It is counted once already... */
4049                     data->pos_min += minnext * (mincount - counted);
4050                     data->pos_delta += - counted * deltanext +
4051                         (minnext + deltanext) * maxcount - minnext * mincount;
4052                     if (mincount != maxcount) {
4053                          /* Cannot extend fixed substrings found inside
4054                             the group.  */
4055                         SCAN_COMMIT(pRExC_state,data,minlenp);
4056                         if (mincount && last_str) {
4057                             SV * const sv = data->last_found;
4058                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4059                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4060
4061                             if (mg)
4062                                 mg->mg_len = -1;
4063                             sv_setsv(sv, last_str);
4064                             data->last_end = data->pos_min;
4065                             data->last_start_min =
4066                                 data->pos_min - CHR_SVLEN(last_str);
4067                             data->last_start_max = is_inf
4068                                 ? I32_MAX
4069                                 : data->pos_min + data->pos_delta
4070                                 - CHR_SVLEN(last_str);
4071                         }
4072                         data->longest = &(data->longest_float);
4073                     }
4074                     SvREFCNT_dec(last_str);
4075                 }
4076                 if (data && (fl & SF_HAS_EVAL))
4077                     data->flags |= SF_HAS_EVAL;
4078               optimize_curly_tail:
4079                 if (OP(oscan) != CURLYX) {
4080                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4081                            && NEXT_OFF(next))
4082                         NEXT_OFF(oscan) += NEXT_OFF(next);
4083                 }
4084                 continue;
4085             default:                    /* REF, ANYOFV, and CLUMP only? */
4086                 if (flags & SCF_DO_SUBSTR) {
4087                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4088                     data->longest = &(data->longest_float);
4089                 }
4090                 is_inf = is_inf_internal = 1;
4091                 if (flags & SCF_DO_STCLASS_OR)
4092                     cl_anything(pRExC_state, data->start_class);
4093                 flags &= ~SCF_DO_STCLASS;
4094                 break;
4095             }
4096         }
4097         else if (OP(scan) == LNBREAK) {
4098             if (flags & SCF_DO_STCLASS) {
4099                 int value = 0;
4100                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4101                 if (flags & SCF_DO_STCLASS_AND) {
4102                     for (value = 0; value < 256; value++)
4103                         if (!is_VERTWS_cp(value))
4104                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4105                 }
4106                 else {
4107                     for (value = 0; value < 256; value++)
4108                         if (is_VERTWS_cp(value))
4109                             ANYOF_BITMAP_SET(data->start_class, value);
4110                 }
4111                 if (flags & SCF_DO_STCLASS_OR)
4112                     cl_and(data->start_class, and_withp);
4113                 flags &= ~SCF_DO_STCLASS;
4114             }
4115             min++;
4116             delta++;    /* Because of the 2 char string cr-lf */
4117             if (flags & SCF_DO_SUBSTR) {
4118                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4119                 data->pos_min += 1;
4120                 data->pos_delta += 1;
4121                 data->longest = &(data->longest_float);
4122             }
4123         }
4124         else if (REGNODE_SIMPLE(OP(scan))) {
4125             int value = 0;
4126
4127             if (flags & SCF_DO_SUBSTR) {
4128                 SCAN_COMMIT(pRExC_state,data,minlenp);
4129                 data->pos_min++;
4130             }
4131             min++;
4132             if (flags & SCF_DO_STCLASS) {
4133                 int loop_max = 256;
4134                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4135
4136                 /* Some of the logic below assumes that switching
4137                    locale on will only add false positives. */
4138                 switch (PL_regkind[OP(scan)]) {
4139                     U8 classnum;
4140
4141                 case SANY:
4142                 default:
4143 #ifdef DEBUGGING
4144                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4145 #endif
4146                  do_default:
4147                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4148                         cl_anything(pRExC_state, data->start_class);
4149                     break;
4150                 case REG_ANY:
4151                     if (OP(scan) == SANY)
4152                         goto do_default;
4153                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4154                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4155                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4156                         cl_anything(pRExC_state, data->start_class);
4157                     }
4158                     if (flags & SCF_DO_STCLASS_AND || !value)
4159                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4160                     break;
4161                 case ANYOF:
4162                     if (flags & SCF_DO_STCLASS_AND)
4163                         cl_and(data->start_class,
4164                                (struct regnode_charclass_class*)scan);
4165                     else
4166                         cl_or(pRExC_state, data->start_class,
4167                               (struct regnode_charclass_class*)scan);
4168                     break;
4169                 case POSIXA:
4170                     loop_max = 128;
4171                     /* FALL THROUGH */
4172                 case POSIXL:
4173                 case POSIXD:
4174                 case POSIXU:
4175                     classnum = FLAGS(scan);
4176                     if (flags & SCF_DO_STCLASS_AND) {
4177                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4178                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4179                             for (value = 0; value < loop_max; value++) {
4180                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4181                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4182                                 }
4183                             }
4184                         }
4185                     }
4186                     else {
4187                         if (data->start_class->flags & ANYOF_LOCALE) {
4188                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4189                         }
4190                         else {
4191
4192                         /* Even if under locale, set the bits for non-locale
4193                          * in case it isn't a true locale-node.  This will
4194                          * create false positives if it truly is locale */
4195                         for (value = 0; value < loop_max; value++) {
4196                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4197                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4198                             }
4199                         }
4200                         }
4201                     }
4202                     break;
4203                 case NPOSIXA:
4204                     loop_max = 128;
4205                     /* FALL THROUGH */
4206                 case NPOSIXL:
4207                 case NPOSIXU:
4208                 case NPOSIXD:
4209                     classnum = FLAGS(scan);
4210                     if (flags & SCF_DO_STCLASS_AND) {
4211                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4212                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4213                             for (value = 0; value < loop_max; value++) {
4214                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4215                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4216                                 }
4217                             }
4218                         }
4219                     }
4220                     else {
4221                         if (data->start_class->flags & ANYOF_LOCALE) {
4222                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4223                         }
4224                         else {
4225
4226                         /* Even if under locale, set the bits for non-locale in
4227                          * case it isn't a true locale-node.  This will create
4228                          * false positives if it truly is locale */
4229                         for (value = 0; value < loop_max; value++) {
4230                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4231                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4232                             }
4233                         }
4234                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4235                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4236                         }
4237                         }
4238                     }
4239                     break;
4240                 }
4241                 if (flags & SCF_DO_STCLASS_OR)
4242                     cl_and(data->start_class, and_withp);
4243                 flags &= ~SCF_DO_STCLASS;
4244             }
4245         }
4246         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4247             data->flags |= (OP(scan) == MEOL
4248                             ? SF_BEFORE_MEOL
4249                             : SF_BEFORE_SEOL);
4250             SCAN_COMMIT(pRExC_state, data, minlenp);
4251
4252         }
4253         else if (  PL_regkind[OP(scan)] == BRANCHJ
4254                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4255                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4256                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4257             if ( OP(scan) == UNLESSM &&
4258                  scan->flags == 0 &&
4259                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4260                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4261             ) {
4262                 regnode *opt;
4263                 regnode *upto= regnext(scan);
4264                 DEBUG_PARSE_r({
4265                     SV * const mysv_val=sv_newmortal();
4266                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4267
4268                     /*DEBUG_PARSE_MSG("opfail");*/
4269                     regprop(RExC_rx, mysv_val, upto);
4270                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4271                                   SvPV_nolen_const(mysv_val),
4272                                   (IV)REG_NODE_NUM(upto),
4273                                   (IV)(upto - scan)
4274                     );
4275                 });
4276                 OP(scan) = OPFAIL;
4277                 NEXT_OFF(scan) = upto - scan;
4278                 for (opt= scan + 1; opt < upto ; opt++)
4279                     OP(opt) = OPTIMIZED;
4280                 scan= upto;
4281                 continue;
4282             }
4283             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4284                 || OP(scan) == UNLESSM )
4285             {
4286                 /* Negative Lookahead/lookbehind
4287                    In this case we can't do fixed string optimisation.
4288                 */
4289
4290                 I32 deltanext, minnext, fake = 0;
4291                 regnode *nscan;
4292                 struct regnode_charclass_class intrnl;
4293                 int f = 0;
4294
4295                 data_fake.flags = 0;
4296                 if (data) {
4297                     data_fake.whilem_c = data->whilem_c;
4298                     data_fake.last_closep = data->last_closep;
4299                 }
4300                 else
4301                     data_fake.last_closep = &fake;
4302                 data_fake.pos_delta = delta;
4303                 if ( flags & SCF_DO_STCLASS && !scan->flags
4304                      && OP(scan) == IFMATCH ) { /* Lookahead */
4305                     cl_init(pRExC_state, &intrnl);
4306                     data_fake.start_class = &intrnl;
4307                     f |= SCF_DO_STCLASS_AND;
4308                 }
4309                 if (flags & SCF_WHILEM_VISITED_POS)
4310                     f |= SCF_WHILEM_VISITED_POS;
4311                 next = regnext(scan);
4312                 nscan = NEXTOPER(NEXTOPER(scan));
4313                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4314                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4315                 if (scan->flags) {
4316                     if (deltanext) {
4317                         FAIL("Variable length lookbehind not implemented");
4318                     }
4319                     else if (minnext > (I32)U8_MAX) {
4320                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4321                     }
4322                     scan->flags = (U8)minnext;
4323                 }
4324                 if (data) {
4325                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4326                         pars++;
4327                     if (data_fake.flags & SF_HAS_EVAL)
4328                         data->flags |= SF_HAS_EVAL;
4329                     data->whilem_c = data_fake.whilem_c;
4330                 }
4331                 if (f & SCF_DO_STCLASS_AND) {
4332                     if (flags & SCF_DO_STCLASS_OR) {
4333                         /* OR before, AND after: ideally we would recurse with
4334                          * data_fake to get the AND applied by study of the
4335                          * remainder of the pattern, and then derecurse;
4336                          * *** HACK *** for now just treat as "no information".
4337                          * See [perl #56690].
4338                          */
4339                         cl_init(pRExC_state, data->start_class);
4340                     }  else {
4341                         /* AND before and after: combine and continue */
4342                         const int was = TEST_SSC_EOS(data->start_class);
4343
4344                         cl_and(data->start_class, &intrnl);
4345                         if (was)
4346                             SET_SSC_EOS(data->start_class);
4347                     }
4348                 }
4349             }
4350 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4351             else {
4352                 /* Positive Lookahead/lookbehind
4353                    In this case we can do fixed string optimisation,
4354                    but we must be careful about it. Note in the case of
4355                    lookbehind the positions will be offset by the minimum
4356                    length of the pattern, something we won't know about
4357                    until after the recurse.
4358                 */
4359                 I32 deltanext, fake = 0;
4360                 regnode *nscan;
4361                 struct regnode_charclass_class intrnl;
4362                 int f = 0;
4363                 /* We use SAVEFREEPV so that when the full compile
4364                     is finished perl will clean up the allocated
4365                     minlens when it's all done. This way we don't
4366                     have to worry about freeing them when we know
4367                     they wont be used, which would be a pain.
4368                  */
4369                 I32 *minnextp;
4370                 Newx( minnextp, 1, I32 );
4371                 SAVEFREEPV(minnextp);
4372
4373                 if (data) {
4374                     StructCopy(data, &data_fake, scan_data_t);
4375                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4376                         f |= SCF_DO_SUBSTR;
4377                         if (scan->flags)
4378                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4379                         data_fake.last_found=newSVsv(data->last_found);
4380                     }
4381                 }
4382                 else
4383                     data_fake.last_closep = &fake;
4384                 data_fake.flags = 0;
4385                 data_fake.pos_delta = delta;
4386                 if (is_inf)
4387                     data_fake.flags |= SF_IS_INF;
4388                 if ( flags & SCF_DO_STCLASS && !scan->flags
4389                      && OP(scan) == IFMATCH ) { /* Lookahead */
4390                     cl_init(pRExC_state, &intrnl);
4391                     data_fake.start_class = &intrnl;
4392                     f |= SCF_DO_STCLASS_AND;
4393                 }
4394                 if (flags & SCF_WHILEM_VISITED_POS)
4395                     f |= SCF_WHILEM_VISITED_POS;
4396                 next = regnext(scan);
4397                 nscan = NEXTOPER(NEXTOPER(scan));
4398
4399                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4400                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4401                 if (scan->flags) {
4402                     if (deltanext) {
4403                         FAIL("Variable length lookbehind not implemented");
4404                     }
4405                     else if (*minnextp > (I32)U8_MAX) {
4406                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4407                     }
4408                     scan->flags = (U8)*minnextp;
4409                 }
4410
4411                 *minnextp += min;
4412
4413                 if (f & SCF_DO_STCLASS_AND) {
4414                     const int was = TEST_SSC_EOS(data.start_class);
4415
4416                     cl_and(data->start_class, &intrnl);
4417                     if (was)
4418                         SET_SSC_EOS(data->start_class);
4419                 }
4420                 if (data) {
4421                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4422                         pars++;
4423                     if (data_fake.flags & SF_HAS_EVAL)
4424                         data->flags |= SF_HAS_EVAL;
4425                     data->whilem_c = data_fake.whilem_c;
4426                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4427                         if (RExC_rx->minlen<*minnextp)
4428                             RExC_rx->minlen=*minnextp;
4429                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4430                         SvREFCNT_dec_NN(data_fake.last_found);
4431
4432                         if ( data_fake.minlen_fixed != minlenp )
4433                         {
4434                             data->offset_fixed= data_fake.offset_fixed;
4435                             data->minlen_fixed= data_fake.minlen_fixed;
4436                             data->lookbehind_fixed+= scan->flags;
4437                         }
4438                         if ( data_fake.minlen_float != minlenp )
4439                         {
4440                             data->minlen_float= data_fake.minlen_float;
4441                             data->offset_float_min=data_fake.offset_float_min;
4442                             data->offset_float_max=data_fake.offset_float_max;
4443                             data->lookbehind_float+= scan->flags;
4444                         }
4445                     }
4446                 }
4447             }
4448 #endif
4449         }
4450         else if (OP(scan) == OPEN) {
4451             if (stopparen != (I32)ARG(scan))
4452                 pars++;
4453         }
4454         else if (OP(scan) == CLOSE) {
4455             if (stopparen == (I32)ARG(scan)) {
4456                 break;
4457             }
4458             if ((I32)ARG(scan) == is_par) {
4459                 next = regnext(scan);
4460
4461                 if ( next && (OP(next) != WHILEM) && next < last)
4462                     is_par = 0;         /* Disable optimization */
4463             }
4464             if (data)
4465                 *(data->last_closep) = ARG(scan);
4466         }
4467         else if (OP(scan) == EVAL) {
4468                 if (data)
4469                     data->flags |= SF_HAS_EVAL;
4470         }
4471         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4472             if (flags & SCF_DO_SUBSTR) {
4473                 SCAN_COMMIT(pRExC_state,data,minlenp);
4474                 flags &= ~SCF_DO_SUBSTR;
4475             }
4476             if (data && OP(scan)==ACCEPT) {
4477                 data->flags |= SCF_SEEN_ACCEPT;
4478                 if (stopmin > min)
4479                     stopmin = min;
4480             }
4481         }
4482         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4483         {
4484                 if (flags & SCF_DO_SUBSTR) {
4485                     SCAN_COMMIT(pRExC_state,data,minlenp);
4486                     data->longest = &(data->longest_float);
4487                 }
4488                 is_inf = is_inf_internal = 1;
4489                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4490                     cl_anything(pRExC_state, data->start_class);
4491                 flags &= ~SCF_DO_STCLASS;
4492         }
4493         else if (OP(scan) == GPOS) {
4494             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4495                 !(delta || is_inf || (data && data->pos_delta)))
4496             {
4497                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4498                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4499                 if (RExC_rx->gofs < (U32)min)
4500                     RExC_rx->gofs = min;
4501             } else {
4502                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4503                 RExC_rx->gofs = 0;
4504             }
4505         }
4506 #ifdef TRIE_STUDY_OPT
4507 #ifdef FULL_TRIE_STUDY
4508         else if (PL_regkind[OP(scan)] == TRIE) {
4509             /* NOTE - There is similar code to this block above for handling
4510                BRANCH nodes on the initial study.  If you change stuff here
4511                check there too. */
4512             regnode *trie_node= scan;
4513             regnode *tail= regnext(scan);
4514             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4515             I32 max1 = 0, min1 = I32_MAX;
4516             struct regnode_charclass_class accum;
4517
4518             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4519                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4520             if (flags & SCF_DO_STCLASS)
4521                 cl_init_zero(pRExC_state, &accum);
4522
4523             if (!trie->jump) {
4524                 min1= trie->minlen;
4525                 max1= trie->maxlen;
4526             } else {
4527                 const regnode *nextbranch= NULL;
4528                 U32 word;
4529
4530                 for ( word=1 ; word <= trie->wordcount ; word++)
4531                 {
4532                     I32 deltanext=0, minnext=0, f = 0, fake;
4533                     struct regnode_charclass_class this_class;
4534
4535                     data_fake.flags = 0;
4536                     if (data) {
4537                         data_fake.whilem_c = data->whilem_c;
4538                         data_fake.last_closep = data->last_closep;
4539                     }
4540                     else
4541                         data_fake.last_closep = &fake;
4542                     data_fake.pos_delta = delta;
4543                     if (flags & SCF_DO_STCLASS) {
4544                         cl_init(pRExC_state, &this_class);
4545                         data_fake.start_class = &this_class;
4546                         f = SCF_DO_STCLASS_AND;
4547                     }
4548                     if (flags & SCF_WHILEM_VISITED_POS)
4549                         f |= SCF_WHILEM_VISITED_POS;
4550
4551                     if (trie->jump[word]) {
4552                         if (!nextbranch)
4553                             nextbranch = trie_node + trie->jump[0];
4554                         scan= trie_node + trie->jump[word];
4555                         /* We go from the jump point to the branch that follows
4556                            it. Note this means we need the vestigal unused branches
4557                            even though they arent otherwise used.
4558                          */
4559                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4560                             &deltanext, (regnode *)nextbranch, &data_fake,
4561                             stopparen, recursed, NULL, f,depth+1);
4562                     }
4563                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4564                         nextbranch= regnext((regnode*)nextbranch);
4565
4566                     if (min1 > (I32)(minnext + trie->minlen))
4567                         min1 = minnext + trie->minlen;
4568                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4569                         max1 = minnext + deltanext + trie->maxlen;
4570                     if (deltanext == I32_MAX)
4571                         is_inf = is_inf_internal = 1;
4572
4573                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4574                         pars++;
4575                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4576                         if ( stopmin > min + min1)
4577                             stopmin = min + min1;
4578                         flags &= ~SCF_DO_SUBSTR;
4579                         if (data)
4580                             data->flags |= SCF_SEEN_ACCEPT;
4581                     }
4582                     if (data) {
4583                         if (data_fake.flags & SF_HAS_EVAL)
4584                             data->flags |= SF_HAS_EVAL;
4585                         data->whilem_c = data_fake.whilem_c;
4586                     }
4587                     if (flags & SCF_DO_STCLASS)
4588                         cl_or(pRExC_state, &accum, &this_class);
4589                 }
4590             }
4591             if (flags & SCF_DO_SUBSTR) {
4592                 data->pos_min += min1;
4593                 data->pos_delta += max1 - min1;
4594                 if (max1 != min1 || is_inf)
4595                     data->longest = &(data->longest_float);
4596             }
4597             min += min1;
4598             delta += max1 - min1;
4599             if (flags & SCF_DO_STCLASS_OR) {
4600                 cl_or(pRExC_state, data->start_class, &accum);
4601                 if (min1) {
4602                     cl_and(data->start_class, and_withp);
4603                     flags &= ~SCF_DO_STCLASS;
4604                 }
4605             }
4606             else if (flags & SCF_DO_STCLASS_AND) {
4607                 if (min1) {
4608                     cl_and(data->start_class, &accum);
4609                     flags &= ~SCF_DO_STCLASS;
4610                 }
4611                 else {
4612                     /* Switch to OR mode: cache the old value of
4613                      * data->start_class */
4614                     INIT_AND_WITHP;
4615                     StructCopy(data->start_class, and_withp,
4616                                struct regnode_charclass_class);
4617                     flags &= ~SCF_DO_STCLASS_AND;
4618                     StructCopy(&accum, data->start_class,
4619                                struct regnode_charclass_class);
4620                     flags |= SCF_DO_STCLASS_OR;
4621                     SET_SSC_EOS(data->start_class);
4622                 }
4623             }
4624             scan= tail;
4625             continue;
4626         }
4627 #else
4628         else if (PL_regkind[OP(scan)] == TRIE) {
4629             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4630             U8*bang=NULL;
4631
4632             min += trie->minlen;
4633             delta += (trie->maxlen - trie->minlen);
4634             flags &= ~SCF_DO_STCLASS; /* xxx */
4635             if (flags & SCF_DO_SUBSTR) {
4636                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4637                 data->pos_min += trie->minlen;
4638                 data->pos_delta += (trie->maxlen - trie->minlen);
4639                 if (trie->maxlen != trie->minlen)
4640                     data->longest = &(data->longest_float);
4641             }
4642             if (trie->jump) /* no more substrings -- for now /grr*/
4643                 flags &= ~SCF_DO_SUBSTR;
4644         }
4645 #endif /* old or new */
4646 #endif /* TRIE_STUDY_OPT */
4647
4648         /* Else: zero-length, ignore. */
4649         scan = regnext(scan);
4650     }
4651     if (frame) {
4652         last = frame->last;
4653         scan = frame->next;
4654         stopparen = frame->stop;
4655         frame = frame->prev;
4656         goto fake_study_recurse;
4657     }
4658
4659   finish:
4660     assert(!frame);
4661     DEBUG_STUDYDATA("pre-fin:",data,depth);
4662
4663     *scanp = scan;
4664     *deltap = is_inf_internal ? I32_MAX : delta;
4665     if (flags & SCF_DO_SUBSTR && is_inf)
4666         data->pos_delta = I32_MAX - data->pos_min;
4667     if (is_par > (I32)U8_MAX)
4668         is_par = 0;
4669     if (is_par && pars==1 && data) {
4670         data->flags |= SF_IN_PAR;
4671         data->flags &= ~SF_HAS_PAR;
4672     }
4673     else if (pars && data) {
4674         data->flags |= SF_HAS_PAR;
4675         data->flags &= ~SF_IN_PAR;
4676     }
4677     if (flags & SCF_DO_STCLASS_OR)
4678         cl_and(data->start_class, and_withp);
4679     if (flags & SCF_TRIE_RESTUDY)
4680         data->flags |=  SCF_TRIE_RESTUDY;
4681
4682     DEBUG_STUDYDATA("post-fin:",data,depth);
4683
4684     return min < stopmin ? min : stopmin;
4685 }
4686
4687 STATIC U32
4688 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4689 {
4690     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4691
4692     PERL_ARGS_ASSERT_ADD_DATA;
4693
4694     Renewc(RExC_rxi->data,
4695            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4696            char, struct reg_data);
4697     if(count)
4698         Renew(RExC_rxi->data->what, count + n, U8);
4699     else
4700         Newx(RExC_rxi->data->what, n, U8);
4701     RExC_rxi->data->count = count + n;
4702     Copy(s, RExC_rxi->data->what + count, n, U8);
4703     return count;
4704 }
4705
4706 /*XXX: todo make this not included in a non debugging perl */
4707 #ifndef PERL_IN_XSUB_RE
4708 void
4709 Perl_reginitcolors(pTHX)
4710 {
4711     dVAR;
4712     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4713     if (s) {
4714         char *t = savepv(s);
4715         int i = 0;
4716         PL_colors[0] = t;
4717         while (++i < 6) {
4718             t = strchr(t, '\t');
4719             if (t) {
4720                 *t = '\0';
4721                 PL_colors[i] = ++t;
4722             }
4723             else
4724                 PL_colors[i] = t = (char *)"";
4725         }
4726     } else {
4727         int i = 0;
4728         while (i < 6)
4729             PL_colors[i++] = (char *)"";
4730     }
4731     PL_colorset = 1;
4732 }
4733 #endif
4734
4735
4736 #ifdef TRIE_STUDY_OPT
4737 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4738     STMT_START {                                            \
4739         if (                                                \
4740               (data.flags & SCF_TRIE_RESTUDY)               \
4741               && ! restudied++                              \
4742         ) {                                                 \
4743             dOsomething;                                    \
4744             goto reStudy;                                   \
4745         }                                                   \
4746     } STMT_END
4747 #else
4748 #define CHECK_RESTUDY_GOTO_butfirst
4749 #endif
4750
4751 /*
4752  * pregcomp - compile a regular expression into internal code
4753  *
4754  * Decides which engine's compiler to call based on the hint currently in
4755  * scope
4756  */
4757
4758 #ifndef PERL_IN_XSUB_RE
4759
4760 /* return the currently in-scope regex engine (or the default if none)  */
4761
4762 regexp_engine const *
4763 Perl_current_re_engine(pTHX)
4764 {
4765     dVAR;
4766
4767     if (IN_PERL_COMPILETIME) {
4768         HV * const table = GvHV(PL_hintgv);
4769         SV **ptr;
4770
4771         if (!table)
4772             return &PL_core_reg_engine;
4773         ptr = hv_fetchs(table, "regcomp", FALSE);
4774         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4775             return &PL_core_reg_engine;
4776         return INT2PTR(regexp_engine*,SvIV(*ptr));
4777     }
4778     else {
4779         SV *ptr;
4780         if (!PL_curcop->cop_hints_hash)
4781             return &PL_core_reg_engine;
4782         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4783         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4784             return &PL_core_reg_engine;
4785         return INT2PTR(regexp_engine*,SvIV(ptr));
4786     }
4787 }
4788
4789
4790 REGEXP *
4791 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4792 {
4793     dVAR;
4794     regexp_engine const *eng = current_re_engine();
4795     GET_RE_DEBUG_FLAGS_DECL;
4796
4797     PERL_ARGS_ASSERT_PREGCOMP;
4798
4799     /* Dispatch a request to compile a regexp to correct regexp engine. */
4800     DEBUG_COMPILE_r({
4801         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4802                         PTR2UV(eng));
4803     });
4804     return CALLREGCOMP_ENG(eng, pattern, flags);
4805 }
4806 #endif
4807
4808 /* public(ish) entry point for the perl core's own regex compiling code.
4809  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4810  * pattern rather than a list of OPs, and uses the internal engine rather
4811  * than the current one */
4812
4813 REGEXP *
4814 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4815 {
4816     SV *pat = pattern; /* defeat constness! */
4817     PERL_ARGS_ASSERT_RE_COMPILE;
4818     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4819 #ifdef PERL_IN_XSUB_RE
4820                                 &my_reg_engine,
4821 #else
4822                                 &PL_core_reg_engine,
4823 #endif
4824                                 NULL, NULL, rx_flags, 0);
4825 }
4826
4827 /* see if there are any run-time code blocks in the pattern.
4828  * False positives are allowed */
4829
4830 static bool
4831 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4832                     U32 pm_flags, char *pat, STRLEN plen)
4833 {
4834     int n = 0;
4835     STRLEN s;
4836
4837     /* avoid infinitely recursing when we recompile the pattern parcelled up
4838      * as qr'...'. A single constant qr// string can't have have any
4839      * run-time component in it, and thus, no runtime code. (A non-qr
4840      * string, however, can, e.g. $x =~ '(?{})') */
4841     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4842         return 0;
4843
4844     for (s = 0; s < plen; s++) {
4845         if (n < pRExC_state->num_code_blocks
4846             && s == pRExC_state->code_blocks[n].start)
4847         {
4848             s = pRExC_state->code_blocks[n].end;
4849             n++;
4850             continue;
4851         }
4852         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4853          * positives here */
4854         if (pat[s] == '(' && pat[s+1] == '?' &&
4855             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4856         )
4857             return 1;
4858     }
4859     return 0;
4860 }
4861
4862 /* Handle run-time code blocks. We will already have compiled any direct
4863  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4864  * copy of it, but with any literal code blocks blanked out and
4865  * appropriate chars escaped; then feed it into
4866  *
4867  *    eval "qr'modified_pattern'"
4868  *
4869  * For example,
4870  *
4871  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4872  *
4873  * becomes
4874  *
4875  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4876  *
4877  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4878  * and merge them with any code blocks of the original regexp.
4879  *
4880  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4881  * instead, just save the qr and return FALSE; this tells our caller that
4882  * the original pattern needs upgrading to utf8.
4883  */
4884
4885 static bool
4886 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4887     char *pat, STRLEN plen)
4888 {
4889     SV *qr;
4890
4891     GET_RE_DEBUG_FLAGS_DECL;
4892
4893     if (pRExC_state->runtime_code_qr) {
4894         /* this is the second time we've been called; this should
4895          * only happen if the main pattern got upgraded to utf8
4896          * during compilation; re-use the qr we compiled first time
4897          * round (which should be utf8 too)
4898          */
4899         qr = pRExC_state->runtime_code_qr;
4900         pRExC_state->runtime_code_qr = NULL;
4901         assert(RExC_utf8 && SvUTF8(qr));
4902     }
4903     else {
4904         int n = 0;
4905         STRLEN s;
4906         char *p, *newpat;
4907         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4908         SV *sv, *qr_ref;
4909         dSP;
4910
4911         /* determine how many extra chars we need for ' and \ escaping */
4912         for (s = 0; s < plen; s++) {
4913             if (pat[s] == '\'' || pat[s] == '\\')
4914                 newlen++;
4915         }
4916
4917         Newx(newpat, newlen, char);
4918         p = newpat;
4919         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4920
4921         for (s = 0; s < plen; s++) {
4922             if (n < pRExC_state->num_code_blocks
4923                 && s == pRExC_state->code_blocks[n].start)
4924             {
4925                 /* blank out literal code block */
4926                 assert(pat[s] == '(');
4927                 while (s <= pRExC_state->code_blocks[n].end) {
4928                     *p++ = '_';
4929                     s++;
4930                 }
4931                 s--;
4932                 n++;
4933                 continue;
4934             }
4935             if (pat[s] == '\'' || pat[s] == '\\')
4936                 *p++ = '\\';
4937             *p++ = pat[s];
4938         }
4939         *p++ = '\'';
4940         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4941             *p++ = 'x';
4942         *p++ = '\0';
4943         DEBUG_COMPILE_r({
4944             PerlIO_printf(Perl_debug_log,
4945                 "%sre-parsing pattern for runtime code:%s %s\n",
4946                 PL_colors[4],PL_colors[5],newpat);
4947         });
4948
4949         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4950         Safefree(newpat);
4951
4952         ENTER;
4953         SAVETMPS;
4954         save_re_context();
4955         PUSHSTACKi(PERLSI_REQUIRE);
4956         /* this causes the toker to collapse \\ into \ when parsing
4957          * qr''; normally only q'' does this. It also alters hints
4958          * handling */
4959         PL_reg_state.re_reparsing = TRUE;
4960         eval_sv(sv, G_SCALAR);
4961         SvREFCNT_dec_NN(sv);
4962         SPAGAIN;
4963         qr_ref = POPs;
4964         PUTBACK;
4965         {
4966             SV * const errsv = ERRSV;
4967             if (SvTRUE_NN(errsv))
4968             {
4969                 Safefree(pRExC_state->code_blocks);
4970                 /* use croak_sv ? */
4971                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4972             }
4973         }
4974         assert(SvROK(qr_ref));
4975         qr = SvRV(qr_ref);
4976         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
4977         /* the leaving below frees the tmp qr_ref.
4978          * Give qr a life of its own */
4979         SvREFCNT_inc(qr);
4980         POPSTACK;
4981         FREETMPS;
4982         LEAVE;
4983
4984     }
4985
4986     if (!RExC_utf8 && SvUTF8(qr)) {
4987         /* first time through; the pattern got upgraded; save the
4988          * qr for the next time through */
4989         assert(!pRExC_state->runtime_code_qr);
4990         pRExC_state->runtime_code_qr = qr;
4991         return 0;
4992     }
4993
4994
4995     /* extract any code blocks within the returned qr//  */
4996
4997
4998     /* merge the main (r1) and run-time (r2) code blocks into one */
4999     {
5000         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5001         struct reg_code_block *new_block, *dst;
5002         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5003         int i1 = 0, i2 = 0;
5004
5005         if (!r2->num_code_blocks) /* we guessed wrong */
5006         {
5007             SvREFCNT_dec_NN(qr);
5008             return 1;
5009         }
5010
5011         Newx(new_block,
5012             r1->num_code_blocks + r2->num_code_blocks,
5013             struct reg_code_block);
5014         dst = new_block;
5015
5016         while (    i1 < r1->num_code_blocks
5017                 || i2 < r2->num_code_blocks)
5018         {
5019             struct reg_code_block *src;
5020             bool is_qr = 0;
5021
5022             if (i1 == r1->num_code_blocks) {
5023                 src = &r2->code_blocks[i2++];
5024                 is_qr = 1;
5025             }
5026             else if (i2 == r2->num_code_blocks)
5027                 src = &r1->code_blocks[i1++];
5028             else if (  r1->code_blocks[i1].start
5029                      < r2->code_blocks[i2].start)
5030             {
5031                 src = &r1->code_blocks[i1++];
5032                 assert(src->end < r2->code_blocks[i2].start);
5033             }
5034             else {
5035                 assert(  r1->code_blocks[i1].start
5036                        > r2->code_blocks[i2].start);
5037                 src = &r2->code_blocks[i2++];
5038                 is_qr = 1;
5039                 assert(src->end < r1->code_blocks[i1].start);
5040             }
5041
5042             assert(pat[src->start] == '(');
5043             assert(pat[src->end]   == ')');
5044             dst->start      = src->start;
5045             dst->end        = src->end;
5046             dst->block      = src->block;
5047             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5048                                     : src->src_regex;
5049             dst++;
5050         }
5051         r1->num_code_blocks += r2->num_code_blocks;
5052         Safefree(r1->code_blocks);
5053         r1->code_blocks = new_block;
5054     }
5055
5056     SvREFCNT_dec_NN(qr);
5057     return 1;
5058 }
5059
5060
5061 STATIC bool
5062 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5063 {
5064     /* This is the common code for setting up the floating and fixed length
5065      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5066      * as to whether succeeded or not */
5067
5068     I32 t,ml;
5069
5070     if (! (longest_length
5071            || (eol /* Can't have SEOL and MULTI */
5072                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5073           )
5074             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5075         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5076     {
5077         return FALSE;
5078     }
5079
5080     /* copy the information about the longest from the reg_scan_data
5081         over to the program. */
5082     if (SvUTF8(sv_longest)) {
5083         *rx_utf8 = sv_longest;
5084         *rx_substr = NULL;
5085     } else {
5086         *rx_substr = sv_longest;
5087         *rx_utf8 = NULL;
5088     }
5089     /* end_shift is how many chars that must be matched that
5090         follow this item. We calculate it ahead of time as once the
5091         lookbehind offset is added in we lose the ability to correctly
5092         calculate it.*/
5093     ml = minlen ? *(minlen) : (I32)longest_length;
5094     *rx_end_shift = ml - offset
5095         - longest_length + (SvTAIL(sv_longest) != 0)
5096         + lookbehind;
5097
5098     t = (eol/* Can't have SEOL and MULTI */
5099          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5100     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5101
5102     return TRUE;
5103 }
5104
5105 /*
5106  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5107  * regular expression into internal code.
5108  * The pattern may be passed either as:
5109  *    a list of SVs (patternp plus pat_count)
5110  *    a list of OPs (expr)
5111  * If both are passed, the SV list is used, but the OP list indicates
5112  * which SVs are actually pre-compiled code blocks
5113  *
5114  * The SVs in the list have magic and qr overloading applied to them (and
5115  * the list may be modified in-place with replacement SVs in the latter
5116  * case).
5117  *
5118  * If the pattern hasn't changed from old_re, then old_re will be
5119  * returned.
5120  *
5121  * eng is the current engine. If that engine has an op_comp method, then
5122  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5123  * do the initial concatenation of arguments and pass on to the external
5124  * engine.
5125  *
5126  * If is_bare_re is not null, set it to a boolean indicating whether the
5127  * arg list reduced (after overloading) to a single bare regex which has
5128  * been returned (i.e. /$qr/).
5129  *
5130  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5131  *
5132  * pm_flags contains the PMf_* flags, typically based on those from the
5133  * pm_flags field of the related PMOP. Currently we're only interested in
5134  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5135  *
5136  * We can't allocate space until we know how big the compiled form will be,
5137  * but we can't compile it (and thus know how big it is) until we've got a
5138  * place to put the code.  So we cheat:  we compile it twice, once with code
5139  * generation turned off and size counting turned on, and once "for real".
5140  * This also means that we don't allocate space until we are sure that the
5141  * thing really will compile successfully, and we never have to move the
5142  * code and thus invalidate pointers into it.  (Note that it has to be in
5143  * one piece because free() must be able to free it all.) [NB: not true in perl]
5144  *
5145  * Beware that the optimization-preparation code in here knows about some
5146  * of the structure of the compiled regexp.  [I'll say.]
5147  */
5148
5149 REGEXP *
5150 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5151                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5152                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5153 {
5154     dVAR;
5155     REGEXP *rx;
5156     struct regexp *r;
5157     regexp_internal *ri;
5158     STRLEN plen;
5159     char  * VOL exp;
5160     char* xend;
5161     regnode *scan;
5162     I32 flags;
5163     I32 minlen = 0;
5164     U32 rx_flags;
5165     SV * VOL pat;
5166     SV * VOL code_blocksv = NULL;
5167
5168     /* these are all flags - maybe they should be turned
5169      * into a single int with different bit masks */
5170     I32 sawlookahead = 0;
5171     I32 sawplus = 0;
5172     I32 sawopen = 0;
5173     bool used_setjump = FALSE;
5174     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5175     bool code_is_utf8 = 0;
5176     bool VOL recompile = 0;
5177     bool runtime_code = 0;
5178     U8 jump_ret = 0;
5179     dJMPENV;
5180     scan_data_t data;
5181     RExC_state_t RExC_state;
5182     RExC_state_t * const pRExC_state = &RExC_state;
5183 #ifdef TRIE_STUDY_OPT
5184     int restudied;
5185     RExC_state_t copyRExC_state;
5186 #endif
5187     GET_RE_DEBUG_FLAGS_DECL;
5188
5189     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5190
5191     DEBUG_r(if (!PL_colorset) reginitcolors());
5192
5193 #ifndef PERL_IN_XSUB_RE
5194     /* Initialize these here instead of as-needed, as is quick and avoids
5195      * having to test them each time otherwise */
5196     if (! PL_AboveLatin1) {
5197         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5198         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5199         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5200
5201         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5202                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5203         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5204                                 = _new_invlist_C_array(PosixAlnum_invlist);
5205
5206         PL_L1Posix_ptrs[_CC_ALPHA]
5207                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5208         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5209
5210         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5211         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5212
5213         /* Cased is the same as Alpha in the ASCII range */
5214         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5215         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5216
5217         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5218         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5219
5220         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5221         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5222
5223         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5224         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5225
5226         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5227         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5228
5229         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5230         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5231
5232         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5233         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5234
5235         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5236         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5237         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5238         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5239
5240         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5241         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5242
5243         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5244
5245         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5246         PL_L1Posix_ptrs[_CC_WORDCHAR]
5247                                 = _new_invlist_C_array(L1PosixWord_invlist);
5248
5249         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5250         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5251
5252         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5253     }
5254 #endif
5255
5256     pRExC_state->code_blocks = NULL;
5257     pRExC_state->num_code_blocks = 0;
5258
5259     if (is_bare_re)
5260         *is_bare_re = FALSE;
5261
5262     if (expr && (expr->op_type == OP_LIST ||
5263                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5264
5265         /* is the source UTF8, and how many code blocks are there? */
5266         OP *o;
5267         int ncode = 0;
5268
5269         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5270             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5271                 code_is_utf8 = 1;
5272             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5273                 /* count of DO blocks */
5274                 ncode++;
5275         }
5276         if (ncode) {
5277             pRExC_state->num_code_blocks = ncode;
5278             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5279         }
5280     }
5281
5282     if (pat_count) {
5283         /* handle a list of SVs */
5284
5285         SV **svp;
5286
5287         /* apply magic and RE overloading to each arg */
5288         for (svp = patternp; svp < patternp + pat_count; svp++) {
5289             SV *rx = *svp;
5290             SvGETMAGIC(rx);
5291             if (SvROK(rx) && SvAMAGIC(rx)) {
5292                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5293                 if (sv) {
5294                     if (SvROK(sv))
5295                         sv = SvRV(sv);
5296                     if (SvTYPE(sv) != SVt_REGEXP)
5297                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5298                     *svp = sv;
5299                 }
5300             }
5301         }
5302
5303         if (pat_count > 1) {
5304             /* concat multiple args and find any code block indexes */
5305
5306             OP *o = NULL;
5307             int n = 0;
5308             bool utf8 = 0;
5309             STRLEN orig_patlen = 0;
5310
5311             if (pRExC_state->num_code_blocks) {
5312                 o = cLISTOPx(expr)->op_first;
5313                 assert(   o->op_type == OP_PUSHMARK
5314                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5315                        || o->op_type == OP_PADRANGE);
5316                 o = o->op_sibling;
5317             }
5318
5319             pat = newSVpvn("", 0);
5320             SAVEFREESV(pat);
5321
5322             /* determine if the pattern is going to be utf8 (needed
5323              * in advance to align code block indices correctly).
5324              * XXX This could fail to be detected for an arg with
5325              * overloading but not concat overloading; but the main effect
5326              * in this obscure case is to need a 'use re eval' for a
5327              * literal code block */
5328             for (svp = patternp; svp < patternp + pat_count; svp++) {
5329                 if (SvUTF8(*svp))
5330                     utf8 = 1;
5331             }
5332             if (utf8)
5333                 SvUTF8_on(pat);
5334
5335             for (svp = patternp; svp < patternp + pat_count; svp++) {
5336                 SV *sv, *msv = *svp;
5337                 SV *rx;
5338                 bool code = 0;
5339                 /* we make the assumption here that each op in the list of
5340                  * op_siblings maps to one SV pushed onto the stack,
5341                  * except for code blocks, with have both an OP_NULL and
5342                  * and OP_CONST.
5343                  * This allows us to match up the list of SVs against the
5344                  * list of OPs to find the next code block.
5345                  *
5346                  * Note that       PUSHMARK PADSV PADSV ..
5347                  * is optimised to
5348                  *                 PADRANGE NULL  NULL  ..
5349                  * so the alignment still works. */
5350                 if (o) {
5351                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5352                         assert(n < pRExC_state->num_code_blocks);
5353                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5354                         pRExC_state->code_blocks[n].block = o;
5355                         pRExC_state->code_blocks[n].src_regex = NULL;
5356                         n++;
5357                         code = 1;
5358                         o = o->op_sibling; /* skip CONST */
5359                         assert(o);
5360                     }
5361                     o = o->op_sibling;;
5362                 }
5363
5364                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5365                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5366                 {
5367                     sv_setsv(pat, sv);
5368                     /* overloading involved: all bets are off over literal
5369                      * code. Pretend we haven't seen it */
5370                     pRExC_state->num_code_blocks -= n;
5371                     n = 0;
5372                     rx = NULL;
5373
5374                 }
5375                 else  {
5376                     while (SvAMAGIC(msv)
5377                             && (sv = AMG_CALLunary(msv, string_amg))
5378                             && sv != msv
5379                             &&  !(   SvROK(msv)
5380                                   && SvROK(sv)
5381                                   && SvRV(msv) == SvRV(sv))
5382                     ) {
5383                         msv = sv;
5384                         SvGETMAGIC(msv);
5385                     }
5386                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5387                         msv = SvRV(msv);
5388                     orig_patlen = SvCUR(pat);
5389                     sv_catsv_nomg(pat, msv);
5390                     rx = msv;
5391                     if (code)
5392                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5393                 }
5394
5395                 /* extract any code blocks within any embedded qr//'s */
5396                 if (rx && SvTYPE(rx) == SVt_REGEXP
5397                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5398                 {
5399
5400                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5401                     if (ri->num_code_blocks) {
5402                         int i;
5403                         /* the presence of an embedded qr// with code means
5404                          * we should always recompile: the text of the
5405                          * qr// may not have changed, but it may be a
5406                          * different closure than last time */
5407                         recompile = 1;
5408                         Renew(pRExC_state->code_blocks,
5409                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5410                             struct reg_code_block);
5411                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5412                         for (i=0; i < ri->num_code_blocks; i++) {
5413                             struct reg_code_block *src, *dst;
5414                             STRLEN offset =  orig_patlen
5415                                 + ReANY((REGEXP *)rx)->pre_prefix;
5416                             assert(n < pRExC_state->num_code_blocks);
5417                             src = &ri->code_blocks[i];
5418                             dst = &pRExC_state->code_blocks[n];
5419                             dst->start      = src->start + offset;
5420                             dst->end        = src->end   + offset;
5421                             dst->block      = src->block;
5422                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5423                                                     src->src_regex
5424                                                         ? src->src_regex
5425                                                         : (REGEXP*)rx);
5426                             n++;
5427                         }
5428                     }
5429                 }
5430             }
5431             SvSETMAGIC(pat);
5432         }
5433         else {
5434             SV *sv;
5435             pat = *patternp;
5436             while (SvAMAGIC(pat)
5437                     && (sv = AMG_CALLunary(pat, string_amg))
5438                     && sv != pat)
5439             {
5440                 pat = sv;
5441                 SvGETMAGIC(pat);
5442             }
5443         }
5444
5445         /* handle bare regex: foo =~ $re */
5446         {
5447             SV *re = pat;
5448             if (SvROK(re))
5449                 re = SvRV(re);
5450             if (SvTYPE(re) == SVt_REGEXP) {
5451                 if (is_bare_re)
5452                     *is_bare_re = TRUE;
5453                 SvREFCNT_inc(re);
5454                 Safefree(pRExC_state->code_blocks);
5455                 return (REGEXP*)re;
5456             }
5457         }
5458     }
5459     else {
5460         /* not a list of SVs, so must be a list of OPs */
5461         assert(expr);
5462         if (expr->op_type == OP_LIST) {
5463             int i = -1;
5464             bool is_code = 0;
5465             OP *o;
5466
5467             pat = newSVpvn("", 0);
5468             SAVEFREESV(pat);
5469             if (code_is_utf8)
5470                 SvUTF8_on(pat);
5471
5472             /* given a list of CONSTs and DO blocks in expr, append all
5473              * the CONSTs to pat, and record the start and end of each
5474              * code block in code_blocks[] (each DO{} op is followed by an
5475              * OP_CONST containing the corresponding literal '(?{...})
5476              * text)
5477              */
5478             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5479                 if (o->op_type == OP_CONST) {
5480                     sv_catsv(pat, cSVOPo_sv);
5481                     if (is_code) {
5482                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5483                         is_code = 0;
5484                     }
5485                 }
5486                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5487                     assert(i+1 < pRExC_state->num_code_blocks);
5488                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5489                     pRExC_state->code_blocks[i].block = o;
5490                     pRExC_state->code_blocks[i].src_regex = NULL;
5491                     is_code = 1;
5492                 }
5493             }
5494         }
5495         else {
5496             assert(expr->op_type == OP_CONST);
5497             pat = cSVOPx_sv(expr);
5498         }
5499     }
5500
5501     exp = SvPV_nomg(pat, plen);
5502
5503     if (!eng->op_comp) {
5504         if ((SvUTF8(pat) && IN_BYTES)
5505                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5506         {
5507             /* make a temporary copy; either to convert to bytes,
5508              * or to avoid repeating get-magic / overloaded stringify */
5509             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5510                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5511         }
5512         Safefree(pRExC_state->code_blocks);
5513         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5514     }
5515
5516     /* ignore the utf8ness if the pattern is 0 length */
5517     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5518     RExC_uni_semantics = 0;
5519     RExC_contains_locale = 0;
5520     pRExC_state->runtime_code_qr = NULL;
5521
5522     /****************** LONG JUMP TARGET HERE***********************/
5523     /* Longjmp back to here if have to switch in midstream to utf8 */
5524     if (! RExC_orig_utf8) {
5525         JMPENV_PUSH(jump_ret);
5526         used_setjump = TRUE;
5527     }
5528
5529     if (jump_ret == 0) {    /* First time through */
5530         xend = exp + plen;
5531
5532         DEBUG_COMPILE_r({
5533             SV *dsv= sv_newmortal();
5534             RE_PV_QUOTED_DECL(s, RExC_utf8,
5535                 dsv, exp, plen, 60);
5536             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5537                            PL_colors[4],PL_colors[5],s);
5538         });
5539     }
5540     else {  /* longjumped back */
5541         U8 *src, *dst;
5542         int n=0;
5543         STRLEN s = 0, d = 0;
5544         bool do_end = 0;
5545
5546         /* If the cause for the longjmp was other than changing to utf8, pop
5547          * our own setjmp, and longjmp to the correct handler */
5548         if (jump_ret != UTF8_LONGJMP) {
5549             JMPENV_POP;
5550             JMPENV_JUMP(jump_ret);
5551         }
5552
5553         GET_RE_DEBUG_FLAGS;
5554
5555         /* It's possible to write a regexp in ascii that represents Unicode
5556         codepoints outside of the byte range, such as via \x{100}. If we
5557         detect such a sequence we have to convert the entire pattern to utf8
5558         and then recompile, as our sizing calculation will have been based
5559         on 1 byte == 1 character, but we will need to use utf8 to encode
5560         at least some part of the pattern, and therefore must convert the whole
5561         thing.
5562         -- dmq */
5563         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5564             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5565
5566         /* upgrade pattern to UTF8, and if there are code blocks,
5567          * recalculate the indices.
5568          * This is essentially an unrolled Perl_bytes_to_utf8() */
5569
5570         src = (U8*)SvPV_nomg(pat, plen);
5571         Newx(dst, plen * 2 + 1, U8);
5572
5573         while (s < plen) {
5574             const UV uv = NATIVE_TO_ASCII(src[s]);
5575             if (UNI_IS_INVARIANT(uv))
5576                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5577             else {
5578                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5579                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5580             }
5581             if (n < pRExC_state->num_code_blocks) {
5582                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5583                     pRExC_state->code_blocks[n].start = d;
5584                     assert(dst[d] == '(');
5585                     do_end = 1;
5586                 }
5587                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5588                     pRExC_state->code_blocks[n].end = d;
5589                     assert(dst[d] == ')');
5590                     do_end = 0;
5591                     n++;
5592                 }
5593             }
5594             s++;
5595             d++;
5596         }
5597         dst[d] = '\0';
5598         plen = d;
5599         exp = (char*) dst;
5600         xend = exp + plen;
5601         SAVEFREEPV(exp);
5602         RExC_orig_utf8 = RExC_utf8 = 1;
5603     }
5604
5605     /* return old regex if pattern hasn't changed */
5606
5607     if (   old_re
5608         && !recompile
5609         && !!RX_UTF8(old_re) == !!RExC_utf8
5610         && RX_PRECOMP(old_re)
5611         && RX_PRELEN(old_re) == plen
5612         && memEQ(RX_PRECOMP(old_re), exp, plen))
5613     {
5614         /* with runtime code, always recompile */
5615         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5616                                             exp, plen);
5617         if (!runtime_code) {
5618             if (used_setjump) {
5619                 JMPENV_POP;
5620             }
5621             Safefree(pRExC_state->code_blocks);
5622             return old_re;
5623         }
5624     }
5625     else if ((pm_flags & PMf_USE_RE_EVAL)
5626                 /* this second condition covers the non-regex literal case,
5627                  * i.e.  $foo =~ '(?{})'. */
5628                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5629                     && (PL_hints & HINT_RE_EVAL))
5630     )
5631         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5632                             exp, plen);
5633
5634 #ifdef TRIE_STUDY_OPT
5635     restudied = 0;
5636 #endif
5637
5638     rx_flags = orig_rx_flags;
5639
5640     if (initial_charset == REGEX_LOCALE_CHARSET) {
5641         RExC_contains_locale = 1;
5642     }
5643     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5644
5645         /* Set to use unicode semantics if the pattern is in utf8 and has the
5646          * 'depends' charset specified, as it means unicode when utf8  */
5647         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5648     }
5649
5650     RExC_precomp = exp;
5651     RExC_flags = rx_flags;
5652     RExC_pm_flags = pm_flags;
5653
5654     if (runtime_code) {
5655         if (TAINTING_get && TAINT_get)
5656             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5657
5658         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5659             /* whoops, we have a non-utf8 pattern, whilst run-time code
5660              * got compiled as utf8. Try again with a utf8 pattern */
5661              JMPENV_JUMP(UTF8_LONGJMP);
5662         }
5663     }
5664     assert(!pRExC_state->runtime_code_qr);
5665
5666     RExC_sawback = 0;
5667
5668     RExC_seen = 0;
5669     RExC_in_lookbehind = 0;
5670     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5671     RExC_extralen = 0;
5672     RExC_override_recoding = 0;
5673     RExC_in_multi_char_class = 0;
5674
5675     /* First pass: determine size, legality. */
5676     RExC_parse = exp;
5677     RExC_start = exp;
5678     RExC_end = xend;
5679     RExC_naughty = 0;
5680     RExC_npar = 1;
5681     RExC_nestroot = 0;
5682     RExC_size = 0L;
5683     RExC_emit = &PL_regdummy;
5684     RExC_whilem_seen = 0;
5685     RExC_open_parens = NULL;
5686     RExC_close_parens = NULL;
5687     RExC_opend = NULL;
5688     RExC_paren_names = NULL;
5689 #ifdef DEBUGGING
5690     RExC_paren_name_list = NULL;
5691 #endif
5692     RExC_recurse = NULL;
5693     RExC_recurse_count = 0;
5694     pRExC_state->code_index = 0;
5695
5696 #if 0 /* REGC() is (currently) a NOP at the first pass.
5697        * Clever compilers notice this and complain. --jhi */
5698     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5699 #endif
5700     DEBUG_PARSE_r(
5701         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5702         RExC_lastnum=0;
5703         RExC_lastparse=NULL;
5704     );
5705     /* reg may croak on us, not giving us a chance to free
5706        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5707        need it to survive as long as the regexp (qr/(?{})/).
5708        We must check that code_blocksv is not already set, because we may
5709        have longjmped back. */
5710     if (pRExC_state->code_blocks && !code_blocksv) {
5711         code_blocksv = newSV_type(SVt_PV);
5712         SAVEFREESV(code_blocksv);
5713         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5714         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5715     }
5716     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5717         RExC_precomp = NULL;
5718         return(NULL);
5719     }
5720     if (code_blocksv)
5721         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5722
5723     /* Here, finished first pass.  Get rid of any added setjmp */
5724     if (used_setjump) {
5725         JMPENV_POP;
5726     }
5727
5728     DEBUG_PARSE_r({
5729         PerlIO_printf(Perl_debug_log,
5730             "Required size %"IVdf" nodes\n"
5731             "Starting second pass (creation)\n",
5732             (IV)RExC_size);
5733         RExC_lastnum=0;
5734         RExC_lastparse=NULL;
5735     });
5736
5737     /* The first pass could have found things that force Unicode semantics */
5738     if ((RExC_utf8 || RExC_uni_semantics)
5739          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5740     {
5741         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5742     }
5743
5744     /* Small enough for pointer-storage convention?
5745        If extralen==0, this means that we will not need long jumps. */
5746     if (RExC_size >= 0x10000L && RExC_extralen)
5747         RExC_size += RExC_extralen;
5748     else
5749         RExC_extralen = 0;
5750     if (RExC_whilem_seen > 15)
5751         RExC_whilem_seen = 15;
5752
5753     /* Allocate space and zero-initialize. Note, the two step process
5754        of zeroing when in debug mode, thus anything assigned has to
5755        happen after that */
5756     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5757     r = ReANY(rx);
5758     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5759          char, regexp_internal);
5760     if ( r == NULL || ri == NULL )
5761         FAIL("Regexp out of space");
5762 #ifdef DEBUGGING
5763     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5764     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5765 #else
5766     /* bulk initialize base fields with 0. */
5767     Zero(ri, sizeof(regexp_internal), char);
5768 #endif
5769
5770     /* non-zero initialization begins here */
5771     RXi_SET( r, ri );
5772     r->engine= eng;
5773     r->extflags = rx_flags;
5774     if (pm_flags & PMf_IS_QR) {
5775         ri->code_blocks = pRExC_state->code_blocks;
5776         ri->num_code_blocks = pRExC_state->num_code_blocks;
5777     }
5778     else
5779     {
5780         int n;
5781         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5782             if (pRExC_state->code_blocks[n].src_regex)
5783                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5784         SAVEFREEPV(pRExC_state->code_blocks);
5785     }
5786
5787     {
5788         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5789         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5790
5791         /* The caret is output if there are any defaults: if not all the STD
5792          * flags are set, or if no character set specifier is needed */
5793         bool has_default =
5794                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5795                     || ! has_charset);
5796         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5797         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5798                             >> RXf_PMf_STD_PMMOD_SHIFT);
5799         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5800         char *p;
5801         /* Allocate for the worst case, which is all the std flags are turned
5802          * on.  If more precision is desired, we could do a population count of
5803          * the flags set.  This could be done with a small lookup table, or by
5804          * shifting, masking and adding, or even, when available, assembly
5805          * language for a machine-language population count.
5806          * We never output a minus, as all those are defaults, so are
5807          * covered by the caret */
5808         const STRLEN wraplen = plen + has_p + has_runon
5809             + has_default       /* If needs a caret */
5810
5811                 /* If needs a character set specifier */
5812             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5813             + (sizeof(STD_PAT_MODS) - 1)
5814             + (sizeof("(?:)") - 1);
5815
5816         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5817         r->xpv_len_u.xpvlenu_pv = p;
5818         if (RExC_utf8)
5819             SvFLAGS(rx) |= SVf_UTF8;
5820         *p++='('; *p++='?';
5821
5822         /* If a default, cover it using the caret */
5823         if (has_default) {
5824             *p++= DEFAULT_PAT_MOD;
5825         }
5826         if (has_charset) {
5827             STRLEN len;
5828             const char* const name = get_regex_charset_name(r->extflags, &len);
5829             Copy(name, p, len, char);
5830             p += len;
5831         }
5832         if (has_p)
5833             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5834         {
5835             char ch;
5836             while((ch = *fptr++)) {
5837                 if(reganch & 1)
5838                     *p++ = ch;
5839                 reganch >>= 1;
5840             }
5841         }
5842
5843         *p++ = ':';
5844         Copy(RExC_precomp, p, plen, char);
5845         assert ((RX_WRAPPED(rx) - p) < 16);
5846         r->pre_prefix = p - RX_WRAPPED(rx);
5847         p += plen;
5848         if (has_runon)
5849             *p++ = '\n';
5850         *p++ = ')';
5851         *p = 0;
5852         SvCUR_set(rx, p - RX_WRAPPED(rx));
5853     }
5854
5855     r->intflags = 0;
5856     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5857
5858     if (RExC_seen & REG_SEEN_RECURSE) {
5859         Newxz(RExC_open_parens, RExC_npar,regnode *);
5860         SAVEFREEPV(RExC_open_parens);
5861         Newxz(RExC_close_parens,RExC_npar,regnode *);
5862         SAVEFREEPV(RExC_close_parens);
5863     }
5864
5865     /* Useful during FAIL. */
5866 #ifdef RE_TRACK_PATTERN_OFFSETS
5867     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5868     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5869                           "%s %"UVuf" bytes for offset annotations.\n",
5870                           ri->u.offsets ? "Got" : "Couldn't get",
5871                           (UV)((2*RExC_size+1) * sizeof(U32))));
5872 #endif
5873     SetProgLen(ri,RExC_size);
5874     RExC_rx_sv = rx;
5875     RExC_rx = r;
5876     RExC_rxi = ri;
5877
5878     /* Second pass: emit code. */
5879     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5880     RExC_pm_flags = pm_flags;
5881     RExC_parse = exp;
5882     RExC_end = xend;
5883     RExC_naughty = 0;
5884     RExC_npar = 1;
5885     RExC_emit_start = ri->program;
5886     RExC_emit = ri->program;
5887     RExC_emit_bound = ri->program + RExC_size + 1;
5888     pRExC_state->code_index = 0;
5889
5890     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5891     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5892         ReREFCNT_dec(rx);
5893         return(NULL);
5894     }
5895     /* XXXX To minimize changes to RE engine we always allocate
5896        3-units-long substrs field. */
5897     Newx(r->substrs, 1, struct reg_substr_data);
5898     if (RExC_recurse_count) {
5899         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5900         SAVEFREEPV(RExC_recurse);
5901     }
5902
5903 reStudy:
5904     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5905     Zero(r->substrs, 1, struct reg_substr_data);
5906
5907 #ifdef TRIE_STUDY_OPT
5908     if (!restudied) {
5909         StructCopy(&zero_scan_data, &data, scan_data_t);
5910         copyRExC_state = RExC_state;
5911     } else {
5912         U32 seen=RExC_seen;
5913         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5914
5915         RExC_state = copyRExC_state;
5916         if (seen & REG_TOP_LEVEL_BRANCHES)
5917             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5918         else
5919             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5920         StructCopy(&zero_scan_data, &data, scan_data_t);
5921     }
5922 #else
5923     StructCopy(&zero_scan_data, &data, scan_data_t);
5924 #endif
5925
5926     /* Dig out information for optimizations. */
5927     r->extflags = RExC_flags; /* was pm_op */
5928     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5929
5930     if (UTF)
5931         SvUTF8_on(rx);  /* Unicode in it? */
5932     ri->regstclass = NULL;
5933     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5934         r->intflags |= PREGf_NAUGHTY;
5935     scan = ri->program + 1;             /* First BRANCH. */
5936
5937     /* testing for BRANCH here tells us whether there is "must appear"
5938        data in the pattern. If there is then we can use it for optimisations */
5939     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5940         I32 fake;
5941         STRLEN longest_float_length, longest_fixed_length;
5942         struct regnode_charclass_class ch_class; /* pointed to by data */
5943         int stclass_flag;
5944         I32 last_close = 0; /* pointed to by data */
5945         regnode *first= scan;
5946         regnode *first_next= regnext(first);
5947         /*
5948          * Skip introductions and multiplicators >= 1
5949          * so that we can extract the 'meat' of the pattern that must
5950          * match in the large if() sequence following.
5951          * NOTE that EXACT is NOT covered here, as it is normally
5952          * picked up by the optimiser separately.
5953          *
5954          * This is unfortunate as the optimiser isnt handling lookahead
5955          * properly currently.
5956          *
5957          */
5958         while ((OP(first) == OPEN && (sawopen = 1)) ||
5959                /* An OR of *one* alternative - should not happen now. */
5960             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5961             /* for now we can't handle lookbehind IFMATCH*/
5962             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5963             (OP(first) == PLUS) ||
5964             (OP(first) == MINMOD) ||
5965                /* An {n,m} with n>0 */
5966             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5967             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5968         {
5969                 /*
5970                  * the only op that could be a regnode is PLUS, all the rest
5971                  * will be regnode_1 or regnode_2.
5972                  *
5973                  */
5974                 if (OP(first) == PLUS)
5975                     sawplus = 1;
5976                 else
5977                     first += regarglen[OP(first)];
5978
5979                 first = NEXTOPER(first);
5980                 first_next= regnext(first);
5981         }
5982
5983         /* Starting-point info. */
5984       again:
5985         DEBUG_PEEP("first:",first,0);
5986         /* Ignore EXACT as we deal with it later. */
5987         if (PL_regkind[OP(first)] == EXACT) {
5988             if (OP(first) == EXACT)
5989                 NOOP;   /* Empty, get anchored substr later. */
5990             else
5991                 ri->regstclass = first;
5992         }
5993 #ifdef TRIE_STCLASS
5994         else if (PL_regkind[OP(first)] == TRIE &&
5995                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5996         {
5997             regnode *trie_op;
5998             /* this can happen only on restudy */
5999             if ( OP(first) == TRIE ) {
6000                 struct regnode_1 *trieop = (struct regnode_1 *)
6001                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6002                 StructCopy(first,trieop,struct regnode_1);
6003                 trie_op=(regnode *)trieop;
6004             } else {
6005                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6006                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6007                 StructCopy(first,trieop,struct regnode_charclass);
6008                 trie_op=(regnode *)trieop;
6009             }
6010             OP(trie_op)+=2;
6011             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6012             ri->regstclass = trie_op;
6013         }
6014 #endif
6015         else if (REGNODE_SIMPLE(OP(first)))
6016             ri->regstclass = first;
6017         else if (PL_regkind[OP(first)] == BOUND ||
6018                  PL_regkind[OP(first)] == NBOUND)
6019             ri->regstclass = first;
6020         else if (PL_regkind[OP(first)] == BOL) {
6021             r->extflags |= (OP(first) == MBOL
6022                            ? RXf_ANCH_MBOL
6023                            : (OP(first) == SBOL
6024                               ? RXf_ANCH_SBOL
6025                               : RXf_ANCH_BOL));
6026             first = NEXTOPER(first);
6027             goto again;
6028         }
6029         else if (OP(first) == GPOS) {
6030             r->extflags |= RXf_ANCH_GPOS;
6031             first = NEXTOPER(first);
6032             goto again;
6033         }
6034         else if ((!sawopen || !RExC_sawback) &&
6035             (OP(first) == STAR &&
6036             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6037             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6038         {
6039             /* turn .* into ^.* with an implied $*=1 */
6040             const int type =
6041                 (OP(NEXTOPER(first)) == REG_ANY)
6042                     ? RXf_ANCH_MBOL
6043                     : RXf_ANCH_SBOL;
6044             r->extflags |= type;
6045             r->intflags |= PREGf_IMPLICIT;
6046             first = NEXTOPER(first);
6047             goto again;
6048         }
6049         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6050             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6051             /* x+ must match at the 1st pos of run of x's */
6052             r->intflags |= PREGf_SKIP;
6053
6054         /* Scan is after the zeroth branch, first is atomic matcher. */
6055 #ifdef TRIE_STUDY_OPT
6056         DEBUG_PARSE_r(
6057             if (!restudied)
6058                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6059                               (IV)(first - scan + 1))
6060         );
6061 #else
6062         DEBUG_PARSE_r(
6063             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6064                 (IV)(first - scan + 1))
6065         );
6066 #endif
6067
6068
6069         /*
6070         * If there's something expensive in the r.e., find the
6071         * longest literal string that must appear and make it the
6072         * regmust.  Resolve ties in favor of later strings, since
6073         * the regstart check works with the beginning of the r.e.
6074         * and avoiding duplication strengthens checking.  Not a
6075         * strong reason, but sufficient in the absence of others.
6076         * [Now we resolve ties in favor of the earlier string if
6077         * it happens that c_offset_min has been invalidated, since the
6078         * earlier string may buy us something the later one won't.]
6079         */
6080
6081         data.longest_fixed = newSVpvs("");
6082         data.longest_float = newSVpvs("");
6083         data.last_found = newSVpvs("");
6084         data.longest = &(data.longest_fixed);
6085         ENTER_with_name("study_chunk");
6086         SAVEFREESV(data.longest_fixed);
6087         SAVEFREESV(data.longest_float);
6088         SAVEFREESV(data.last_found);
6089         first = scan;
6090         if (!ri->regstclass) {
6091             cl_init(pRExC_state, &ch_class);
6092             data.start_class = &ch_class;
6093             stclass_flag = SCF_DO_STCLASS_AND;
6094         } else                          /* XXXX Check for BOUND? */
6095             stclass_flag = 0;
6096         data.last_closep = &last_close;
6097
6098         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6099             &data, -1, NULL, NULL,
6100             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6101
6102
6103         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6104
6105
6106         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6107              && data.last_start_min == 0 && data.last_end > 0
6108              && !RExC_seen_zerolen
6109              && !(RExC_seen & REG_SEEN_VERBARG)
6110              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6111             r->extflags |= RXf_CHECK_ALL;
6112         scan_commit(pRExC_state, &data,&minlen,0);
6113
6114         longest_float_length = CHR_SVLEN(data.longest_float);
6115
6116         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6117                    && data.offset_fixed == data.offset_float_min
6118                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6119             && S_setup_longest (aTHX_ pRExC_state,
6120                                     data.longest_float,
6121                                     &(r->float_utf8),
6122                                     &(r->float_substr),
6123                                     &(r->float_end_shift),
6124                                     data.lookbehind_float,
6125                                     data.offset_float_min,
6126                                     data.minlen_float,
6127                                     longest_float_length,
6128                                     data.flags & SF_FL_BEFORE_EOL,
6129                                     data.flags & SF_FL_BEFORE_MEOL))
6130         {
6131             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6132             r->float_max_offset = data.offset_float_max;
6133             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6134                 r->float_max_offset -= data.lookbehind_float;
6135             SvREFCNT_inc_simple_void_NN(data.longest_float);
6136         }
6137         else {
6138             r->float_substr = r->float_utf8 = NULL;
6139             longest_float_length = 0;
6140         }
6141
6142         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6143
6144         if (S_setup_longest (aTHX_ pRExC_state,
6145                                 data.longest_fixed,
6146                                 &(r->anchored_utf8),
6147                                 &(r->anchored_substr),
6148                                 &(r->anchored_end_shift),
6149                                 data.lookbehind_fixed,
6150                                 data.offset_fixed,
6151                                 data.minlen_fixed,
6152                                 longest_fixed_length,
6153                                 data.flags & SF_FIX_BEFORE_EOL,
6154                                 data.flags & SF_FIX_BEFORE_MEOL))
6155         {
6156             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6157             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6158         }
6159         else {
6160             r->anchored_substr = r->anchored_utf8 = NULL;
6161             longest_fixed_length = 0;
6162         }
6163         LEAVE_with_name("study_chunk");
6164
6165         if (ri->regstclass
6166             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6167             ri->regstclass = NULL;
6168
6169         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6170             && stclass_flag
6171             && ! TEST_SSC_EOS(data.start_class)
6172             && !cl_is_anything(data.start_class))
6173         {
6174             const U32 n = add_data(pRExC_state, 1, "f");
6175             OP(data.start_class) = ANYOF_SYNTHETIC;
6176
6177             Newx(RExC_rxi->data->data[n], 1,
6178                 struct regnode_charclass_class);
6179             StructCopy(data.start_class,
6180                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6181                        struct regnode_charclass_class);
6182             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6183             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6184             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6185                       regprop(r, sv, (regnode*)data.start_class);
6186                       PerlIO_printf(Perl_debug_log,
6187                                     "synthetic stclass \"%s\".\n",
6188                                     SvPVX_const(sv));});
6189         }
6190
6191         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6192         if (longest_fixed_length > longest_float_length) {
6193             r->check_end_shift = r->anchored_end_shift;
6194             r->check_substr = r->anchored_substr;
6195             r->check_utf8 = r->anchored_utf8;
6196             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6197             if (r->extflags & RXf_ANCH_SINGLE)
6198                 r->extflags |= RXf_NOSCAN;
6199         }
6200         else {
6201             r->check_end_shift = r->float_end_shift;
6202             r->check_substr = r->float_substr;
6203             r->check_utf8 = r->float_utf8;
6204             r->check_offset_min = r->float_min_offset;
6205             r->check_offset_max = r->float_max_offset;
6206         }
6207         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6208            This should be changed ASAP!  */
6209         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6210             r->extflags |= RXf_USE_INTUIT;
6211             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6212                 r->extflags |= RXf_INTUIT_TAIL;
6213         }
6214         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6215         if ( (STRLEN)minlen < longest_float_length )
6216             minlen= longest_float_length;
6217         if ( (STRLEN)minlen < longest_fixed_length )
6218             minlen= longest_fixed_length;
6219         */
6220     }
6221     else {
6222         /* Several toplevels. Best we can is to set minlen. */
6223         I32 fake;
6224         struct regnode_charclass_class ch_class;
6225         I32 last_close = 0;
6226
6227         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6228
6229         scan = ri->program + 1;
6230         cl_init(pRExC_state, &ch_class);
6231         data.start_class = &ch_class;
6232         data.last_closep = &last_close;
6233
6234
6235         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6236             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6237
6238         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6239
6240         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6241                 = r->float_substr = r->float_utf8 = NULL;
6242
6243         if (! TEST_SSC_EOS(data.start_class)
6244             && !cl_is_anything(data.start_class))
6245         {
6246             const U32 n = add_data(pRExC_state, 1, "f");
6247             OP(data.start_class) = ANYOF_SYNTHETIC;
6248
6249             Newx(RExC_rxi->data->data[n], 1,
6250                 struct regnode_charclass_class);
6251             StructCopy(data.start_class,
6252                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6253                        struct regnode_charclass_class);
6254             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6255             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6256             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6257                       regprop(r, sv, (regnode*)data.start_class);
6258                       PerlIO_printf(Perl_debug_log,
6259                                     "synthetic stclass \"%s\".\n",
6260                                     SvPVX_const(sv));});
6261         }
6262     }
6263
6264     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6265        the "real" pattern. */
6266     DEBUG_OPTIMISE_r({
6267         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6268                       (IV)minlen, (IV)r->minlen);
6269     });
6270     r->minlenret = minlen;
6271     if (r->minlen < minlen)
6272         r->minlen = minlen;
6273
6274     if (RExC_seen & REG_SEEN_GPOS)
6275         r->extflags |= RXf_GPOS_SEEN;
6276     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6277         r->extflags |= RXf_LOOKBEHIND_SEEN;
6278     if (pRExC_state->num_code_blocks)
6279         r->extflags |= RXf_EVAL_SEEN;
6280     if (RExC_seen & REG_SEEN_CANY)
6281         r->extflags |= RXf_CANY_SEEN;
6282     if (RExC_seen & REG_SEEN_VERBARG)
6283     {
6284         r->intflags |= PREGf_VERBARG_SEEN;
6285         r->extflags |= RXf_MODIFIES_VARS;
6286     }
6287     if (RExC_seen & REG_SEEN_CUTGROUP)
6288         r->intflags |= PREGf_CUTGROUP_SEEN;
6289     if (pm_flags & PMf_USE_RE_EVAL)
6290         r->intflags |= PREGf_USE_RE_EVAL;
6291     if (RExC_paren_names)
6292         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6293     else
6294         RXp_PAREN_NAMES(r) = NULL;
6295
6296 #ifdef STUPID_PATTERN_CHECKS
6297     if (RX_PRELEN(rx) == 0)
6298         r->extflags |= RXf_NULL;
6299     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6300         r->extflags |= RXf_WHITE;
6301     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6302         r->extflags |= RXf_START_ONLY;
6303 #else
6304     {
6305         regnode *first = ri->program + 1;
6306         U8 fop = OP(first);
6307
6308         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6309             r->extflags |= RXf_NULL;
6310         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6311             r->extflags |= RXf_START_ONLY;
6312         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6313                              && OP(regnext(first)) == END)
6314             r->extflags |= RXf_WHITE;
6315     }
6316 #endif
6317 #ifdef DEBUGGING
6318     if (RExC_paren_names) {
6319         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6320         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6321     } else
6322 #endif
6323         ri->name_list_idx = 0;
6324
6325     if (RExC_recurse_count) {
6326         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6327             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6328             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6329         }
6330     }
6331     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6332     /* assume we don't need to swap parens around before we match */
6333
6334     DEBUG_DUMP_r({
6335         PerlIO_printf(Perl_debug_log,"Final program:\n");
6336         regdump(r);
6337     });
6338 #ifdef RE_TRACK_PATTERN_OFFSETS
6339     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6340         const U32 len = ri->u.offsets[0];
6341         U32 i;
6342         GET_RE_DEBUG_FLAGS_DECL;
6343         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6344         for (i = 1; i <= len; i++) {
6345             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6346                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6347                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6348             }
6349         PerlIO_printf(Perl_debug_log, "\n");
6350     });
6351 #endif
6352     return rx;
6353 }
6354
6355
6356 SV*
6357 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6358                     const U32 flags)
6359 {
6360     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6361
6362     PERL_UNUSED_ARG(value);
6363
6364     if (flags & RXapif_FETCH) {
6365         return reg_named_buff_fetch(rx, key, flags);
6366     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6367         Perl_croak_no_modify();
6368         return NULL;
6369     } else if (flags & RXapif_EXISTS) {
6370         return reg_named_buff_exists(rx, key, flags)
6371             ? &PL_sv_yes
6372             : &PL_sv_no;
6373     } else if (flags & RXapif_REGNAMES) {
6374         return reg_named_buff_all(rx, flags);
6375     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6376         return reg_named_buff_scalar(rx, flags);
6377     } else {
6378         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6379         return NULL;
6380     }
6381 }
6382
6383 SV*
6384 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6385                          const U32 flags)
6386 {
6387     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6388     PERL_UNUSED_ARG(lastkey);
6389
6390     if (flags & RXapif_FIRSTKEY)
6391         return reg_named_buff_firstkey(rx, flags);
6392     else if (flags & RXapif_NEXTKEY)
6393         return reg_named_buff_nextkey(rx, flags);
6394     else {
6395         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6396         return NULL;
6397     }
6398 }
6399
6400 SV*
6401 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6402                           const U32 flags)
6403 {
6404     AV *retarray = NULL;
6405     SV *ret;
6406     struct regexp *const rx = ReANY(r);
6407
6408     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6409
6410     if (flags & RXapif_ALL)
6411         retarray=newAV();
6412
6413     if (rx && RXp_PAREN_NAMES(rx)) {
6414         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6415         if (he_str) {
6416             IV i;
6417             SV* sv_dat=HeVAL(he_str);
6418             I32 *nums=(I32*)SvPVX(sv_dat);
6419             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6420                 if ((I32)(rx->nparens) >= nums[i]
6421                     && rx->offs[nums[i]].start != -1
6422                     && rx->offs[nums[i]].end != -1)
6423                 {
6424                     ret = newSVpvs("");
6425                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6426                     if (!retarray)
6427                         return ret;
6428                 } else {
6429                     if (retarray)
6430                         ret = newSVsv(&PL_sv_undef);
6431                 }
6432                 if (retarray)
6433                     av_push(retarray, ret);
6434             }
6435             if (retarray)
6436                 return newRV_noinc(MUTABLE_SV(retarray));
6437         }
6438     }
6439     return NULL;
6440 }
6441
6442 bool
6443 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6444                            const U32 flags)
6445 {
6446     struct regexp *const rx = ReANY(r);
6447
6448     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6449
6450     if (rx && RXp_PAREN_NAMES(rx)) {
6451         if (flags & RXapif_ALL) {
6452             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6453         } else {
6454             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6455             if (sv) {
6456                 SvREFCNT_dec_NN(sv);
6457                 return TRUE;
6458             } else {
6459                 return FALSE;
6460             }
6461         }
6462     } else {
6463         return FALSE;
6464     }
6465 }
6466
6467 SV*
6468 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6469 {
6470     struct regexp *const rx = ReANY(r);
6471
6472     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6473
6474     if ( rx && RXp_PAREN_NAMES(rx) ) {
6475         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6476
6477         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6478     } else {
6479         return FALSE;
6480     }
6481 }
6482
6483 SV*
6484 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6485 {
6486     struct regexp *const rx = ReANY(r);
6487     GET_RE_DEBUG_FLAGS_DECL;
6488
6489     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6490
6491     if (rx && RXp_PAREN_NAMES(rx)) {
6492         HV *hv = RXp_PAREN_NAMES(rx);
6493         HE *temphe;
6494         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6495             IV i;
6496             IV parno = 0;
6497             SV* sv_dat = HeVAL(temphe);
6498             I32 *nums = (I32*)SvPVX(sv_dat);
6499             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6500                 if ((I32)(rx->lastparen) >= nums[i] &&
6501                     rx->offs[nums[i]].start != -1 &&
6502                     rx->offs[nums[i]].end != -1)
6503                 {
6504                     parno = nums[i];
6505                     break;
6506                 }
6507             }
6508             if (parno || flags & RXapif_ALL) {
6509                 return newSVhek(HeKEY_hek(temphe));
6510             }
6511         }
6512     }
6513     return NULL;
6514 }
6515
6516 SV*
6517 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6518 {
6519     SV *ret;
6520     AV *av;
6521     I32 length;
6522     struct regexp *const rx = ReANY(r);
6523
6524     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6525
6526     if (rx && RXp_PAREN_NAMES(rx)) {
6527         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6528             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6529         } else if (flags & RXapif_ONE) {
6530             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6531             av = MUTABLE_AV(SvRV(ret));
6532             length = av_len(av);
6533             SvREFCNT_dec_NN(ret);
6534             return newSViv(length + 1);
6535         } else {
6536             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6537             return NULL;
6538         }
6539     }
6540     return &PL_sv_undef;
6541 }
6542
6543 SV*
6544 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6545 {
6546     struct regexp *const rx = ReANY(r);
6547     AV *av = newAV();
6548
6549     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6550
6551     if (rx && RXp_PAREN_NAMES(rx)) {
6552         HV *hv= RXp_PAREN_NAMES(rx);
6553         HE *temphe;
6554         (void)hv_iterinit(hv);
6555         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6556             IV i;
6557             IV parno = 0;
6558             SV* sv_dat = HeVAL(temphe);
6559             I32 *nums = (I32*)SvPVX(sv_dat);
6560             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6561                 if ((I32)(rx->lastparen) >= nums[i] &&
6562                     rx->offs[nums[i]].start != -1 &&
6563                     rx->offs[nums[i]].end != -1)
6564                 {
6565                     parno = nums[i];
6566                     break;
6567                 }
6568             }
6569             if (parno || flags & RXapif_ALL) {
6570                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6571             }
6572         }
6573     }
6574
6575     return newRV_noinc(MUTABLE_SV(av));
6576 }
6577
6578 void
6579 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6580                              SV * const sv)
6581 {
6582     struct regexp *const rx = ReANY(r);
6583     char *s = NULL;
6584     I32 i = 0;
6585     I32 s1, t1;
6586     I32 n = paren;
6587
6588     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6589
6590     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6591            || n == RX_BUFF_IDX_CARET_FULLMATCH
6592            || n == RX_BUFF_IDX_CARET_POSTMATCH
6593          )
6594          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6595     )
6596         goto ret_undef;
6597
6598     if (!rx->subbeg)
6599         goto ret_undef;
6600
6601     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6602         /* no need to distinguish between them any more */
6603         n = RX_BUFF_IDX_FULLMATCH;
6604
6605     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6606         && rx->offs[0].start != -1)
6607     {
6608         /* $`, ${^PREMATCH} */
6609         i = rx->offs[0].start;
6610         s = rx->subbeg;
6611     }
6612     else
6613     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6614         && rx->offs[0].end != -1)
6615     {
6616         /* $', ${^POSTMATCH} */
6617         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6618         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6619     }
6620     else
6621     if ( 0 <= n && n <= (I32)rx->nparens &&
6622         (s1 = rx->offs[n].start) != -1 &&
6623         (t1 = rx->offs[n].end) != -1)
6624     {
6625         /* $&, ${^MATCH},  $1 ... */
6626         i = t1 - s1;
6627         s = rx->subbeg + s1 - rx->suboffset;
6628     } else {
6629         goto ret_undef;
6630     }
6631
6632     assert(s >= rx->subbeg);
6633     assert(rx->sublen >= (s - rx->subbeg) + i );
6634     if (i >= 0) {
6635 #if NO_TAINT_SUPPORT
6636         sv_setpvn(sv, s, i);
6637 #else
6638         const int oldtainted = TAINT_get;
6639         TAINT_NOT;
6640         sv_setpvn(sv, s, i);
6641         TAINT_set(oldtainted);
6642 #endif
6643         if ( (rx->extflags & RXf_CANY_SEEN)
6644             ? (RXp_MATCH_UTF8(rx)
6645                         && (!i || is_utf8_string((U8*)s, i)))
6646             : (RXp_MATCH_UTF8(rx)) )
6647         {
6648             SvUTF8_on(sv);
6649         }
6650         else
6651             SvUTF8_off(sv);
6652         if (TAINTING_get) {
6653             if (RXp_MATCH_TAINTED(rx)) {
6654                 if (SvTYPE(sv) >= SVt_PVMG) {
6655                     MAGIC* const mg = SvMAGIC(sv);
6656                     MAGIC* mgt;
6657                     TAINT;
6658                     SvMAGIC_set(sv, mg->mg_moremagic);
6659                     SvTAINT(sv);
6660                     if ((mgt = SvMAGIC(sv))) {
6661                         mg->mg_moremagic = mgt;
6662                         SvMAGIC_set(sv, mg);
6663                     }
6664                 } else {
6665                     TAINT;
6666                     SvTAINT(sv);
6667                 }
6668             } else
6669                 SvTAINTED_off(sv);
6670         }
6671     } else {
6672       ret_undef:
6673         sv_setsv(sv,&PL_sv_undef);
6674         return;
6675     }
6676 }
6677
6678 void
6679 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6680                                                          SV const * const value)
6681 {
6682     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6683
6684     PERL_UNUSED_ARG(rx);
6685     PERL_UNUSED_ARG(paren);
6686     PERL_UNUSED_ARG(value);
6687
6688     if (!PL_localizing)
6689         Perl_croak_no_modify();
6690 }
6691
6692 I32
6693 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6694                               const I32 paren)
6695 {
6696     struct regexp *const rx = ReANY(r);
6697     I32 i;
6698     I32 s1, t1;
6699
6700     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6701
6702     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6703     switch (paren) {
6704       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6705          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6706             goto warn_undef;
6707         /*FALLTHROUGH*/
6708
6709       case RX_BUFF_IDX_PREMATCH:       /* $` */
6710         if (rx->offs[0].start != -1) {
6711                         i = rx->offs[0].start;
6712                         if (i > 0) {
6713                                 s1 = 0;
6714                                 t1 = i;
6715                                 goto getlen;
6716                         }
6717             }
6718         return 0;
6719
6720       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6721          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6722             goto warn_undef;
6723       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6724             if (rx->offs[0].end != -1) {
6725                         i = rx->sublen - rx->offs[0].end;
6726                         if (i > 0) {
6727                                 s1 = rx->offs[0].end;
6728                                 t1 = rx->sublen;
6729                                 goto getlen;
6730                         }
6731             }
6732         return 0;
6733
6734       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6735          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6736             goto warn_undef;
6737         /*FALLTHROUGH*/
6738
6739       /* $& / ${^MATCH}, $1, $2, ... */
6740       default:
6741             if (paren <= (I32)rx->nparens &&
6742             (s1 = rx->offs[paren].start) != -1 &&
6743             (t1 = rx->offs[paren].end) != -1)
6744             {
6745             i = t1 - s1;
6746             goto getlen;
6747         } else {
6748           warn_undef:
6749             if (ckWARN(WARN_UNINITIALIZED))
6750                 report_uninit((const SV *)sv);
6751             return 0;
6752         }
6753     }
6754   getlen:
6755     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6756         const char * const s = rx->subbeg - rx->suboffset + s1;
6757         const U8 *ep;
6758         STRLEN el;
6759
6760         i = t1 - s1;
6761         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6762                         i = el;
6763     }
6764     return i;
6765 }
6766
6767 SV*
6768 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6769 {
6770     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6771         PERL_UNUSED_ARG(rx);
6772         if (0)
6773             return NULL;
6774         else
6775             return newSVpvs("Regexp");
6776 }
6777
6778 /* Scans the name of a named buffer from the pattern.
6779  * If flags is REG_RSN_RETURN_NULL returns null.
6780  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6781  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6782  * to the parsed name as looked up in the RExC_paren_names hash.
6783  * If there is an error throws a vFAIL().. type exception.
6784  */
6785
6786 #define REG_RSN_RETURN_NULL    0
6787 #define REG_RSN_RETURN_NAME    1
6788 #define REG_RSN_RETURN_DATA    2
6789
6790 STATIC SV*
6791 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6792 {
6793     char *name_start = RExC_parse;
6794
6795     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6796
6797     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6798          /* skip IDFIRST by using do...while */
6799         if (UTF)
6800             do {
6801                 RExC_parse += UTF8SKIP(RExC_parse);
6802             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6803         else
6804             do {
6805                 RExC_parse++;
6806             } while (isWORDCHAR(*RExC_parse));
6807     } else {
6808         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6809         vFAIL("Group name must start with a non-digit word character");
6810     }
6811     if ( flags ) {
6812         SV* sv_name
6813             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6814                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6815         if ( flags == REG_RSN_RETURN_NAME)
6816             return sv_name;
6817         else if (flags==REG_RSN_RETURN_DATA) {
6818             HE *he_str = NULL;
6819             SV *sv_dat = NULL;
6820             if ( ! sv_name )      /* should not happen*/
6821                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6822             if (RExC_paren_names)
6823                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6824             if ( he_str )
6825                 sv_dat = HeVAL(he_str);
6826             if ( ! sv_dat )
6827                 vFAIL("Reference to nonexistent named group");
6828             return sv_dat;
6829         }
6830         else {
6831             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6832                        (unsigned long) flags);
6833         }
6834         assert(0); /* NOT REACHED */
6835     }
6836     return NULL;
6837 }
6838
6839 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6840     int rem=(int)(RExC_end - RExC_parse);                       \
6841     int cut;                                                    \
6842     int num;                                                    \
6843     int iscut=0;                                                \
6844     if (rem>10) {                                               \
6845         rem=10;                                                 \
6846         iscut=1;                                                \
6847     }                                                           \
6848     cut=10-rem;                                                 \
6849     if (RExC_lastparse!=RExC_parse)                             \
6850         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6851             rem, RExC_parse,                                    \
6852             cut + 4,                                            \
6853             iscut ? "..." : "<"                                 \
6854         );                                                      \
6855     else                                                        \
6856         PerlIO_printf(Perl_debug_log,"%16s","");                \
6857                                                                 \
6858     if (SIZE_ONLY)                                              \
6859        num = RExC_size + 1;                                     \
6860     else                                                        \
6861        num=REG_NODE_NUM(RExC_emit);                             \
6862     if (RExC_lastnum!=num)                                      \
6863        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6864     else                                                        \
6865        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6866     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6867         (int)((depth*2)), "",                                   \
6868         (funcname)                                              \
6869     );                                                          \
6870     RExC_lastnum=num;                                           \
6871     RExC_lastparse=RExC_parse;                                  \
6872 })
6873
6874
6875
6876 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6877     DEBUG_PARSE_MSG((funcname));                            \
6878     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6879 })
6880 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6881     DEBUG_PARSE_MSG((funcname));                            \
6882     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6883 })
6884
6885 /* This section of code defines the inversion list object and its methods.  The
6886  * interfaces are highly subject to change, so as much as possible is static to
6887  * this file.  An inversion list is here implemented as a malloc'd C UV array
6888  * with some added info that is placed as UVs at the beginning in a header
6889  * portion.  An inversion list for Unicode is an array of code points, sorted
6890  * by ordinal number.  The zeroth element is the first code point in the list.
6891  * The 1th element is the first element beyond that not in the list.  In other
6892  * words, the first range is
6893  *  invlist[0]..(invlist[1]-1)
6894  * The other ranges follow.  Thus every element whose index is divisible by two
6895  * marks the beginning of a range that is in the list, and every element not
6896  * divisible by two marks the beginning of a range not in the list.  A single
6897  * element inversion list that contains the single code point N generally
6898  * consists of two elements
6899  *  invlist[0] == N
6900  *  invlist[1] == N+1
6901  * (The exception is when N is the highest representable value on the
6902  * machine, in which case the list containing just it would be a single
6903  * element, itself.  By extension, if the last range in the list extends to
6904  * infinity, then the first element of that range will be in the inversion list
6905  * at a position that is divisible by two, and is the final element in the
6906  * list.)
6907  * Taking the complement (inverting) an inversion list is quite simple, if the
6908  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6909  * This implementation reserves an element at the beginning of each inversion
6910  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6911  * actual beginning of the list is either that element if 0, or the next one if
6912  * 1.
6913  *
6914  * More about inversion lists can be found in "Unicode Demystified"
6915  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6916  * More will be coming when functionality is added later.
6917  *
6918  * The inversion list data structure is currently implemented as an SV pointing
6919  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6920  * array of UV whose memory management is automatically handled by the existing
6921  * facilities for SV's.
6922  *
6923  * Some of the methods should always be private to the implementation, and some
6924  * should eventually be made public */
6925
6926 /* The header definitions are in F<inline_invlist.c> */
6927 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6928 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6929
6930 #define INVLIST_INITIAL_LEN 10
6931
6932 PERL_STATIC_INLINE UV*
6933 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6934 {
6935     /* Returns a pointer to the first element in the inversion list's array.
6936      * This is called upon initialization of an inversion list.  Where the
6937      * array begins depends on whether the list has the code point U+0000
6938      * in it or not.  The other parameter tells it whether the code that
6939      * follows this call is about to put a 0 in the inversion list or not.
6940      * The first element is either the element with 0, if 0, or the next one,
6941      * if 1 */
6942
6943     UV* zero = get_invlist_zero_addr(invlist);
6944
6945     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6946
6947     /* Must be empty */
6948     assert(! *_get_invlist_len_addr(invlist));
6949
6950     /* 1^1 = 0; 1^0 = 1 */
6951     *zero = 1 ^ will_have_0;
6952     return zero + *zero;
6953 }
6954
6955 PERL_STATIC_INLINE UV*
6956 S_invlist_array(pTHX_ SV* const invlist)
6957 {
6958     /* Returns the pointer to the inversion list's array.  Every time the
6959      * length changes, this needs to be called in case malloc or realloc moved
6960      * it */
6961
6962     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6963
6964     /* Must not be empty.  If these fail, you probably didn't check for <len>
6965      * being non-zero before trying to get the array */
6966     assert(*_get_invlist_len_addr(invlist));
6967     assert(*get_invlist_zero_addr(invlist) == 0
6968            || *get_invlist_zero_addr(invlist) == 1);
6969
6970     /* The array begins either at the element reserved for zero if the
6971      * list contains 0 (that element will be set to 0), or otherwise the next
6972      * element (in which case the reserved element will be set to 1). */
6973     return (UV *) (get_invlist_zero_addr(invlist)
6974                    + *get_invlist_zero_addr(invlist));
6975 }
6976
6977 PERL_STATIC_INLINE void
6978 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6979 {
6980     /* Sets the current number of elements stored in the inversion list */
6981
6982     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6983
6984     *_get_invlist_len_addr(invlist) = len;
6985
6986     assert(len <= SvLEN(invlist));
6987
6988     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6989     /* If the list contains U+0000, that element is part of the header,
6990      * and should not be counted as part of the array.  It will contain
6991      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6992      * subtract:
6993      *  SvCUR_set(invlist,
6994      *            TO_INTERNAL_SIZE(len
6995      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6996      * But, this is only valid if len is not 0.  The consequences of not doing
6997      * this is that the memory allocation code may think that 1 more UV is
6998      * being used than actually is, and so might do an unnecessary grow.  That
6999      * seems worth not bothering to make this the precise amount.
7000      *
7001      * Note that when inverting, SvCUR shouldn't change */
7002 }
7003
7004 PERL_STATIC_INLINE IV*
7005 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7006 {
7007     /* Return the address of the UV that is reserved to hold the cached index
7008      * */
7009
7010     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7011
7012     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7013 }
7014
7015 PERL_STATIC_INLINE IV
7016 S_invlist_previous_index(pTHX_ SV* const invlist)
7017 {
7018     /* Returns cached index of previous search */
7019
7020     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7021
7022     return *get_invlist_previous_index_addr(invlist);
7023 }
7024
7025 PERL_STATIC_INLINE void
7026 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7027 {
7028     /* Caches <index> for later retrieval */
7029
7030     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7031
7032     assert(index == 0 || index < (int) _invlist_len(invlist));
7033
7034     *get_invlist_previous_index_addr(invlist) = index;
7035 }
7036
7037 PERL_STATIC_INLINE UV
7038 S_invlist_max(pTHX_ SV* const invlist)
7039 {
7040     /* Returns the maximum number of elements storable in the inversion list's
7041      * array, without having to realloc() */
7042
7043     PERL_ARGS_ASSERT_INVLIST_MAX;
7044
7045     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7046            ? _invlist_len(invlist)
7047            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7048 }
7049
7050 PERL_STATIC_INLINE UV*
7051 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7052 {
7053     /* Return the address of the UV that is reserved to hold 0 if the inversion
7054      * list contains 0.  This has to be the last element of the heading, as the
7055      * list proper starts with either it if 0, or the next element if not.
7056      * (But we force it to contain either 0 or 1) */
7057
7058     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7059
7060     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7061 }
7062
7063 #ifndef PERL_IN_XSUB_RE
7064 SV*
7065 Perl__new_invlist(pTHX_ IV initial_size)
7066 {
7067
7068     /* Return a pointer to a newly constructed inversion list, with enough
7069      * space to store 'initial_size' elements.  If that number is negative, a
7070      * system default is used instead */
7071
7072     SV* new_list;
7073
7074     if (initial_size < 0) {
7075         initial_size = INVLIST_INITIAL_LEN;
7076     }
7077
7078     /* Allocate the initial space */
7079     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7080     invlist_set_len(new_list, 0);
7081
7082     /* Force iterinit() to be used to get iteration to work */
7083     *get_invlist_iter_addr(new_list) = UV_MAX;
7084
7085     /* This should force a segfault if a method doesn't initialize this
7086      * properly */
7087     *get_invlist_zero_addr(new_list) = UV_MAX;
7088
7089     *get_invlist_previous_index_addr(new_list) = 0;
7090     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7091 #if HEADER_LENGTH != 5
7092 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7093 #endif
7094
7095     return new_list;
7096 }
7097 #endif
7098
7099 STATIC SV*
7100 S__new_invlist_C_array(pTHX_ UV* list)
7101 {
7102     /* Return a pointer to a newly constructed inversion list, initialized to
7103      * point to <list>, which has to be in the exact correct inversion list
7104      * form, including internal fields.  Thus this is a dangerous routine that
7105      * should not be used in the wrong hands */
7106
7107     SV* invlist = newSV_type(SVt_PV);
7108
7109     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7110
7111     SvPV_set(invlist, (char *) list);
7112     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7113                                shouldn't touch it */
7114     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7115
7116     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7117         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7118     }
7119
7120     /* Initialize the iteration pointer.
7121      * XXX This could be done at compile time in charclass_invlists.h, but I
7122      * (khw) am not confident that the suffixes for specifying the C constant
7123      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7124      * to use 64 bits; might need a Configure probe */
7125     invlist_iterfinish(invlist);
7126
7127     return invlist;
7128 }
7129
7130 STATIC void
7131 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7132 {
7133     /* Grow the maximum size of an inversion list */
7134
7135     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7136
7137     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7138 }
7139
7140 PERL_STATIC_INLINE void
7141 S_invlist_trim(pTHX_ SV* const invlist)
7142 {
7143     PERL_ARGS_ASSERT_INVLIST_TRIM;
7144
7145     /* Change the length of the inversion list to how many entries it currently
7146      * has */
7147
7148     SvPV_shrink_to_cur((SV *) invlist);
7149 }
7150
7151 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7152
7153 STATIC void
7154 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7155 {
7156    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7157     * the end of the inversion list.  The range must be above any existing
7158     * ones. */
7159
7160     UV* array;
7161     UV max = invlist_max(invlist);
7162     UV len = _invlist_len(invlist);
7163
7164     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7165
7166     if (len == 0) { /* Empty lists must be initialized */
7167         array = _invlist_array_init(invlist, start == 0);
7168     }
7169     else {
7170         /* Here, the existing list is non-empty. The current max entry in the
7171          * list is generally the first value not in the set, except when the
7172          * set extends to the end of permissible values, in which case it is
7173          * the first entry in that final set, and so this call is an attempt to
7174          * append out-of-order */
7175
7176         UV final_element = len - 1;
7177         array = invlist_array(invlist);
7178         if (array[final_element] > start
7179             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7180         {
7181             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7182                        array[final_element], start,
7183                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7184         }
7185
7186         /* Here, it is a legal append.  If the new range begins with the first
7187          * value not in the set, it is extending the set, so the new first
7188          * value not in the set is one greater than the newly extended range.
7189          * */
7190         if (array[final_element] == start) {
7191             if (end != UV_MAX) {
7192                 array[final_element] = end + 1;
7193             }
7194             else {
7195                 /* But if the end is the maximum representable on the machine,
7196                  * just let the range that this would extend to have no end */
7197                 invlist_set_len(invlist, len - 1);
7198             }
7199             return;
7200         }
7201     }
7202
7203     /* Here the new range doesn't extend any existing set.  Add it */
7204
7205     len += 2;   /* Includes an element each for the start and end of range */
7206
7207     /* If overflows the existing space, extend, which may cause the array to be
7208      * moved */
7209     if (max < len) {
7210         invlist_extend(invlist, len);
7211         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7212                                            failure in invlist_array() */
7213         array = invlist_array(invlist);
7214     }
7215     else {
7216         invlist_set_len(invlist, len);
7217     }
7218
7219     /* The next item on the list starts the range, the one after that is
7220      * one past the new range.  */
7221     array[len - 2] = start;
7222     if (end != UV_MAX) {
7223         array[len - 1] = end + 1;
7224     }
7225     else {
7226         /* But if the end is the maximum representable on the machine, just let
7227          * the range have no end */
7228         invlist_set_len(invlist, len - 1);
7229     }
7230 }
7231
7232 #ifndef PERL_IN_XSUB_RE
7233
7234 IV
7235 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7236 {
7237     /* Searches the inversion list for the entry that contains the input code
7238      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7239      * return value is the index into the list's array of the range that
7240      * contains <cp> */
7241
7242     IV low = 0;
7243     IV mid;
7244     IV high = _invlist_len(invlist);
7245     const IV highest_element = high - 1;
7246     const UV* array;
7247
7248     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7249
7250     /* If list is empty, return failure. */
7251     if (high == 0) {
7252         return -1;
7253     }
7254
7255     /* (We can't get the array unless we know the list is non-empty) */
7256     array = invlist_array(invlist);
7257
7258     mid = invlist_previous_index(invlist);
7259     assert(mid >=0 && mid <= highest_element);
7260
7261     /* <mid> contains the cache of the result of the previous call to this
7262      * function (0 the first time).  See if this call is for the same result,
7263      * or if it is for mid-1.  This is under the theory that calls to this
7264      * function will often be for related code points that are near each other.
7265      * And benchmarks show that caching gives better results.  We also test
7266      * here if the code point is within the bounds of the list.  These tests
7267      * replace others that would have had to be made anyway to make sure that
7268      * the array bounds were not exceeded, and these give us extra information
7269      * at the same time */
7270     if (cp >= array[mid]) {
7271         if (cp >= array[highest_element]) {
7272             return highest_element;
7273         }
7274
7275         /* Here, array[mid] <= cp < array[highest_element].  This means that
7276          * the final element is not the answer, so can exclude it; it also
7277          * means that <mid> is not the final element, so can refer to 'mid + 1'
7278          * safely */
7279         if (cp < array[mid + 1]) {
7280             return mid;
7281         }
7282         high--;
7283         low = mid + 1;
7284     }
7285     else { /* cp < aray[mid] */
7286         if (cp < array[0]) { /* Fail if outside the array */
7287             return -1;
7288         }
7289         high = mid;
7290         if (cp >= array[mid - 1]) {
7291             goto found_entry;
7292         }
7293     }
7294
7295     /* Binary search.  What we are looking for is <i> such that
7296      *  array[i] <= cp < array[i+1]
7297      * The loop below converges on the i+1.  Note that there may not be an
7298      * (i+1)th element in the array, and things work nonetheless */
7299     while (low < high) {
7300         mid = (low + high) / 2;
7301         assert(mid <= highest_element);
7302         if (array[mid] <= cp) { /* cp >= array[mid] */
7303             low = mid + 1;
7304
7305             /* We could do this extra test to exit the loop early.
7306             if (cp < array[low]) {
7307                 return mid;
7308             }
7309             */
7310         }
7311         else { /* cp < array[mid] */
7312             high = mid;
7313         }
7314     }
7315
7316   found_entry:
7317     high--;
7318     invlist_set_previous_index(invlist, high);
7319     return high;
7320 }
7321
7322 void
7323 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7324 {
7325     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7326      * but is used when the swash has an inversion list.  This makes this much
7327      * faster, as it uses a binary search instead of a linear one.  This is
7328      * intimately tied to that function, and perhaps should be in utf8.c,
7329      * except it is intimately tied to inversion lists as well.  It assumes
7330      * that <swatch> is all 0's on input */
7331
7332     UV current = start;
7333     const IV len = _invlist_len(invlist);
7334     IV i;
7335     const UV * array;
7336
7337     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7338
7339     if (len == 0) { /* Empty inversion list */
7340         return;
7341     }
7342
7343     array = invlist_array(invlist);
7344
7345     /* Find which element it is */
7346     i = _invlist_search(invlist, start);
7347
7348     /* We populate from <start> to <end> */
7349     while (current < end) {
7350         UV upper;
7351
7352         /* The inversion list gives the results for every possible code point
7353          * after the first one in the list.  Only those ranges whose index is
7354          * even are ones that the inversion list matches.  For the odd ones,
7355          * and if the initial code point is not in the list, we have to skip
7356          * forward to the next element */
7357         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7358             i++;
7359             if (i >= len) { /* Finished if beyond the end of the array */
7360                 return;
7361             }
7362             current = array[i];
7363             if (current >= end) {   /* Finished if beyond the end of what we
7364                                        are populating */
7365                 if (LIKELY(end < UV_MAX)) {
7366                     return;
7367                 }
7368
7369                 /* We get here when the upper bound is the maximum
7370                  * representable on the machine, and we are looking for just
7371                  * that code point.  Have to special case it */
7372                 i = len;
7373                 goto join_end_of_list;
7374             }
7375         }
7376         assert(current >= start);
7377
7378         /* The current range ends one below the next one, except don't go past
7379          * <end> */
7380         i++;
7381         upper = (i < len && array[i] < end) ? array[i] : end;
7382
7383         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7384          * for each code point in it */
7385         for (; current < upper; current++) {
7386             const STRLEN offset = (STRLEN)(current - start);
7387             swatch[offset >> 3] |= 1 << (offset & 7);
7388         }
7389
7390     join_end_of_list:
7391
7392         /* Quit if at the end of the list */
7393         if (i >= len) {
7394
7395             /* But first, have to deal with the highest possible code point on
7396              * the platform.  The previous code assumes that <end> is one
7397              * beyond where we want to populate, but that is impossible at the
7398              * platform's infinity, so have to handle it specially */
7399             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7400             {
7401                 const STRLEN offset = (STRLEN)(end - start);
7402                 swatch[offset >> 3] |= 1 << (offset & 7);
7403             }
7404             return;
7405         }
7406
7407         /* Advance to the next range, which will be for code points not in the
7408          * inversion list */
7409         current = array[i];
7410     }
7411
7412     return;
7413 }
7414
7415 void
7416 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7417 {
7418     /* Take the union of two inversion lists and point <output> to it.  *output
7419      * should be defined upon input, and if it points to one of the two lists,
7420      * the reference count to that list will be decremented.  The first list,
7421      * <a>, may be NULL, in which case a copy of the second list is returned.
7422      * If <complement_b> is TRUE, the union is taken of the complement
7423      * (inversion) of <b> instead of b itself.
7424      *
7425      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7426      * Richard Gillam, published by Addison-Wesley, and explained at some
7427      * length there.  The preface says to incorporate its examples into your
7428      * code at your own risk.
7429      *
7430      * The algorithm is like a merge sort.
7431      *
7432      * XXX A potential performance improvement is to keep track as we go along
7433      * if only one of the inputs contributes to the result, meaning the other
7434      * is a subset of that one.  In that case, we can skip the final copy and
7435      * return the larger of the input lists, but then outside code might need
7436      * to keep track of whether to free the input list or not */
7437
7438     UV* array_a;    /* a's array */
7439     UV* array_b;
7440     UV len_a;       /* length of a's array */
7441     UV len_b;
7442
7443     SV* u;                      /* the resulting union */
7444     UV* array_u;
7445     UV len_u;
7446
7447     UV i_a = 0;             /* current index into a's array */
7448     UV i_b = 0;
7449     UV i_u = 0;
7450
7451     /* running count, as explained in the algorithm source book; items are
7452      * stopped accumulating and are output when the count changes to/from 0.
7453      * The count is incremented when we start a range that's in the set, and
7454      * decremented when we start a range that's not in the set.  So its range
7455      * is 0 to 2.  Only when the count is zero is something not in the set.
7456      */
7457     UV count = 0;
7458
7459     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7460     assert(a != b);
7461
7462     /* If either one is empty, the union is the other one */
7463     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7464         if (*output == a) {
7465             if (a != NULL) {
7466                 SvREFCNT_dec_NN(a);
7467             }
7468         }
7469         if (*output != b) {
7470             *output = invlist_clone(b);
7471             if (complement_b) {
7472                 _invlist_invert(*output);
7473             }
7474         } /* else *output already = b; */
7475         return;
7476     }
7477     else if ((len_b = _invlist_len(b)) == 0) {
7478         if (*output == b) {
7479             SvREFCNT_dec_NN(b);
7480         }
7481
7482         /* The complement of an empty list is a list that has everything in it,
7483          * so the union with <a> includes everything too */
7484         if (complement_b) {
7485             if (a == *output) {
7486                 SvREFCNT_dec_NN(a);
7487             }
7488             *output = _new_invlist(1);
7489             _append_range_to_invlist(*output, 0, UV_MAX);
7490         }
7491         else if (*output != a) {
7492             *output = invlist_clone(a);
7493         }
7494         /* else *output already = a; */
7495         return;
7496     }
7497
7498     /* Here both lists exist and are non-empty */
7499     array_a = invlist_array(a);
7500     array_b = invlist_array(b);
7501
7502     /* If are to take the union of 'a' with the complement of b, set it
7503      * up so are looking at b's complement. */
7504     if (complement_b) {
7505
7506         /* To complement, we invert: if the first element is 0, remove it.  To
7507          * do this, we just pretend the array starts one later, and clear the
7508          * flag as we don't have to do anything else later */
7509         if (array_b[0] == 0) {
7510             array_b++;
7511             len_b--;
7512             complement_b = FALSE;
7513         }
7514         else {
7515
7516             /* But if the first element is not zero, we unshift a 0 before the
7517              * array.  The data structure reserves a space for that 0 (which
7518              * should be a '1' right now), so physical shifting is unneeded,
7519              * but temporarily change that element to 0.  Before exiting the
7520              * routine, we must restore the element to '1' */
7521             array_b--;
7522             len_b++;
7523             array_b[0] = 0;
7524         }
7525     }
7526
7527     /* Size the union for the worst case: that the sets are completely
7528      * disjoint */
7529     u = _new_invlist(len_a + len_b);
7530
7531     /* Will contain U+0000 if either component does */
7532     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7533                                       || (len_b > 0 && array_b[0] == 0));
7534
7535     /* Go through each list item by item, stopping when exhausted one of
7536      * them */
7537     while (i_a < len_a && i_b < len_b) {
7538         UV cp;      /* The element to potentially add to the union's array */
7539         bool cp_in_set;   /* is it in the the input list's set or not */
7540
7541         /* We need to take one or the other of the two inputs for the union.
7542          * Since we are merging two sorted lists, we take the smaller of the
7543          * next items.  In case of a tie, we take the one that is in its set
7544          * first.  If we took one not in the set first, it would decrement the
7545          * count, possibly to 0 which would cause it to be output as ending the
7546          * range, and the next time through we would take the same number, and
7547          * output it again as beginning the next range.  By doing it the
7548          * opposite way, there is no possibility that the count will be
7549          * momentarily decremented to 0, and thus the two adjoining ranges will
7550          * be seamlessly merged.  (In a tie and both are in the set or both not
7551          * in the set, it doesn't matter which we take first.) */
7552         if (array_a[i_a] < array_b[i_b]
7553             || (array_a[i_a] == array_b[i_b]
7554                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7555         {
7556             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7557             cp= array_a[i_a++];
7558         }
7559         else {
7560             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7561             cp= array_b[i_b++];
7562         }
7563
7564         /* Here, have chosen which of the two inputs to look at.  Only output
7565          * if the running count changes to/from 0, which marks the
7566          * beginning/end of a range in that's in the set */
7567         if (cp_in_set) {
7568             if (count == 0) {
7569                 array_u[i_u++] = cp;
7570             }
7571             count++;
7572         }
7573         else {
7574             count--;
7575             if (count == 0) {
7576                 array_u[i_u++] = cp;
7577             }
7578         }
7579     }
7580
7581     /* Here, we are finished going through at least one of the lists, which
7582      * means there is something remaining in at most one.  We check if the list
7583      * that hasn't been exhausted is positioned such that we are in the middle
7584      * of a range in its set or not.  (i_a and i_b point to the element beyond
7585      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7586      * is potentially more to output.
7587      * There are four cases:
7588      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7589      *     in the union is entirely from the non-exhausted set.
7590      *  2) Both were in their sets, count is 2.  Nothing further should
7591      *     be output, as everything that remains will be in the exhausted
7592      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7593      *     that
7594      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7595      *     Nothing further should be output because the union includes
7596      *     everything from the exhausted set.  Not decrementing ensures that.
7597      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7598      *     decrementing to 0 insures that we look at the remainder of the
7599      *     non-exhausted set */
7600     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7601         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7602     {
7603         count--;
7604     }
7605
7606     /* The final length is what we've output so far, plus what else is about to
7607      * be output.  (If 'count' is non-zero, then the input list we exhausted
7608      * has everything remaining up to the machine's limit in its set, and hence
7609      * in the union, so there will be no further output. */
7610     len_u = i_u;
7611     if (count == 0) {
7612         /* At most one of the subexpressions will be non-zero */
7613         len_u += (len_a - i_a) + (len_b - i_b);
7614     }
7615
7616     /* Set result to final length, which can change the pointer to array_u, so
7617      * re-find it */
7618     if (len_u != _invlist_len(u)) {
7619         invlist_set_len(u, len_u);
7620         invlist_trim(u);
7621         array_u = invlist_array(u);
7622     }
7623
7624     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7625      * the other) ended with everything above it not in its set.  That means
7626      * that the remaining part of the union is precisely the same as the
7627      * non-exhausted list, so can just copy it unchanged.  (If both list were
7628      * exhausted at the same time, then the operations below will be both 0.)
7629      */
7630     if (count == 0) {
7631         IV copy_count; /* At most one will have a non-zero copy count */
7632         if ((copy_count = len_a - i_a) > 0) {
7633             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7634         }
7635         else if ((copy_count = len_b - i_b) > 0) {
7636             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7637         }
7638     }
7639
7640     /*  We may be removing a reference to one of the inputs */
7641     if (a == *output || b == *output) {
7642         assert(! invlist_is_iterating(*output));
7643         SvREFCNT_dec_NN(*output);
7644     }
7645
7646     /* If we've changed b, restore it */
7647     if (complement_b) {
7648         array_b[0] = 1;
7649     }
7650
7651     *output = u;
7652     return;
7653 }
7654
7655 void
7656 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7657 {
7658     /* Take the intersection of two inversion lists and point <i> to it.  *i
7659      * should be defined upon input, and if it points to one of the two lists,
7660      * the reference count to that list will be decremented.
7661      * If <complement_b> is TRUE, the result will be the intersection of <a>
7662      * and the complement (or inversion) of <b> instead of <b> directly.
7663      *
7664      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7665      * Richard Gillam, published by Addison-Wesley, and explained at some
7666      * length there.  The preface says to incorporate its examples into your
7667      * code at your own risk.  In fact, it had bugs
7668      *
7669      * The algorithm is like a merge sort, and is essentially the same as the
7670      * union above
7671      */
7672
7673     UV* array_a;                /* a's array */
7674     UV* array_b;
7675     UV len_a;   /* length of a's array */
7676     UV len_b;
7677
7678     SV* r;                   /* the resulting intersection */
7679     UV* array_r;
7680     UV len_r;
7681
7682     UV i_a = 0;             /* current index into a's array */
7683     UV i_b = 0;
7684     UV i_r = 0;
7685
7686     /* running count, as explained in the algorithm source book; items are
7687      * stopped accumulating and are output when the count changes to/from 2.
7688      * The count is incremented when we start a range that's in the set, and
7689      * decremented when we start a range that's not in the set.  So its range
7690      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7691      */
7692     UV count = 0;
7693
7694     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7695     assert(a != b);
7696
7697     /* Special case if either one is empty */
7698     len_a = _invlist_len(a);
7699     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7700
7701         if (len_a != 0 && complement_b) {
7702
7703             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7704              * be empty.  Here, also we are using 'b's complement, which hence
7705              * must be every possible code point.  Thus the intersection is
7706              * simply 'a'. */
7707             if (*i != a) {
7708                 *i = invlist_clone(a);
7709
7710                 if (*i == b) {
7711                     SvREFCNT_dec_NN(b);
7712                 }
7713             }
7714             /* else *i is already 'a' */
7715             return;
7716         }
7717
7718         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7719          * intersection must be empty */
7720         if (*i == a) {
7721             SvREFCNT_dec_NN(a);
7722         }
7723         else if (*i == b) {
7724             SvREFCNT_dec_NN(b);
7725         }
7726         *i = _new_invlist(0);
7727         return;
7728     }
7729
7730     /* Here both lists exist and are non-empty */
7731     array_a = invlist_array(a);
7732     array_b = invlist_array(b);
7733
7734     /* If are to take the intersection of 'a' with the complement of b, set it
7735      * up so are looking at b's complement. */
7736     if (complement_b) {
7737
7738         /* To complement, we invert: if the first element is 0, remove it.  To
7739          * do this, we just pretend the array starts one later, and clear the
7740          * flag as we don't have to do anything else later */
7741         if (array_b[0] == 0) {
7742             array_b++;
7743             len_b--;
7744             complement_b = FALSE;
7745         }
7746         else {
7747
7748             /* But if the first element is not zero, we unshift a 0 before the
7749              * array.  The data structure reserves a space for that 0 (which
7750              * should be a '1' right now), so physical shifting is unneeded,
7751              * but temporarily change that element to 0.  Before exiting the
7752              * routine, we must restore the element to '1' */
7753             array_b--;
7754             len_b++;
7755             array_b[0] = 0;
7756         }
7757     }
7758
7759     /* Size the intersection for the worst case: that the intersection ends up
7760      * fragmenting everything to be completely disjoint */
7761     r= _new_invlist(len_a + len_b);
7762
7763     /* Will contain U+0000 iff both components do */
7764     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7765                                      && len_b > 0 && array_b[0] == 0);
7766
7767     /* Go through each list item by item, stopping when exhausted one of
7768      * them */
7769     while (i_a < len_a && i_b < len_b) {
7770         UV cp;      /* The element to potentially add to the intersection's
7771                        array */
7772         bool cp_in_set; /* Is it in the input list's set or not */
7773
7774         /* We need to take one or the other of the two inputs for the
7775          * intersection.  Since we are merging two sorted lists, we take the
7776          * smaller of the next items.  In case of a tie, we take the one that
7777          * is not in its set first (a difference from the union algorithm).  If
7778          * we took one in the set first, it would increment the count, possibly
7779          * to 2 which would cause it to be output as starting a range in the
7780          * intersection, and the next time through we would take that same
7781          * number, and output it again as ending the set.  By doing it the
7782          * opposite of this, there is no possibility that the count will be
7783          * momentarily incremented to 2.  (In a tie and both are in the set or
7784          * both not in the set, it doesn't matter which we take first.) */
7785         if (array_a[i_a] < array_b[i_b]
7786             || (array_a[i_a] == array_b[i_b]
7787                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7788         {
7789             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7790             cp= array_a[i_a++];
7791         }
7792         else {
7793             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7794             cp= array_b[i_b++];
7795         }
7796
7797         /* Here, have chosen which of the two inputs to look at.  Only output
7798          * if the running count changes to/from 2, which marks the
7799          * beginning/end of a range that's in the intersection */
7800         if (cp_in_set) {
7801             count++;
7802             if (count == 2) {
7803                 array_r[i_r++] = cp;
7804             }
7805         }
7806         else {
7807             if (count == 2) {
7808                 array_r[i_r++] = cp;
7809             }
7810             count--;
7811         }
7812     }
7813
7814     /* Here, we are finished going through at least one of the lists, which
7815      * means there is something remaining in at most one.  We check if the list
7816      * that has been exhausted is positioned such that we are in the middle
7817      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7818      * the ones we care about.)  There are four cases:
7819      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7820      *     nothing left in the intersection.
7821      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7822      *     above 2.  What should be output is exactly that which is in the
7823      *     non-exhausted set, as everything it has is also in the intersection
7824      *     set, and everything it doesn't have can't be in the intersection
7825      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7826      *     gets incremented to 2.  Like the previous case, the intersection is
7827      *     everything that remains in the non-exhausted set.
7828      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7829      *     remains 1.  And the intersection has nothing more. */
7830     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7831         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7832     {
7833         count++;
7834     }
7835
7836     /* The final length is what we've output so far plus what else is in the
7837      * intersection.  At most one of the subexpressions below will be non-zero */
7838     len_r = i_r;
7839     if (count >= 2) {
7840         len_r += (len_a - i_a) + (len_b - i_b);
7841     }
7842
7843     /* Set result to final length, which can change the pointer to array_r, so
7844      * re-find it */
7845     if (len_r != _invlist_len(r)) {
7846         invlist_set_len(r, len_r);
7847         invlist_trim(r);
7848         array_r = invlist_array(r);
7849     }
7850
7851     /* Finish outputting any remaining */
7852     if (count >= 2) { /* At most one will have a non-zero copy count */
7853         IV copy_count;
7854         if ((copy_count = len_a - i_a) > 0) {
7855             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7856         }
7857         else if ((copy_count = len_b - i_b) > 0) {
7858             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7859         }
7860     }
7861
7862     /*  We may be removing a reference to one of the inputs */
7863     if (a == *i || b == *i) {
7864         assert(! invlist_is_iterating(*i));
7865         SvREFCNT_dec_NN(*i);
7866     }
7867
7868     /* If we've changed b, restore it */
7869     if (complement_b) {
7870         array_b[0] = 1;
7871     }
7872
7873     *i = r;
7874     return;
7875 }
7876
7877 SV*
7878 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7879 {
7880     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7881      * set.  A pointer to the inversion list is returned.  This may actually be
7882      * a new list, in which case the passed in one has been destroyed.  The
7883      * passed in inversion list can be NULL, in which case a new one is created
7884      * with just the one range in it */
7885
7886     SV* range_invlist;
7887     UV len;
7888
7889     if (invlist == NULL) {
7890         invlist = _new_invlist(2);
7891         len = 0;
7892     }
7893     else {
7894         len = _invlist_len(invlist);
7895     }
7896
7897     /* If comes after the final entry, can just append it to the end */
7898     if (len == 0
7899         || start >= invlist_array(invlist)
7900                                     [_invlist_len(invlist) - 1])
7901     {
7902         _append_range_to_invlist(invlist, start, end);
7903         return invlist;
7904     }
7905
7906     /* Here, can't just append things, create and return a new inversion list
7907      * which is the union of this range and the existing inversion list */
7908     range_invlist = _new_invlist(2);
7909     _append_range_to_invlist(range_invlist, start, end);
7910
7911     _invlist_union(invlist, range_invlist, &invlist);
7912
7913     /* The temporary can be freed */
7914     SvREFCNT_dec_NN(range_invlist);
7915
7916     return invlist;
7917 }
7918
7919 #endif
7920
7921 PERL_STATIC_INLINE SV*
7922 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7923     return _add_range_to_invlist(invlist, cp, cp);
7924 }
7925
7926 #ifndef PERL_IN_XSUB_RE
7927 void
7928 Perl__invlist_invert(pTHX_ SV* const invlist)
7929 {
7930     /* Complement the input inversion list.  This adds a 0 if the list didn't
7931      * have a zero; removes it otherwise.  As described above, the data
7932      * structure is set up so that this is very efficient */
7933
7934     UV* len_pos = _get_invlist_len_addr(invlist);
7935
7936     PERL_ARGS_ASSERT__INVLIST_INVERT;
7937
7938     assert(! invlist_is_iterating(invlist));
7939
7940     /* The inverse of matching nothing is matching everything */
7941     if (*len_pos == 0) {
7942         _append_range_to_invlist(invlist, 0, UV_MAX);
7943         return;
7944     }
7945
7946     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7947      * zero element was a 0, so it is being removed, so the length decrements
7948      * by 1; and vice-versa.  SvCUR is unaffected */
7949     if (*get_invlist_zero_addr(invlist) ^= 1) {
7950         (*len_pos)--;
7951     }
7952     else {
7953         (*len_pos)++;
7954     }
7955 }
7956
7957 void
7958 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7959 {
7960     /* Complement the input inversion list (which must be a Unicode property,
7961      * all of which don't match above the Unicode maximum code point.)  And
7962      * Perl has chosen to not have the inversion match above that either.  This
7963      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7964      */
7965
7966     UV len;
7967     UV* array;
7968
7969     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7970
7971     _invlist_invert(invlist);
7972
7973     len = _invlist_len(invlist);
7974
7975     if (len != 0) { /* If empty do nothing */
7976         array = invlist_array(invlist);
7977         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7978             /* Add 0x110000.  First, grow if necessary */
7979             len++;
7980             if (invlist_max(invlist) < len) {
7981                 invlist_extend(invlist, len);
7982                 array = invlist_array(invlist);
7983             }
7984             invlist_set_len(invlist, len);
7985             array[len - 1] = PERL_UNICODE_MAX + 1;
7986         }
7987         else {  /* Remove the 0x110000 */
7988             invlist_set_len(invlist, len - 1);
7989         }
7990     }
7991
7992     return;
7993 }
7994 #endif
7995
7996 PERL_STATIC_INLINE SV*
7997 S_invlist_clone(pTHX_ SV* const invlist)
7998 {
7999
8000     /* Return a new inversion list that is a copy of the input one, which is
8001      * unchanged */
8002
8003     /* Need to allocate extra space to accommodate Perl's addition of a
8004      * trailing NUL to SvPV's, since it thinks they are always strings */
8005     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8006     STRLEN length = SvCUR(invlist);
8007
8008     PERL_ARGS_ASSERT_INVLIST_CLONE;
8009
8010     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8011     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8012
8013     return new_invlist;
8014 }
8015
8016 PERL_STATIC_INLINE UV*
8017 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8018 {
8019     /* Return the address of the UV that contains the current iteration
8020      * position */
8021
8022     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8023
8024     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8025 }
8026
8027 PERL_STATIC_INLINE UV*
8028 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8029 {
8030     /* Return the address of the UV that contains the version id. */
8031
8032     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8033
8034     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8035 }
8036
8037 PERL_STATIC_INLINE void
8038 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8039 {
8040     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8041
8042     *get_invlist_iter_addr(invlist) = 0;
8043 }
8044
8045 PERL_STATIC_INLINE void
8046 S_invlist_iterfinish(pTHX_ SV* invlist)
8047 {
8048     /* Terminate iterator for invlist.  This is to catch development errors.
8049      * Any iteration that is interrupted before completed should call this
8050      * function.  Functions that add code points anywhere else but to the end
8051      * of an inversion list assert that they are not in the middle of an
8052      * iteration.  If they were, the addition would make the iteration
8053      * problematical: if the iteration hadn't reached the place where things
8054      * were being added, it would be ok */
8055
8056     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8057
8058     *get_invlist_iter_addr(invlist) = UV_MAX;
8059 }
8060
8061 STATIC bool
8062 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8063 {
8064     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8065      * This call sets in <*start> and <*end>, the next range in <invlist>.
8066      * Returns <TRUE> if successful and the next call will return the next
8067      * range; <FALSE> if was already at the end of the list.  If the latter,
8068      * <*start> and <*end> are unchanged, and the next call to this function
8069      * will start over at the beginning of the list */
8070
8071     UV* pos = get_invlist_iter_addr(invlist);
8072     UV len = _invlist_len(invlist);
8073     UV *array;
8074
8075     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8076
8077     if (*pos >= len) {
8078         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8079         return FALSE;
8080     }
8081
8082     array = invlist_array(invlist);
8083
8084     *start = array[(*pos)++];
8085
8086     if (*pos >= len) {
8087         *end = UV_MAX;
8088     }
8089     else {
8090         *end = array[(*pos)++] - 1;
8091     }
8092
8093     return TRUE;
8094 }
8095
8096 PERL_STATIC_INLINE bool
8097 S_invlist_is_iterating(pTHX_ SV* const invlist)
8098 {
8099     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8100
8101     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8102 }
8103
8104 PERL_STATIC_INLINE UV
8105 S_invlist_highest(pTHX_ SV* const invlist)
8106 {
8107     /* Returns the highest code point that matches an inversion list.  This API
8108      * has an ambiguity, as it returns 0 under either the highest is actually
8109      * 0, or if the list is empty.  If this distinction matters to you, check
8110      * for emptiness before calling this function */
8111
8112     UV len = _invlist_len(invlist);
8113     UV *array;
8114
8115     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8116
8117     if (len == 0) {
8118         return 0;
8119     }
8120
8121     array = invlist_array(invlist);
8122
8123     /* The last element in the array in the inversion list always starts a
8124      * range that goes to infinity.  That range may be for code points that are
8125      * matched in the inversion list, or it may be for ones that aren't
8126      * matched.  In the latter case, the highest code point in the set is one
8127      * less than the beginning of this range; otherwise it is the final element
8128      * of this range: infinity */
8129     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8130            ? UV_MAX
8131            : array[len - 1] - 1;
8132 }
8133
8134 #ifndef PERL_IN_XSUB_RE
8135 SV *
8136 Perl__invlist_contents(pTHX_ SV* const invlist)
8137 {
8138     /* Get the contents of an inversion list into a string SV so that they can
8139      * be printed out.  It uses the format traditionally done for debug tracing
8140      */
8141
8142     UV start, end;
8143     SV* output = newSVpvs("\n");
8144
8145     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8146
8147     assert(! invlist_is_iterating(invlist));
8148
8149     invlist_iterinit(invlist);
8150     while (invlist_iternext(invlist, &start, &end)) {
8151         if (end == UV_MAX) {
8152             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8153         }
8154         else if (end != start) {
8155             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8156                     start,       end);
8157         }
8158         else {
8159             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8160         }
8161     }
8162
8163     return output;
8164 }
8165 #endif
8166
8167 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8168 void
8169 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8170 {
8171     /* Dumps out the ranges in an inversion list.  The string 'header'
8172      * if present is output on a line before the first range */
8173
8174     UV start, end;
8175
8176     PERL_ARGS_ASSERT__INVLIST_DUMP;
8177
8178     if (header && strlen(header)) {
8179         PerlIO_printf(Perl_debug_log, "%s\n", header);
8180     }
8181     if (invlist_is_iterating(invlist)) {
8182         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8183         return;
8184     }
8185
8186     invlist_iterinit(invlist);
8187     while (invlist_iternext(invlist, &start, &end)) {
8188         if (end == UV_MAX) {
8189             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8190         }
8191         else if (end != start) {
8192             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8193                                                  start,         end);
8194         }
8195         else {
8196             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8197         }
8198     }
8199 }
8200 #endif
8201
8202 #if 0
8203 bool
8204 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8205 {
8206     /* Return a boolean as to if the two passed in inversion lists are
8207      * identical.  The final argument, if TRUE, says to take the complement of
8208      * the second inversion list before doing the comparison */
8209
8210     UV* array_a = invlist_array(a);
8211     UV* array_b = invlist_array(b);
8212     UV len_a = _invlist_len(a);
8213     UV len_b = _invlist_len(b);
8214
8215     UV i = 0;               /* current index into the arrays */
8216     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8217
8218     PERL_ARGS_ASSERT__INVLISTEQ;
8219
8220     /* If are to compare 'a' with the complement of b, set it
8221      * up so are looking at b's complement. */
8222     if (complement_b) {
8223
8224         /* The complement of nothing is everything, so <a> would have to have
8225          * just one element, starting at zero (ending at infinity) */
8226         if (len_b == 0) {
8227             return (len_a == 1 && array_a[0] == 0);
8228         }
8229         else if (array_b[0] == 0) {
8230
8231             /* Otherwise, to complement, we invert.  Here, the first element is
8232              * 0, just remove it.  To do this, we just pretend the array starts
8233              * one later, and clear the flag as we don't have to do anything
8234              * else later */
8235
8236             array_b++;
8237             len_b--;
8238             complement_b = FALSE;
8239         }
8240         else {
8241
8242             /* But if the first element is not zero, we unshift a 0 before the
8243              * array.  The data structure reserves a space for that 0 (which
8244              * should be a '1' right now), so physical shifting is unneeded,
8245              * but temporarily change that element to 0.  Before exiting the
8246              * routine, we must restore the element to '1' */
8247             array_b--;
8248             len_b++;
8249             array_b[0] = 0;
8250         }
8251     }
8252
8253     /* Make sure that the lengths are the same, as well as the final element
8254      * before looping through the remainder.  (Thus we test the length, final,
8255      * and first elements right off the bat) */
8256     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8257         retval = FALSE;
8258     }
8259     else for (i = 0; i < len_a - 1; i++) {
8260         if (array_a[i] != array_b[i]) {
8261             retval = FALSE;
8262             break;
8263         }
8264     }
8265
8266     if (complement_b) {
8267         array_b[0] = 1;
8268     }
8269     return retval;
8270 }
8271 #endif
8272
8273 #undef HEADER_LENGTH
8274 #undef INVLIST_INITIAL_LENGTH
8275 #undef TO_INTERNAL_SIZE
8276 #undef FROM_INTERNAL_SIZE
8277 #undef INVLIST_LEN_OFFSET
8278 #undef INVLIST_ZERO_OFFSET
8279 #undef INVLIST_ITER_OFFSET
8280 #undef INVLIST_VERSION_ID
8281 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8282
8283 /* End of inversion list object */
8284
8285 /*
8286  - reg - regular expression, i.e. main body or parenthesized thing
8287  *
8288  * Caller must absorb opening parenthesis.
8289  *
8290  * Combining parenthesis handling with the base level of regular expression
8291  * is a trifle forced, but the need to tie the tails of the branches to what
8292  * follows makes it hard to avoid.
8293  */
8294 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8295 #ifdef DEBUGGING
8296 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8297 #else
8298 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8299 #endif
8300
8301 STATIC regnode *
8302 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8303     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8304 {
8305     dVAR;
8306     regnode *ret;               /* Will be the head of the group. */
8307     regnode *br;
8308     regnode *lastbr;
8309     regnode *ender = NULL;
8310     I32 parno = 0;
8311     I32 flags;
8312     U32 oregflags = RExC_flags;
8313     bool have_branch = 0;
8314     bool is_open = 0;
8315     I32 freeze_paren = 0;
8316     I32 after_freeze = 0;
8317
8318     /* for (?g), (?gc), and (?o) warnings; warning
8319        about (?c) will warn about (?g) -- japhy    */
8320
8321 #define WASTED_O  0x01
8322 #define WASTED_G  0x02
8323 #define WASTED_C  0x04
8324 #define WASTED_GC (0x02|0x04)
8325     I32 wastedflags = 0x00;
8326
8327     char * parse_start = RExC_parse; /* MJD */
8328     char * const oregcomp_parse = RExC_parse;
8329
8330     GET_RE_DEBUG_FLAGS_DECL;
8331
8332     PERL_ARGS_ASSERT_REG;
8333     DEBUG_PARSE("reg ");
8334
8335     *flagp = 0;                         /* Tentatively. */
8336
8337
8338     /* Make an OPEN node, if parenthesized. */
8339     if (paren) {
8340         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8341             char *start_verb = RExC_parse;
8342             STRLEN verb_len = 0;
8343             char *start_arg = NULL;
8344             unsigned char op = 0;
8345             int argok = 1;
8346             int internal_argval = 0; /* internal_argval is only useful if !argok */
8347             while ( *RExC_parse && *RExC_parse != ')' ) {
8348                 if ( *RExC_parse == ':' ) {
8349                     start_arg = RExC_parse + 1;
8350                     break;
8351                 }
8352                 RExC_parse++;
8353             }
8354             ++start_verb;
8355             verb_len = RExC_parse - start_verb;
8356             if ( start_arg ) {
8357                 RExC_parse++;
8358                 while ( *RExC_parse && *RExC_parse != ')' )
8359                     RExC_parse++;
8360                 if ( *RExC_parse != ')' )
8361                     vFAIL("Unterminated verb pattern argument");
8362                 if ( RExC_parse == start_arg )
8363                     start_arg = NULL;
8364             } else {
8365                 if ( *RExC_parse != ')' )
8366                     vFAIL("Unterminated verb pattern");
8367             }
8368
8369             switch ( *start_verb ) {
8370             case 'A':  /* (*ACCEPT) */
8371                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8372                     op = ACCEPT;
8373                     internal_argval = RExC_nestroot;
8374                 }
8375                 break;
8376             case 'C':  /* (*COMMIT) */
8377                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8378                     op = COMMIT;
8379                 break;
8380             case 'F':  /* (*FAIL) */
8381                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8382                     op = OPFAIL;
8383                     argok = 0;
8384                 }
8385                 break;
8386             case ':':  /* (*:NAME) */
8387             case 'M':  /* (*MARK:NAME) */
8388                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8389                     op = MARKPOINT;
8390                     argok = -1;
8391                 }
8392                 break;
8393             case 'P':  /* (*PRUNE) */
8394                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8395                     op = PRUNE;
8396                 break;
8397             case 'S':   /* (*SKIP) */
8398                 if ( memEQs(start_verb,verb_len,"SKIP") )
8399                     op = SKIP;
8400                 break;
8401             case 'T':  /* (*THEN) */
8402                 /* [19:06] <TimToady> :: is then */
8403                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8404                     op = CUTGROUP;
8405                     RExC_seen |= REG_SEEN_CUTGROUP;
8406                 }
8407                 break;
8408             }
8409             if ( ! op ) {
8410                 RExC_parse++;
8411                 vFAIL3("Unknown verb pattern '%.*s'",
8412                     verb_len, start_verb);
8413             }
8414             if ( argok ) {
8415                 if ( start_arg && internal_argval ) {
8416                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8417                         verb_len, start_verb);
8418                 } else if ( argok < 0 && !start_arg ) {
8419                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8420                         verb_len, start_verb);
8421                 } else {
8422                     ret = reganode(pRExC_state, op, internal_argval);
8423                     if ( ! internal_argval && ! SIZE_ONLY ) {
8424                         if (start_arg) {
8425                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8426                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8427                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8428                             ret->flags = 0;
8429                         } else {
8430                             ret->flags = 1;
8431                         }
8432                     }
8433                 }
8434                 if (!internal_argval)
8435                     RExC_seen |= REG_SEEN_VERBARG;
8436             } else if ( start_arg ) {
8437                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8438                         verb_len, start_verb);
8439             } else {
8440                 ret = reg_node(pRExC_state, op);
8441             }
8442             nextchar(pRExC_state);
8443             return ret;
8444         } else
8445         if (*RExC_parse == '?') { /* (?...) */
8446             bool is_logical = 0;
8447             const char * const seqstart = RExC_parse;
8448             bool has_use_defaults = FALSE;
8449
8450             RExC_parse++;
8451             paren = *RExC_parse++;
8452             ret = NULL;                 /* For look-ahead/behind. */
8453             switch (paren) {
8454
8455             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8456                 paren = *RExC_parse++;
8457                 if ( paren == '<')         /* (?P<...>) named capture */
8458                     goto named_capture;
8459                 else if (paren == '>') {   /* (?P>name) named recursion */
8460                     goto named_recursion;
8461                 }
8462                 else if (paren == '=') {   /* (?P=...)  named backref */
8463                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8464                        you change this make sure you change that */
8465                     char* name_start = RExC_parse;
8466                     U32 num = 0;
8467                     SV *sv_dat = reg_scan_name(pRExC_state,
8468                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8469                     if (RExC_parse == name_start || *RExC_parse != ')')
8470                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8471
8472                     if (!SIZE_ONLY) {
8473                         num = add_data( pRExC_state, 1, "S" );
8474                         RExC_rxi->data->data[num]=(void*)sv_dat;
8475                         SvREFCNT_inc_simple_void(sv_dat);
8476                     }
8477                     RExC_sawback = 1;
8478                     ret = reganode(pRExC_state,
8479                                    ((! FOLD)
8480                                      ? NREF
8481                                      : (ASCII_FOLD_RESTRICTED)
8482                                        ? NREFFA
8483                                        : (AT_LEAST_UNI_SEMANTICS)
8484                                          ? NREFFU
8485                                          : (LOC)
8486                                            ? NREFFL
8487                                            : NREFF),
8488                                     num);
8489                     *flagp |= HASWIDTH;
8490
8491                     Set_Node_Offset(ret, parse_start+1);
8492                     Set_Node_Cur_Length(ret); /* MJD */
8493
8494                     nextchar(pRExC_state);
8495                     return ret;
8496                 }
8497                 RExC_parse++;
8498                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8499                 /*NOTREACHED*/
8500             case '<':           /* (?<...) */
8501                 if (*RExC_parse == '!')
8502                     paren = ',';
8503                 else if (*RExC_parse != '=')
8504               named_capture:
8505                 {               /* (?<...>) */
8506                     char *name_start;
8507                     SV *svname;
8508                     paren= '>';
8509             case '\'':          /* (?'...') */
8510                     name_start= RExC_parse;
8511                     svname = reg_scan_name(pRExC_state,
8512                         SIZE_ONLY ?  /* reverse test from the others */
8513                         REG_RSN_RETURN_NAME :
8514                         REG_RSN_RETURN_NULL);
8515                     if (RExC_parse == name_start) {
8516                         RExC_parse++;
8517                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8518                         /*NOTREACHED*/
8519                     }
8520                     if (*RExC_parse != paren)
8521                         vFAIL2("Sequence (?%c... not terminated",
8522                             paren=='>' ? '<' : paren);
8523                     if (SIZE_ONLY) {
8524                         HE *he_str;
8525                         SV *sv_dat = NULL;
8526                         if (!svname) /* shouldn't happen */
8527                             Perl_croak(aTHX_
8528                                 "panic: reg_scan_name returned NULL");
8529                         if (!RExC_paren_names) {
8530                             RExC_paren_names= newHV();
8531                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8532 #ifdef DEBUGGING
8533                             RExC_paren_name_list= newAV();
8534                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8535 #endif
8536                         }
8537                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8538                         if ( he_str )
8539                             sv_dat = HeVAL(he_str);
8540                         if ( ! sv_dat ) {
8541                             /* croak baby croak */
8542                             Perl_croak(aTHX_
8543                                 "panic: paren_name hash element allocation failed");
8544                         } else if ( SvPOK(sv_dat) ) {
8545                             /* (?|...) can mean we have dupes so scan to check
8546                                its already been stored. Maybe a flag indicating
8547                                we are inside such a construct would be useful,
8548                                but the arrays are likely to be quite small, so
8549                                for now we punt -- dmq */
8550                             IV count = SvIV(sv_dat);
8551                             I32 *pv = (I32*)SvPVX(sv_dat);
8552                             IV i;
8553                             for ( i = 0 ; i < count ; i++ ) {
8554                                 if ( pv[i] == RExC_npar ) {
8555                                     count = 0;
8556                                     break;
8557                                 }
8558                             }
8559                             if ( count ) {
8560                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8561                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8562                                 pv[count] = RExC_npar;
8563                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8564                             }
8565                         } else {
8566                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8567                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8568                             SvIOK_on(sv_dat);
8569                             SvIV_set(sv_dat, 1);
8570                         }
8571 #ifdef DEBUGGING
8572                         /* Yes this does cause a memory leak in debugging Perls */
8573                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8574                             SvREFCNT_dec_NN(svname);
8575 #endif
8576
8577                         /*sv_dump(sv_dat);*/
8578                     }
8579                     nextchar(pRExC_state);
8580                     paren = 1;
8581                     goto capturing_parens;
8582                 }
8583                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8584                 RExC_in_lookbehind++;
8585                 RExC_parse++;
8586             case '=':           /* (?=...) */
8587                 RExC_seen_zerolen++;
8588                 break;
8589             case '!':           /* (?!...) */
8590                 RExC_seen_zerolen++;
8591                 if (*RExC_parse == ')') {
8592                     ret=reg_node(pRExC_state, OPFAIL);
8593                     nextchar(pRExC_state);
8594                     return ret;
8595                 }
8596                 break;
8597             case '|':           /* (?|...) */
8598                 /* branch reset, behave like a (?:...) except that
8599                    buffers in alternations share the same numbers */
8600                 paren = ':';
8601                 after_freeze = freeze_paren = RExC_npar;
8602                 break;
8603             case ':':           /* (?:...) */
8604             case '>':           /* (?>...) */
8605                 break;
8606             case '$':           /* (?$...) */
8607             case '@':           /* (?@...) */
8608                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8609                 break;
8610             case '#':           /* (?#...) */
8611                 while (*RExC_parse && *RExC_parse != ')')
8612                     RExC_parse++;
8613                 if (*RExC_parse != ')')
8614                     FAIL("Sequence (?#... not terminated");
8615                 nextchar(pRExC_state);
8616                 *flagp = TRYAGAIN;
8617                 return NULL;
8618             case '0' :           /* (?0) */
8619             case 'R' :           /* (?R) */
8620                 if (*RExC_parse != ')')
8621                     FAIL("Sequence (?R) not terminated");
8622                 ret = reg_node(pRExC_state, GOSTART);
8623                 *flagp |= POSTPONED;
8624                 nextchar(pRExC_state);
8625                 return ret;
8626                 /*notreached*/
8627             { /* named and numeric backreferences */
8628                 I32 num;
8629             case '&':            /* (?&NAME) */
8630                 parse_start = RExC_parse - 1;
8631               named_recursion:
8632                 {
8633                     SV *sv_dat = reg_scan_name(pRExC_state,
8634                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8635                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8636                 }
8637                 goto gen_recurse_regop;
8638                 assert(0); /* NOT REACHED */
8639             case '+':
8640                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8641                     RExC_parse++;
8642                     vFAIL("Illegal pattern");
8643                 }
8644                 goto parse_recursion;
8645                 /* NOT REACHED*/
8646             case '-': /* (?-1) */
8647                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8648                     RExC_parse--; /* rewind to let it be handled later */
8649                     goto parse_flags;
8650                 }
8651                 /*FALLTHROUGH */
8652             case '1': case '2': case '3': case '4': /* (?1) */
8653             case '5': case '6': case '7': case '8': case '9':
8654                 RExC_parse--;
8655               parse_recursion:
8656                 num = atoi(RExC_parse);
8657                 parse_start = RExC_parse - 1; /* MJD */
8658                 if (*RExC_parse == '-')
8659                     RExC_parse++;
8660                 while (isDIGIT(*RExC_parse))
8661                         RExC_parse++;
8662                 if (*RExC_parse!=')')
8663                     vFAIL("Expecting close bracket");
8664
8665               gen_recurse_regop:
8666                 if ( paren == '-' ) {
8667                     /*
8668                     Diagram of capture buffer numbering.
8669                     Top line is the normal capture buffer numbers
8670                     Bottom line is the negative indexing as from
8671                     the X (the (?-2))
8672
8673                     +   1 2    3 4 5 X          6 7
8674                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8675                     -   5 4    3 2 1 X          x x
8676
8677                     */
8678                     num = RExC_npar + num;
8679                     if (num < 1)  {
8680                         RExC_parse++;
8681                         vFAIL("Reference to nonexistent group");
8682                     }
8683                 } else if ( paren == '+' ) {
8684                     num = RExC_npar + num - 1;
8685                 }
8686
8687                 ret = reganode(pRExC_state, GOSUB, num);
8688                 if (!SIZE_ONLY) {
8689                     if (num > (I32)RExC_rx->nparens) {
8690                         RExC_parse++;
8691                         vFAIL("Reference to nonexistent group");
8692                     }
8693                     ARG2L_SET( ret, RExC_recurse_count++);
8694                     RExC_emit++;
8695                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8696                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8697                 } else {
8698                     RExC_size++;
8699                 }
8700                 RExC_seen |= REG_SEEN_RECURSE;
8701                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8702                 Set_Node_Offset(ret, parse_start); /* MJD */
8703
8704                 *flagp |= POSTPONED;
8705                 nextchar(pRExC_state);
8706                 return ret;
8707             } /* named and numeric backreferences */
8708             assert(0); /* NOT REACHED */
8709
8710             case '?':           /* (??...) */
8711                 is_logical = 1;
8712                 if (*RExC_parse != '{') {
8713                     RExC_parse++;
8714                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8715                     /*NOTREACHED*/
8716                 }
8717                 *flagp |= POSTPONED;
8718                 paren = *RExC_parse++;
8719                 /* FALL THROUGH */
8720             case '{':           /* (?{...}) */
8721             {
8722                 U32 n = 0;
8723                 struct reg_code_block *cb;
8724
8725                 RExC_seen_zerolen++;
8726
8727                 if (   !pRExC_state->num_code_blocks
8728                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8729                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8730                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8731                             - RExC_start)
8732                 ) {
8733                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8734                         FAIL("panic: Sequence (?{...}): no code block found\n");
8735                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8736                 }
8737                 /* this is a pre-compiled code block (?{...}) */
8738                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8739                 RExC_parse = RExC_start + cb->end;
8740                 if (!SIZE_ONLY) {
8741                     OP *o = cb->block;
8742                     if (cb->src_regex) {
8743                         n = add_data(pRExC_state, 2, "rl");
8744                         RExC_rxi->data->data[n] =
8745                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8746                         RExC_rxi->data->data[n+1] = (void*)o;
8747                     }
8748                     else {
8749                         n = add_data(pRExC_state, 1,
8750                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8751                         RExC_rxi->data->data[n] = (void*)o;
8752                     }
8753                 }
8754                 pRExC_state->code_index++;
8755                 nextchar(pRExC_state);
8756
8757                 if (is_logical) {
8758                     regnode *eval;
8759                     ret = reg_node(pRExC_state, LOGICAL);
8760                     eval = reganode(pRExC_state, EVAL, n);
8761                     if (!SIZE_ONLY) {
8762                         ret->flags = 2;
8763                         /* for later propagation into (??{}) return value */
8764                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8765                     }
8766                     REGTAIL(pRExC_state, ret, eval);
8767                     /* deal with the length of this later - MJD */
8768                     return ret;
8769                 }
8770                 ret = reganode(pRExC_state, EVAL, n);
8771                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8772                 Set_Node_Offset(ret, parse_start);
8773                 return ret;
8774             }
8775             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8776             {
8777                 int is_define= 0;
8778                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8779                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8780                         || RExC_parse[1] == '<'
8781                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8782                         I32 flag;
8783
8784                         ret = reg_node(pRExC_state, LOGICAL);
8785                         if (!SIZE_ONLY)
8786                             ret->flags = 1;
8787                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8788                         goto insert_if;
8789                     }
8790                 }
8791                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8792                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8793                 {
8794                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8795                     char *name_start= RExC_parse++;
8796                     U32 num = 0;
8797                     SV *sv_dat=reg_scan_name(pRExC_state,
8798                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8799                     if (RExC_parse == name_start || *RExC_parse != ch)
8800                         vFAIL2("Sequence (?(%c... not terminated",
8801                             (ch == '>' ? '<' : ch));
8802                     RExC_parse++;
8803                     if (!SIZE_ONLY) {
8804                         num = add_data( pRExC_state, 1, "S" );
8805                         RExC_rxi->data->data[num]=(void*)sv_dat;
8806                         SvREFCNT_inc_simple_void(sv_dat);
8807                     }
8808                     ret = reganode(pRExC_state,NGROUPP,num);
8809                     goto insert_if_check_paren;
8810                 }
8811                 else if (RExC_parse[0] == 'D' &&
8812                          RExC_parse[1] == 'E' &&
8813                          RExC_parse[2] == 'F' &&
8814                          RExC_parse[3] == 'I' &&
8815                          RExC_parse[4] == 'N' &&
8816                          RExC_parse[5] == 'E')
8817                 {
8818                     ret = reganode(pRExC_state,DEFINEP,0);
8819                     RExC_parse +=6 ;
8820                     is_define = 1;
8821                     goto insert_if_check_paren;
8822                 }
8823                 else if (RExC_parse[0] == 'R') {
8824                     RExC_parse++;
8825                     parno = 0;
8826                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8827                         parno = atoi(RExC_parse++);
8828                         while (isDIGIT(*RExC_parse))
8829                             RExC_parse++;
8830                     } else if (RExC_parse[0] == '&') {
8831                         SV *sv_dat;
8832                         RExC_parse++;
8833                         sv_dat = reg_scan_name(pRExC_state,
8834                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8835                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8836                     }
8837                     ret = reganode(pRExC_state,INSUBP,parno);
8838                     goto insert_if_check_paren;
8839                 }
8840                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8841                     /* (?(1)...) */
8842                     char c;
8843                     parno = atoi(RExC_parse++);
8844
8845                     while (isDIGIT(*RExC_parse))
8846                         RExC_parse++;
8847                     ret = reganode(pRExC_state, GROUPP, parno);
8848
8849                  insert_if_check_paren:
8850                     if ((c = *nextchar(pRExC_state)) != ')')
8851                         vFAIL("Switch condition not recognized");
8852                   insert_if:
8853                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8854                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8855                     if (br == NULL)
8856                         br = reganode(pRExC_state, LONGJMP, 0);
8857                     else
8858                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8859                     c = *nextchar(pRExC_state);
8860                     if (flags&HASWIDTH)
8861                         *flagp |= HASWIDTH;
8862                     if (c == '|') {
8863                         if (is_define)
8864                             vFAIL("(?(DEFINE)....) does not allow branches");
8865                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8866                         regbranch(pRExC_state, &flags, 1,depth+1);
8867                         REGTAIL(pRExC_state, ret, lastbr);
8868                         if (flags&HASWIDTH)
8869                             *flagp |= HASWIDTH;
8870                         c = *nextchar(pRExC_state);
8871                     }
8872                     else
8873                         lastbr = NULL;
8874                     if (c != ')')
8875                         vFAIL("Switch (?(condition)... contains too many branches");
8876                     ender = reg_node(pRExC_state, TAIL);
8877                     REGTAIL(pRExC_state, br, ender);
8878                     if (lastbr) {
8879                         REGTAIL(pRExC_state, lastbr, ender);
8880                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8881                     }
8882                     else
8883                         REGTAIL(pRExC_state, ret, ender);
8884                     RExC_size++; /* XXX WHY do we need this?!!
8885                                     For large programs it seems to be required
8886                                     but I can't figure out why. -- dmq*/
8887                     return ret;
8888                 }
8889                 else {
8890                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8891                 }
8892             }
8893             case 0:
8894                 RExC_parse--; /* for vFAIL to print correctly */
8895                 vFAIL("Sequence (? incomplete");
8896                 break;
8897             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8898                                        that follow */
8899                 has_use_defaults = TRUE;
8900                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8901                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8902                                                 ? REGEX_UNICODE_CHARSET
8903                                                 : REGEX_DEPENDS_CHARSET);
8904                 goto parse_flags;
8905             default:
8906                 --RExC_parse;
8907                 parse_flags:      /* (?i) */
8908             {
8909                 U32 posflags = 0, negflags = 0;
8910                 U32 *flagsp = &posflags;
8911                 char has_charset_modifier = '\0';
8912                 regex_charset cs = get_regex_charset(RExC_flags);
8913                 if (cs == REGEX_DEPENDS_CHARSET
8914                     && (RExC_utf8 || RExC_uni_semantics))
8915                 {
8916                     cs = REGEX_UNICODE_CHARSET;
8917                 }
8918
8919                 while (*RExC_parse) {
8920                     /* && strchr("iogcmsx", *RExC_parse) */
8921                     /* (?g), (?gc) and (?o) are useless here
8922                        and must be globally applied -- japhy */
8923                     switch (*RExC_parse) {
8924                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8925                     case LOCALE_PAT_MOD:
8926                         if (has_charset_modifier) {
8927                             goto excess_modifier;
8928                         }
8929                         else if (flagsp == &negflags) {
8930                             goto neg_modifier;
8931                         }
8932                         cs = REGEX_LOCALE_CHARSET;
8933                         has_charset_modifier = LOCALE_PAT_MOD;
8934                         RExC_contains_locale = 1;
8935                         break;
8936                     case UNICODE_PAT_MOD:
8937                         if (has_charset_modifier) {
8938                             goto excess_modifier;
8939                         }
8940                         else if (flagsp == &negflags) {
8941                             goto neg_modifier;
8942                         }
8943                         cs = REGEX_UNICODE_CHARSET;
8944                         has_charset_modifier = UNICODE_PAT_MOD;
8945                         break;
8946                     case ASCII_RESTRICT_PAT_MOD:
8947                         if (flagsp == &negflags) {
8948                             goto neg_modifier;
8949                         }
8950                         if (has_charset_modifier) {
8951                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8952                                 goto excess_modifier;
8953                             }
8954                             /* Doubled modifier implies more restricted */
8955                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8956                         }
8957                         else {
8958                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8959                         }
8960                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8961                         break;
8962                     case DEPENDS_PAT_MOD:
8963                         if (has_use_defaults) {
8964                             goto fail_modifiers;
8965                         }
8966                         else if (flagsp == &negflags) {
8967                             goto neg_modifier;
8968                         }
8969                         else if (has_charset_modifier) {
8970                             goto excess_modifier;
8971                         }
8972
8973                         /* The dual charset means unicode semantics if the
8974                          * pattern (or target, not known until runtime) are
8975                          * utf8, or something in the pattern indicates unicode
8976                          * semantics */
8977                         cs = (RExC_utf8 || RExC_uni_semantics)
8978                              ? REGEX_UNICODE_CHARSET
8979                              : REGEX_DEPENDS_CHARSET;
8980                         has_charset_modifier = DEPENDS_PAT_MOD;
8981                         break;
8982                     excess_modifier:
8983                         RExC_parse++;
8984                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8985                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8986                         }
8987                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8988                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8989                         }
8990                         else {
8991                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8992                         }
8993                         /*NOTREACHED*/
8994                     neg_modifier:
8995                         RExC_parse++;
8996                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8997                         /*NOTREACHED*/
8998                     case ONCE_PAT_MOD: /* 'o' */
8999                     case GLOBAL_PAT_MOD: /* 'g' */
9000                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9001                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9002                             if (! (wastedflags & wflagbit) ) {
9003                                 wastedflags |= wflagbit;
9004                                 vWARN5(
9005                                     RExC_parse + 1,
9006                                     "Useless (%s%c) - %suse /%c modifier",
9007                                     flagsp == &negflags ? "?-" : "?",
9008                                     *RExC_parse,
9009                                     flagsp == &negflags ? "don't " : "",
9010                                     *RExC_parse
9011                                 );
9012                             }
9013                         }
9014                         break;
9015
9016                     case CONTINUE_PAT_MOD: /* 'c' */
9017                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9018                             if (! (wastedflags & WASTED_C) ) {
9019                                 wastedflags |= WASTED_GC;
9020                                 vWARN3(
9021                                     RExC_parse + 1,
9022                                     "Useless (%sc) - %suse /gc modifier",
9023                                     flagsp == &negflags ? "?-" : "?",
9024                                     flagsp == &negflags ? "don't " : ""
9025                                 );
9026                             }
9027                         }
9028                         break;
9029                     case KEEPCOPY_PAT_MOD: /* 'p' */
9030                         if (flagsp == &negflags) {
9031                             if (SIZE_ONLY)
9032                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9033                         } else {
9034                             *flagsp |= RXf_PMf_KEEPCOPY;
9035                         }
9036                         break;
9037                     case '-':
9038                         /* A flag is a default iff it is following a minus, so
9039                          * if there is a minus, it means will be trying to
9040                          * re-specify a default which is an error */
9041                         if (has_use_defaults || flagsp == &negflags) {
9042             fail_modifiers:
9043                             RExC_parse++;
9044                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9045                             /*NOTREACHED*/
9046                         }
9047                         flagsp = &negflags;
9048                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9049                         break;
9050                     case ':':
9051                         paren = ':';
9052                         /*FALLTHROUGH*/
9053                     case ')':
9054                         RExC_flags |= posflags;
9055                         RExC_flags &= ~negflags;
9056                         set_regex_charset(&RExC_flags, cs);
9057                         if (paren != ':') {
9058                             oregflags |= posflags;
9059                             oregflags &= ~negflags;
9060                             set_regex_charset(&oregflags, cs);
9061                         }
9062                         nextchar(pRExC_state);
9063                         if (paren != ':') {
9064                             *flagp = TRYAGAIN;
9065                             return NULL;
9066                         } else {
9067                             ret = NULL;
9068                             goto parse_rest;
9069                         }
9070                         /*NOTREACHED*/
9071                     default:
9072                         RExC_parse++;
9073                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9074                         /*NOTREACHED*/
9075                     }
9076                     ++RExC_parse;
9077                 }
9078             }} /* one for the default block, one for the switch */
9079         }
9080         else {                  /* (...) */
9081           capturing_parens:
9082             parno = RExC_npar;
9083             RExC_npar++;
9084
9085             ret = reganode(pRExC_state, OPEN, parno);
9086             if (!SIZE_ONLY ){
9087                 if (!RExC_nestroot)
9088                     RExC_nestroot = parno;
9089                 if (RExC_seen & REG_SEEN_RECURSE
9090                     && !RExC_open_parens[parno-1])
9091                 {
9092                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9093                         "Setting open paren #%"IVdf" to %d\n",
9094                         (IV)parno, REG_NODE_NUM(ret)));
9095                     RExC_open_parens[parno-1]= ret;
9096                 }
9097             }
9098             Set_Node_Length(ret, 1); /* MJD */
9099             Set_Node_Offset(ret, RExC_parse); /* MJD */
9100             is_open = 1;
9101         }
9102     }
9103     else                        /* ! paren */
9104         ret = NULL;
9105
9106    parse_rest:
9107     /* Pick up the branches, linking them together. */
9108     parse_start = RExC_parse;   /* MJD */
9109     br = regbranch(pRExC_state, &flags, 1,depth+1);
9110
9111     /*     branch_len = (paren != 0); */
9112
9113     if (br == NULL)
9114         return(NULL);
9115     if (*RExC_parse == '|') {
9116         if (!SIZE_ONLY && RExC_extralen) {
9117             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9118         }
9119         else {                  /* MJD */
9120             reginsert(pRExC_state, BRANCH, br, depth+1);
9121             Set_Node_Length(br, paren != 0);
9122             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9123         }
9124         have_branch = 1;
9125         if (SIZE_ONLY)
9126             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9127     }
9128     else if (paren == ':') {
9129         *flagp |= flags&SIMPLE;
9130     }
9131     if (is_open) {                              /* Starts with OPEN. */
9132         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9133     }
9134     else if (paren != '?')              /* Not Conditional */
9135         ret = br;
9136     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9137     lastbr = br;
9138     while (*RExC_parse == '|') {
9139         if (!SIZE_ONLY && RExC_extralen) {
9140             ender = reganode(pRExC_state, LONGJMP,0);
9141             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9142         }
9143         if (SIZE_ONLY)
9144             RExC_extralen += 2;         /* Account for LONGJMP. */
9145         nextchar(pRExC_state);
9146         if (freeze_paren) {
9147             if (RExC_npar > after_freeze)
9148                 after_freeze = RExC_npar;
9149             RExC_npar = freeze_paren;
9150         }
9151         br = regbranch(pRExC_state, &flags, 0, depth+1);
9152
9153         if (br == NULL)
9154             return(NULL);
9155         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9156         lastbr = br;
9157         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9158     }
9159
9160     if (have_branch || paren != ':') {
9161         /* Make a closing node, and hook it on the end. */
9162         switch (paren) {
9163         case ':':
9164             ender = reg_node(pRExC_state, TAIL);
9165             break;
9166         case 1:
9167             ender = reganode(pRExC_state, CLOSE, parno);
9168             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9169                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9170                         "Setting close paren #%"IVdf" to %d\n",
9171                         (IV)parno, REG_NODE_NUM(ender)));
9172                 RExC_close_parens[parno-1]= ender;
9173                 if (RExC_nestroot == parno)
9174                     RExC_nestroot = 0;
9175             }
9176             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9177             Set_Node_Length(ender,1); /* MJD */
9178             break;
9179         case '<':
9180         case ',':
9181         case '=':
9182         case '!':
9183             *flagp &= ~HASWIDTH;
9184             /* FALL THROUGH */
9185         case '>':
9186             ender = reg_node(pRExC_state, SUCCEED);
9187             break;
9188         case 0:
9189             ender = reg_node(pRExC_state, END);
9190             if (!SIZE_ONLY) {
9191                 assert(!RExC_opend); /* there can only be one! */
9192                 RExC_opend = ender;
9193             }
9194             break;
9195         }
9196         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9197             SV * const mysv_val1=sv_newmortal();
9198             SV * const mysv_val2=sv_newmortal();
9199             DEBUG_PARSE_MSG("lsbr");
9200             regprop(RExC_rx, mysv_val1, lastbr);
9201             regprop(RExC_rx, mysv_val2, ender);
9202             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9203                           SvPV_nolen_const(mysv_val1),
9204                           (IV)REG_NODE_NUM(lastbr),
9205                           SvPV_nolen_const(mysv_val2),
9206                           (IV)REG_NODE_NUM(ender),
9207                           (IV)(ender - lastbr)
9208             );
9209         });
9210         REGTAIL(pRExC_state, lastbr, ender);
9211
9212         if (have_branch && !SIZE_ONLY) {
9213             char is_nothing= 1;
9214             if (depth==1)
9215                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9216
9217             /* Hook the tails of the branches to the closing node. */
9218             for (br = ret; br; br = regnext(br)) {
9219                 const U8 op = PL_regkind[OP(br)];
9220                 if (op == BRANCH) {
9221                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9222                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9223                         is_nothing= 0;
9224                 }
9225                 else if (op == BRANCHJ) {
9226                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9227                     /* for now we always disable this optimisation * /
9228                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9229                     */
9230                         is_nothing= 0;
9231                 }
9232             }
9233             if (is_nothing) {
9234                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9235                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9236                     SV * const mysv_val1=sv_newmortal();
9237                     SV * const mysv_val2=sv_newmortal();
9238                     DEBUG_PARSE_MSG("NADA");
9239                     regprop(RExC_rx, mysv_val1, ret);
9240                     regprop(RExC_rx, mysv_val2, ender);
9241                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9242                                   SvPV_nolen_const(mysv_val1),
9243                                   (IV)REG_NODE_NUM(ret),
9244                                   SvPV_nolen_const(mysv_val2),
9245                                   (IV)REG_NODE_NUM(ender),
9246                                   (IV)(ender - ret)
9247                     );
9248                 });
9249                 OP(br)= NOTHING;
9250                 if (OP(ender) == TAIL) {
9251                     NEXT_OFF(br)= 0;
9252                     RExC_emit= br + 1;
9253                 } else {
9254                     regnode *opt;
9255                     for ( opt= br + 1; opt < ender ; opt++ )
9256                         OP(opt)= OPTIMIZED;
9257                     NEXT_OFF(br)= ender - br;
9258                 }
9259             }
9260         }
9261     }
9262
9263     {
9264         const char *p;
9265         static const char parens[] = "=!<,>";
9266
9267         if (paren && (p = strchr(parens, paren))) {
9268             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9269             int flag = (p - parens) > 1;
9270
9271             if (paren == '>')
9272                 node = SUSPEND, flag = 0;
9273             reginsert(pRExC_state, node,ret, depth+1);
9274             Set_Node_Cur_Length(ret);
9275             Set_Node_Offset(ret, parse_start + 1);
9276             ret->flags = flag;
9277             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9278         }
9279     }
9280
9281     /* Check for proper termination. */
9282     if (paren) {
9283         RExC_flags = oregflags;
9284         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9285             RExC_parse = oregcomp_parse;
9286             vFAIL("Unmatched (");
9287         }
9288     }
9289     else if (!paren && RExC_parse < RExC_end) {
9290         if (*RExC_parse == ')') {
9291             RExC_parse++;
9292             vFAIL("Unmatched )");
9293         }
9294         else
9295             FAIL("Junk on end of regexp");      /* "Can't happen". */
9296         assert(0); /* NOTREACHED */
9297     }
9298
9299     if (RExC_in_lookbehind) {
9300         RExC_in_lookbehind--;
9301     }
9302     if (after_freeze > RExC_npar)
9303         RExC_npar = after_freeze;
9304     return(ret);
9305 }
9306
9307 /*
9308  - regbranch - one alternative of an | operator
9309  *
9310  * Implements the concatenation operator.
9311  */
9312 STATIC regnode *
9313 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9314 {
9315     dVAR;
9316     regnode *ret;
9317     regnode *chain = NULL;
9318     regnode *latest;
9319     I32 flags = 0, c = 0;
9320     GET_RE_DEBUG_FLAGS_DECL;
9321
9322     PERL_ARGS_ASSERT_REGBRANCH;
9323
9324     DEBUG_PARSE("brnc");
9325
9326     if (first)
9327         ret = NULL;
9328     else {
9329         if (!SIZE_ONLY && RExC_extralen)
9330             ret = reganode(pRExC_state, BRANCHJ,0);
9331         else {
9332             ret = reg_node(pRExC_state, BRANCH);
9333             Set_Node_Length(ret, 1);
9334         }
9335     }
9336
9337     if (!first && SIZE_ONLY)
9338         RExC_extralen += 1;                     /* BRANCHJ */
9339
9340     *flagp = WORST;                     /* Tentatively. */
9341
9342     RExC_parse--;
9343     nextchar(pRExC_state);
9344     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9345         flags &= ~TRYAGAIN;
9346         latest = regpiece(pRExC_state, &flags,depth+1);
9347         if (latest == NULL) {
9348             if (flags & TRYAGAIN)
9349                 continue;
9350             return(NULL);
9351         }
9352         else if (ret == NULL)
9353             ret = latest;
9354         *flagp |= flags&(HASWIDTH|POSTPONED);
9355         if (chain == NULL)      /* First piece. */
9356             *flagp |= flags&SPSTART;
9357         else {
9358             RExC_naughty++;
9359             REGTAIL(pRExC_state, chain, latest);
9360         }
9361         chain = latest;
9362         c++;
9363     }
9364     if (chain == NULL) {        /* Loop ran zero times. */
9365         chain = reg_node(pRExC_state, NOTHING);
9366         if (ret == NULL)
9367             ret = chain;
9368     }
9369     if (c == 1) {
9370         *flagp |= flags&SIMPLE;
9371     }
9372
9373     return ret;
9374 }
9375
9376 /*
9377  - regpiece - something followed by possible [*+?]
9378  *
9379  * Note that the branching code sequences used for ? and the general cases
9380  * of * and + are somewhat optimized:  they use the same NOTHING node as
9381  * both the endmarker for their branch list and the body of the last branch.
9382  * It might seem that this node could be dispensed with entirely, but the
9383  * endmarker role is not redundant.
9384  */
9385 STATIC regnode *
9386 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9387 {
9388     dVAR;
9389     regnode *ret;
9390     char op;
9391     char *next;
9392     I32 flags;
9393     const char * const origparse = RExC_parse;
9394     I32 min;
9395     I32 max = REG_INFTY;
9396 #ifdef RE_TRACK_PATTERN_OFFSETS
9397     char *parse_start;
9398 #endif
9399     const char *maxpos = NULL;
9400
9401     /* Save the original in case we change the emitted regop to a FAIL. */
9402     regnode * const orig_emit = RExC_emit;
9403
9404     GET_RE_DEBUG_FLAGS_DECL;
9405
9406     PERL_ARGS_ASSERT_REGPIECE;
9407
9408     DEBUG_PARSE("piec");
9409
9410     ret = regatom(pRExC_state, &flags,depth+1);
9411     if (ret == NULL) {
9412         if (flags & TRYAGAIN)
9413             *flagp |= TRYAGAIN;
9414         return(NULL);
9415     }
9416
9417     op = *RExC_parse;
9418
9419     if (op == '{' && regcurly(RExC_parse)) {
9420         maxpos = NULL;
9421 #ifdef RE_TRACK_PATTERN_OFFSETS
9422         parse_start = RExC_parse; /* MJD */
9423 #endif
9424         next = RExC_parse + 1;
9425         while (isDIGIT(*next) || *next == ',') {
9426             if (*next == ',') {
9427                 if (maxpos)
9428                     break;
9429                 else
9430                     maxpos = next;
9431             }
9432             next++;
9433         }
9434         if (*next == '}') {             /* got one */
9435             if (!maxpos)
9436                 maxpos = next;
9437             RExC_parse++;
9438             min = atoi(RExC_parse);
9439             if (*maxpos == ',')
9440                 maxpos++;
9441             else
9442                 maxpos = RExC_parse;
9443             max = atoi(maxpos);
9444             if (!max && *maxpos != '0')
9445                 max = REG_INFTY;                /* meaning "infinity" */
9446             else if (max >= REG_INFTY)
9447                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9448             RExC_parse = next;
9449             nextchar(pRExC_state);
9450             if (max < min) {    /* If can't match, warn and optimize to fail
9451                                    unconditionally */
9452                 if (SIZE_ONLY) {
9453                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9454
9455                     /* We can't back off the size because we have to reserve
9456                      * enough space for all the things we are about to throw
9457                      * away, but we can shrink it by the ammount we are about
9458                      * to re-use here */
9459                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9460                 }
9461                 else {
9462                     RExC_emit = orig_emit;
9463                 }
9464                 ret = reg_node(pRExC_state, OPFAIL);
9465                 return ret;
9466             }
9467             else if (max == 0) {    /* replace {0} with a nothing node */
9468                 if (SIZE_ONLY) {
9469                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9470                 }
9471                 else {
9472                     RExC_emit = orig_emit;
9473                 }
9474                 ret = reg_node(pRExC_state, NOTHING);
9475                 return ret;
9476             }
9477
9478         do_curly:
9479             if ((flags&SIMPLE)) {
9480                 RExC_naughty += 2 + RExC_naughty / 2;
9481                 reginsert(pRExC_state, CURLY, ret, depth+1);
9482                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9483                 Set_Node_Cur_Length(ret);
9484             }
9485             else {
9486                 regnode * const w = reg_node(pRExC_state, WHILEM);
9487
9488                 w->flags = 0;
9489                 REGTAIL(pRExC_state, ret, w);
9490                 if (!SIZE_ONLY && RExC_extralen) {
9491                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9492                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9493                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9494                 }
9495                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9496                                 /* MJD hk */
9497                 Set_Node_Offset(ret, parse_start+1);
9498                 Set_Node_Length(ret,
9499                                 op == '{' ? (RExC_parse - parse_start) : 1);
9500
9501                 if (!SIZE_ONLY && RExC_extralen)
9502                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9503                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9504                 if (SIZE_ONLY)
9505                     RExC_whilem_seen++, RExC_extralen += 3;
9506                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9507             }
9508             ret->flags = 0;
9509
9510             if (min > 0)
9511                 *flagp = WORST;
9512             if (max > 0)
9513                 *flagp |= HASWIDTH;
9514             if (!SIZE_ONLY) {
9515                 ARG1_SET(ret, (U16)min);
9516                 ARG2_SET(ret, (U16)max);
9517             }
9518
9519             goto nest_check;
9520         }
9521     }
9522
9523     if (!ISMULT1(op)) {
9524         *flagp = flags;
9525         return(ret);
9526     }
9527
9528 #if 0                           /* Now runtime fix should be reliable. */
9529
9530     /* if this is reinstated, don't forget to put this back into perldiag:
9531
9532             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9533
9534            (F) The part of the regexp subject to either the * or + quantifier
9535            could match an empty string. The {#} shows in the regular
9536            expression about where the problem was discovered.
9537
9538     */
9539
9540     if (!(flags&HASWIDTH) && op != '?')
9541       vFAIL("Regexp *+ operand could be empty");
9542 #endif
9543
9544 #ifdef RE_TRACK_PATTERN_OFFSETS
9545     parse_start = RExC_parse;
9546 #endif
9547     nextchar(pRExC_state);
9548
9549     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9550
9551     if (op == '*' && (flags&SIMPLE)) {
9552         reginsert(pRExC_state, STAR, ret, depth+1);
9553         ret->flags = 0;
9554         RExC_naughty += 4;
9555     }
9556     else if (op == '*') {
9557         min = 0;
9558         goto do_curly;
9559     }
9560     else if (op == '+' && (flags&SIMPLE)) {
9561         reginsert(pRExC_state, PLUS, ret, depth+1);
9562         ret->flags = 0;
9563         RExC_naughty += 3;
9564     }
9565     else if (op == '+') {
9566         min = 1;
9567         goto do_curly;
9568     }
9569     else if (op == '?') {
9570         min = 0; max = 1;
9571         goto do_curly;
9572     }
9573   nest_check:
9574     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9575         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9576         ckWARN3reg(RExC_parse,
9577                    "%.*s matches null string many times",
9578                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9579                    origparse);
9580         (void)ReREFCNT_inc(RExC_rx_sv);
9581     }
9582
9583     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9584         nextchar(pRExC_state);
9585         reginsert(pRExC_state, MINMOD, ret, depth+1);
9586         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9587     }
9588 #ifndef REG_ALLOW_MINMOD_SUSPEND
9589     else
9590 #endif
9591     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9592         regnode *ender;
9593         nextchar(pRExC_state);
9594         ender = reg_node(pRExC_state, SUCCEED);
9595         REGTAIL(pRExC_state, ret, ender);
9596         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9597         ret->flags = 0;
9598         ender = reg_node(pRExC_state, TAIL);
9599         REGTAIL(pRExC_state, ret, ender);
9600         /*ret= ender;*/
9601     }
9602
9603     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9604         RExC_parse++;
9605         vFAIL("Nested quantifiers");
9606     }
9607
9608     return(ret);
9609 }
9610
9611 STATIC bool
9612 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9613 {
9614
9615  /* This is expected to be called by a parser routine that has recognized '\N'
9616    and needs to handle the rest. RExC_parse is expected to point at the first
9617    char following the N at the time of the call.  On successful return,
9618    RExC_parse has been updated to point to just after the sequence identified
9619    by this routine, and <*flagp> has been updated.
9620
9621    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9622    character class.
9623
9624    \N may begin either a named sequence, or if outside a character class, mean
9625    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9626    attempted to decide which, and in the case of a named sequence, converted it
9627    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9628    where c1... are the characters in the sequence.  For single-quoted regexes,
9629    the tokenizer passes the \N sequence through unchanged; this code will not
9630    attempt to determine this nor expand those, instead raising a syntax error.
9631    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9632    or there is no '}', it signals that this \N occurrence means to match a
9633    non-newline.
9634
9635    Only the \N{U+...} form should occur in a character class, for the same
9636    reason that '.' inside a character class means to just match a period: it
9637    just doesn't make sense.
9638
9639    The function raises an error (via vFAIL), and doesn't return for various
9640    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9641    success; it returns FALSE otherwise.
9642
9643    If <valuep> is non-null, it means the caller can accept an input sequence
9644    consisting of a just a single code point; <*valuep> is set to that value
9645    if the input is such.
9646
9647    If <node_p> is non-null it signifies that the caller can accept any other
9648    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9649    is set as follows:
9650     1) \N means not-a-NL: points to a newly created REG_ANY node;
9651     2) \N{}:              points to a new NOTHING node;
9652     3) otherwise:         points to a new EXACT node containing the resolved
9653                           string.
9654    Note that FALSE is returned for single code point sequences if <valuep> is
9655    null.
9656  */
9657
9658     char * endbrace;    /* '}' following the name */
9659     char* p;
9660     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9661                            stream */
9662     bool has_multiple_chars; /* true if the input stream contains a sequence of
9663                                 more than one character */
9664
9665     GET_RE_DEBUG_FLAGS_DECL;
9666
9667     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9668
9669     GET_RE_DEBUG_FLAGS;
9670
9671     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9672
9673     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9674      * modifier.  The other meaning does not */
9675     p = (RExC_flags & RXf_PMf_EXTENDED)
9676         ? regwhite( pRExC_state, RExC_parse )
9677         : RExC_parse;
9678
9679     /* Disambiguate between \N meaning a named character versus \N meaning
9680      * [^\n].  The former is assumed when it can't be the latter. */
9681     if (*p != '{' || regcurly(p)) {
9682         RExC_parse = p;
9683         if (! node_p) {
9684             /* no bare \N in a charclass */
9685             if (in_char_class) {
9686                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9687             }
9688             return FALSE;
9689         }
9690         nextchar(pRExC_state);
9691         *node_p = reg_node(pRExC_state, REG_ANY);
9692         *flagp |= HASWIDTH|SIMPLE;
9693         RExC_naughty++;
9694         RExC_parse--;
9695         Set_Node_Length(*node_p, 1); /* MJD */
9696         return TRUE;
9697     }
9698
9699     /* Here, we have decided it should be a named character or sequence */
9700
9701     /* The test above made sure that the next real character is a '{', but
9702      * under the /x modifier, it could be separated by space (or a comment and
9703      * \n) and this is not allowed (for consistency with \x{...} and the
9704      * tokenizer handling of \N{NAME}). */
9705     if (*RExC_parse != '{') {
9706         vFAIL("Missing braces on \\N{}");
9707     }
9708
9709     RExC_parse++;       /* Skip past the '{' */
9710
9711     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9712         || ! (endbrace == RExC_parse            /* nothing between the {} */
9713               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9714                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9715     {
9716         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9717         vFAIL("\\N{NAME} must be resolved by the lexer");
9718     }
9719
9720     if (endbrace == RExC_parse) {   /* empty: \N{} */
9721         bool ret = TRUE;
9722         if (node_p) {
9723             *node_p = reg_node(pRExC_state,NOTHING);
9724         }
9725         else if (in_char_class) {
9726             if (SIZE_ONLY && in_char_class) {
9727                 ckWARNreg(RExC_parse,
9728                         "Ignoring zero length \\N{} in character class"
9729                 );
9730             }
9731             ret = FALSE;
9732         }
9733         else {
9734             return FALSE;
9735         }
9736         nextchar(pRExC_state);
9737         return ret;
9738     }
9739
9740     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9741     RExC_parse += 2;    /* Skip past the 'U+' */
9742
9743     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9744
9745     /* Code points are separated by dots.  If none, there is only one code
9746      * point, and is terminated by the brace */
9747     has_multiple_chars = (endchar < endbrace);
9748
9749     if (valuep && (! has_multiple_chars || in_char_class)) {
9750         /* We only pay attention to the first char of
9751         multichar strings being returned in char classes. I kinda wonder
9752         if this makes sense as it does change the behaviour
9753         from earlier versions, OTOH that behaviour was broken
9754         as well. XXX Solution is to recharacterize as
9755         [rest-of-class]|multi1|multi2... */
9756
9757         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9758         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9759             | PERL_SCAN_DISALLOW_PREFIX
9760             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9761
9762         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9763
9764         /* The tokenizer should have guaranteed validity, but it's possible to
9765          * bypass it by using single quoting, so check */
9766         if (length_of_hex == 0
9767             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9768         {
9769             RExC_parse += length_of_hex;        /* Includes all the valid */
9770             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9771                             ? UTF8SKIP(RExC_parse)
9772                             : 1;
9773             /* Guard against malformed utf8 */
9774             if (RExC_parse >= endchar) {
9775                 RExC_parse = endchar;
9776             }
9777             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9778         }
9779
9780         if (in_char_class && has_multiple_chars) {
9781             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9782         }
9783
9784         RExC_parse = endbrace + 1;
9785     }
9786     else if (! node_p || ! has_multiple_chars) {
9787
9788         /* Here, the input is legal, but not according to the caller's
9789          * options.  We fail without advancing the parse, so that the
9790          * caller can try again */
9791         RExC_parse = p;
9792         return FALSE;
9793     }
9794     else {
9795
9796         /* What is done here is to convert this to a sub-pattern of the form
9797          * (?:\x{char1}\x{char2}...)
9798          * and then call reg recursively.  That way, it retains its atomicness,
9799          * while not having to worry about special handling that some code
9800          * points may have.  toke.c has converted the original Unicode values
9801          * to native, so that we can just pass on the hex values unchanged.  We
9802          * do have to set a flag to keep recoding from happening in the
9803          * recursion */
9804
9805         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9806         STRLEN len;
9807         char *orig_end = RExC_end;
9808         I32 flags;
9809
9810         while (RExC_parse < endbrace) {
9811
9812             /* Convert to notation the rest of the code understands */
9813             sv_catpv(substitute_parse, "\\x{");
9814             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9815             sv_catpv(substitute_parse, "}");
9816
9817             /* Point to the beginning of the next character in the sequence. */
9818             RExC_parse = endchar + 1;
9819             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9820         }
9821         sv_catpv(substitute_parse, ")");
9822
9823         RExC_parse = SvPV(substitute_parse, len);
9824
9825         /* Don't allow empty number */
9826         if (len < 8) {
9827             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9828         }
9829         RExC_end = RExC_parse + len;
9830
9831         /* The values are Unicode, and therefore not subject to recoding */
9832         RExC_override_recoding = 1;
9833
9834         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9835         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9836
9837         RExC_parse = endbrace;
9838         RExC_end = orig_end;
9839         RExC_override_recoding = 0;
9840
9841         nextchar(pRExC_state);
9842     }
9843
9844     return TRUE;
9845 }
9846
9847
9848 /*
9849  * reg_recode
9850  *
9851  * It returns the code point in utf8 for the value in *encp.
9852  *    value: a code value in the source encoding
9853  *    encp:  a pointer to an Encode object
9854  *
9855  * If the result from Encode is not a single character,
9856  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9857  */
9858 STATIC UV
9859 S_reg_recode(pTHX_ const char value, SV **encp)
9860 {
9861     STRLEN numlen = 1;
9862     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9863     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9864     const STRLEN newlen = SvCUR(sv);
9865     UV uv = UNICODE_REPLACEMENT;
9866
9867     PERL_ARGS_ASSERT_REG_RECODE;
9868
9869     if (newlen)
9870         uv = SvUTF8(sv)
9871              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9872              : *(U8*)s;
9873
9874     if (!newlen || numlen != newlen) {
9875         uv = UNICODE_REPLACEMENT;
9876         *encp = NULL;
9877     }
9878     return uv;
9879 }
9880
9881 PERL_STATIC_INLINE U8
9882 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9883 {
9884     U8 op;
9885
9886     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9887
9888     if (! FOLD) {
9889         return EXACT;
9890     }
9891
9892     op = get_regex_charset(RExC_flags);
9893     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9894         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9895                  been, so there is no hole */
9896     }
9897
9898     return op + EXACTF;
9899 }
9900
9901 PERL_STATIC_INLINE void
9902 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9903 {
9904     /* This knows the details about sizing an EXACTish node, setting flags for
9905      * it (by setting <*flagp>, and potentially populating it with a single
9906      * character.
9907      *
9908      * If <len> (the length in bytes) is non-zero, this function assumes that
9909      * the node has already been populated, and just does the sizing.  In this
9910      * case <code_point> should be the final code point that has already been
9911      * placed into the node.  This value will be ignored except that under some
9912      * circumstances <*flagp> is set based on it.
9913      *
9914      * If <len> is zero, the function assumes that the node is to contain only
9915      * the single character given by <code_point> and calculates what <len>
9916      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9917      * additionally will populate the node's STRING with <code_point>, if <len>
9918      * is 0.  In both cases <*flagp> is appropriately set
9919      *
9920      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9921      * folded (the latter only when the rules indicate it can match 'ss') */
9922
9923     bool len_passed_in = cBOOL(len != 0);
9924     U8 character[UTF8_MAXBYTES_CASE+1];
9925
9926     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9927
9928     if (! len_passed_in) {
9929         if (UTF) {
9930             if (FOLD) {
9931                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9932             }
9933             else {
9934                 uvchr_to_utf8( character, code_point);
9935                 len = UTF8SKIP(character);
9936             }
9937         }
9938         else if (! FOLD
9939                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9940                  || ASCII_FOLD_RESTRICTED
9941                  || ! AT_LEAST_UNI_SEMANTICS)
9942         {
9943             *character = (U8) code_point;
9944             len = 1;
9945         }
9946         else {
9947             *character = 's';
9948             *(character + 1) = 's';
9949             len = 2;
9950         }
9951     }
9952
9953     if (SIZE_ONLY) {
9954         RExC_size += STR_SZ(len);
9955     }
9956     else {
9957         RExC_emit += STR_SZ(len);
9958         STR_LEN(node) = len;
9959         if (! len_passed_in) {
9960             Copy((char *) character, STRING(node), len, char);
9961         }
9962     }
9963
9964     *flagp |= HASWIDTH;
9965
9966     /* A single character node is SIMPLE, except for the special-cased SHARP S
9967      * under /di. */
9968     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9969         && (code_point != LATIN_SMALL_LETTER_SHARP_S
9970             || ! FOLD || ! DEPENDS_SEMANTICS))
9971     {
9972         *flagp |= SIMPLE;
9973     }
9974 }
9975
9976 /*
9977  - regatom - the lowest level
9978
9979    Try to identify anything special at the start of the pattern. If there
9980    is, then handle it as required. This may involve generating a single regop,
9981    such as for an assertion; or it may involve recursing, such as to
9982    handle a () structure.
9983
9984    If the string doesn't start with something special then we gobble up
9985    as much literal text as we can.
9986
9987    Once we have been able to handle whatever type of thing started the
9988    sequence, we return.
9989
9990    Note: we have to be careful with escapes, as they can be both literal
9991    and special, and in the case of \10 and friends, context determines which.
9992
9993    A summary of the code structure is:
9994
9995    switch (first_byte) {
9996         cases for each special:
9997             handle this special;
9998             break;
9999         case '\\':
10000             switch (2nd byte) {
10001                 cases for each unambiguous special:
10002                     handle this special;
10003                     break;
10004                 cases for each ambigous special/literal:
10005                     disambiguate;
10006                     if (special)  handle here
10007                     else goto defchar;
10008                 default: // unambiguously literal:
10009                     goto defchar;
10010             }
10011         default:  // is a literal char
10012             // FALL THROUGH
10013         defchar:
10014             create EXACTish node for literal;
10015             while (more input and node isn't full) {
10016                 switch (input_byte) {
10017                    cases for each special;
10018                        make sure parse pointer is set so that the next call to
10019                            regatom will see this special first
10020                        goto loopdone; // EXACTish node terminated by prev. char
10021                    default:
10022                        append char to EXACTISH node;
10023                 }
10024                 get next input byte;
10025             }
10026         loopdone:
10027    }
10028    return the generated node;
10029
10030    Specifically there are two separate switches for handling
10031    escape sequences, with the one for handling literal escapes requiring
10032    a dummy entry for all of the special escapes that are actually handled
10033    by the other.
10034 */
10035
10036 STATIC regnode *
10037 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10038 {
10039     dVAR;
10040     regnode *ret = NULL;
10041     I32 flags;
10042     char *parse_start = RExC_parse;
10043     U8 op;
10044     int invert = 0;
10045
10046     GET_RE_DEBUG_FLAGS_DECL;
10047
10048     *flagp = WORST;             /* Tentatively. */
10049
10050     DEBUG_PARSE("atom");
10051
10052     PERL_ARGS_ASSERT_REGATOM;
10053
10054 tryagain:
10055     switch ((U8)*RExC_parse) {
10056     case '^':
10057         RExC_seen_zerolen++;
10058         nextchar(pRExC_state);
10059         if (RExC_flags & RXf_PMf_MULTILINE)
10060             ret = reg_node(pRExC_state, MBOL);
10061         else if (RExC_flags & RXf_PMf_SINGLELINE)
10062             ret = reg_node(pRExC_state, SBOL);
10063         else
10064             ret = reg_node(pRExC_state, BOL);
10065         Set_Node_Length(ret, 1); /* MJD */
10066         break;
10067     case '$':
10068         nextchar(pRExC_state);
10069         if (*RExC_parse)
10070             RExC_seen_zerolen++;
10071         if (RExC_flags & RXf_PMf_MULTILINE)
10072             ret = reg_node(pRExC_state, MEOL);
10073         else if (RExC_flags & RXf_PMf_SINGLELINE)
10074             ret = reg_node(pRExC_state, SEOL);
10075         else
10076             ret = reg_node(pRExC_state, EOL);
10077         Set_Node_Length(ret, 1); /* MJD */
10078         break;
10079     case '.':
10080         nextchar(pRExC_state);
10081         if (RExC_flags & RXf_PMf_SINGLELINE)
10082             ret = reg_node(pRExC_state, SANY);
10083         else
10084             ret = reg_node(pRExC_state, REG_ANY);
10085         *flagp |= HASWIDTH|SIMPLE;
10086         RExC_naughty++;
10087         Set_Node_Length(ret, 1); /* MJD */
10088         break;
10089     case '[':
10090     {
10091         char * const oregcomp_parse = ++RExC_parse;
10092         ret = regclass(pRExC_state, flagp,depth+1);
10093         if (*RExC_parse != ']') {
10094             RExC_parse = oregcomp_parse;
10095             vFAIL("Unmatched [");
10096         }
10097         nextchar(pRExC_state);
10098         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10099         break;
10100     }
10101     case '(':
10102         nextchar(pRExC_state);
10103         ret = reg(pRExC_state, 1, &flags,depth+1);
10104         if (ret == NULL) {
10105                 if (flags & TRYAGAIN) {
10106                     if (RExC_parse == RExC_end) {
10107                          /* Make parent create an empty node if needed. */
10108                         *flagp |= TRYAGAIN;
10109                         return(NULL);
10110                     }
10111                     goto tryagain;
10112                 }
10113                 return(NULL);
10114         }
10115         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10116         break;
10117     case '|':
10118     case ')':
10119         if (flags & TRYAGAIN) {
10120             *flagp |= TRYAGAIN;
10121             return NULL;
10122         }
10123         vFAIL("Internal urp");
10124                                 /* Supposed to be caught earlier. */
10125         break;
10126     case '?':
10127     case '+':
10128     case '*':
10129         RExC_parse++;
10130         vFAIL("Quantifier follows nothing");
10131         break;
10132     case '\\':
10133         /* Special Escapes
10134
10135            This switch handles escape sequences that resolve to some kind
10136            of special regop and not to literal text. Escape sequnces that
10137            resolve to literal text are handled below in the switch marked
10138            "Literal Escapes".
10139
10140            Every entry in this switch *must* have a corresponding entry
10141            in the literal escape switch. However, the opposite is not
10142            required, as the default for this switch is to jump to the
10143            literal text handling code.
10144         */
10145         switch ((U8)*++RExC_parse) {
10146             U8 arg;
10147         /* Special Escapes */
10148         case 'A':
10149             RExC_seen_zerolen++;
10150             ret = reg_node(pRExC_state, SBOL);
10151             *flagp |= SIMPLE;
10152             goto finish_meta_pat;
10153         case 'G':
10154             ret = reg_node(pRExC_state, GPOS);
10155             RExC_seen |= REG_SEEN_GPOS;
10156             *flagp |= SIMPLE;
10157             goto finish_meta_pat;
10158         case 'K':
10159             RExC_seen_zerolen++;
10160             ret = reg_node(pRExC_state, KEEPS);
10161             *flagp |= SIMPLE;
10162             /* XXX:dmq : disabling in-place substitution seems to
10163              * be necessary here to avoid cases of memory corruption, as
10164              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10165              */
10166             RExC_seen |= REG_SEEN_LOOKBEHIND;
10167             goto finish_meta_pat;
10168         case 'Z':
10169             ret = reg_node(pRExC_state, SEOL);
10170             *flagp |= SIMPLE;
10171             RExC_seen_zerolen++;                /* Do not optimize RE away */
10172             goto finish_meta_pat;
10173         case 'z':
10174             ret = reg_node(pRExC_state, EOS);
10175             *flagp |= SIMPLE;
10176             RExC_seen_zerolen++;                /* Do not optimize RE away */
10177             goto finish_meta_pat;
10178         case 'C':
10179             ret = reg_node(pRExC_state, CANY);
10180             RExC_seen |= REG_SEEN_CANY;
10181             *flagp |= HASWIDTH|SIMPLE;
10182             goto finish_meta_pat;
10183         case 'X':
10184             ret = reg_node(pRExC_state, CLUMP);
10185             *flagp |= HASWIDTH;
10186             goto finish_meta_pat;
10187
10188         case 'W':
10189             invert = 1;
10190             /* FALLTHROUGH */
10191         case 'w':
10192             arg = ANYOF_WORDCHAR;
10193             goto join_posix;
10194
10195         case 'b':
10196             RExC_seen_zerolen++;
10197             RExC_seen |= REG_SEEN_LOOKBEHIND;
10198             op = BOUND + get_regex_charset(RExC_flags);
10199             if (op > BOUNDA) {  /* /aa is same as /a */
10200                 op = BOUNDA;
10201             }
10202             ret = reg_node(pRExC_state, op);
10203             FLAGS(ret) = get_regex_charset(RExC_flags);
10204             *flagp |= SIMPLE;
10205             goto finish_meta_pat;
10206         case 'B':
10207             RExC_seen_zerolen++;
10208             RExC_seen |= REG_SEEN_LOOKBEHIND;
10209             op = NBOUND + get_regex_charset(RExC_flags);
10210             if (op > NBOUNDA) { /* /aa is same as /a */
10211                 op = NBOUNDA;
10212             }
10213             ret = reg_node(pRExC_state, op);
10214             FLAGS(ret) = get_regex_charset(RExC_flags);
10215             *flagp |= SIMPLE;
10216             goto finish_meta_pat;
10217
10218         case 'D':
10219             invert = 1;
10220             /* FALLTHROUGH */
10221         case 'd':
10222             arg = ANYOF_DIGIT;
10223             goto join_posix;
10224
10225         case 'R':
10226             ret = reg_node(pRExC_state, LNBREAK);
10227             *flagp |= HASWIDTH|SIMPLE;
10228             goto finish_meta_pat;
10229
10230         case 'H':
10231             invert = 1;
10232             /* FALLTHROUGH */
10233         case 'h':
10234             arg = ANYOF_BLANK;
10235             op = POSIXU;
10236             goto join_posix_op_known;
10237
10238         case 'V':
10239             invert = 1;
10240             /* FALLTHROUGH */
10241         case 'v':
10242             arg = ANYOF_VERTWS;
10243             op = POSIXU;
10244             goto join_posix_op_known;
10245
10246         case 'S':
10247             invert = 1;
10248             /* FALLTHROUGH */
10249         case 's':
10250             arg = ANYOF_SPACE;
10251
10252         join_posix:
10253
10254             op = POSIXD + get_regex_charset(RExC_flags);
10255             if (op > POSIXA) {  /* /aa is same as /a */
10256                 op = POSIXA;
10257             }
10258
10259         join_posix_op_known:
10260
10261             if (invert) {
10262                 op += NPOSIXD - POSIXD;
10263             }
10264
10265             ret = reg_node(pRExC_state, op);
10266             if (! SIZE_ONLY) {
10267                 FLAGS(ret) = namedclass_to_classnum(arg);
10268             }
10269
10270             *flagp |= HASWIDTH|SIMPLE;
10271             /* FALL THROUGH */
10272
10273          finish_meta_pat:
10274             nextchar(pRExC_state);
10275             Set_Node_Length(ret, 2); /* MJD */
10276             break;
10277         case 'p':
10278         case 'P':
10279             {
10280                 char* const oldregxend = RExC_end;
10281 #ifdef DEBUGGING
10282                 char* parse_start = RExC_parse - 2;
10283 #endif
10284
10285                 if (RExC_parse[1] == '{') {
10286                   /* a lovely hack--pretend we saw [\pX] instead */
10287                     RExC_end = strchr(RExC_parse, '}');
10288                     if (!RExC_end) {
10289                         const U8 c = (U8)*RExC_parse;
10290                         RExC_parse += 2;
10291                         RExC_end = oldregxend;
10292                         vFAIL2("Missing right brace on \\%c{}", c);
10293                     }
10294                     RExC_end++;
10295                 }
10296                 else {
10297                     RExC_end = RExC_parse + 2;
10298                     if (RExC_end > oldregxend)
10299                         RExC_end = oldregxend;
10300                 }
10301                 RExC_parse--;
10302
10303                 ret = regclass(pRExC_state, flagp,depth+1);
10304
10305                 RExC_end = oldregxend;
10306                 RExC_parse--;
10307
10308                 Set_Node_Offset(ret, parse_start + 2);
10309                 Set_Node_Cur_Length(ret);
10310                 nextchar(pRExC_state);
10311             }
10312             break;
10313         case 'N':
10314             /* Handle \N and \N{NAME} with multiple code points here and not
10315              * below because it can be multicharacter. join_exact() will join
10316              * them up later on.  Also this makes sure that things like
10317              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10318              * The options to the grok function call causes it to fail if the
10319              * sequence is just a single code point.  We then go treat it as
10320              * just another character in the current EXACT node, and hence it
10321              * gets uniform treatment with all the other characters.  The
10322              * special treatment for quantifiers is not needed for such single
10323              * character sequences */
10324             ++RExC_parse;
10325             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10326                 RExC_parse--;
10327                 goto defchar;
10328             }
10329             break;
10330         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10331         parse_named_seq:
10332         {
10333             char ch= RExC_parse[1];
10334             if (ch != '<' && ch != '\'' && ch != '{') {
10335                 RExC_parse++;
10336                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10337             } else {
10338                 /* this pretty much dupes the code for (?P=...) in reg(), if
10339                    you change this make sure you change that */
10340                 char* name_start = (RExC_parse += 2);
10341                 U32 num = 0;
10342                 SV *sv_dat = reg_scan_name(pRExC_state,
10343                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10344                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10345                 if (RExC_parse == name_start || *RExC_parse != ch)
10346                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10347
10348                 if (!SIZE_ONLY) {
10349                     num = add_data( pRExC_state, 1, "S" );
10350                     RExC_rxi->data->data[num]=(void*)sv_dat;
10351                     SvREFCNT_inc_simple_void(sv_dat);
10352                 }
10353
10354                 RExC_sawback = 1;
10355                 ret = reganode(pRExC_state,
10356                                ((! FOLD)
10357                                  ? NREF
10358                                  : (ASCII_FOLD_RESTRICTED)
10359                                    ? NREFFA
10360                                    : (AT_LEAST_UNI_SEMANTICS)
10361                                      ? NREFFU
10362                                      : (LOC)
10363                                        ? NREFFL
10364                                        : NREFF),
10365                                 num);
10366                 *flagp |= HASWIDTH;
10367
10368                 /* override incorrect value set in reganode MJD */
10369                 Set_Node_Offset(ret, parse_start+1);
10370                 Set_Node_Cur_Length(ret); /* MJD */
10371                 nextchar(pRExC_state);
10372
10373             }
10374             break;
10375         }
10376         case 'g':
10377         case '1': case '2': case '3': case '4':
10378         case '5': case '6': case '7': case '8': case '9':
10379             {
10380                 I32 num;
10381                 bool isg = *RExC_parse == 'g';
10382                 bool isrel = 0;
10383                 bool hasbrace = 0;
10384                 if (isg) {
10385                     RExC_parse++;
10386                     if (*RExC_parse == '{') {
10387                         RExC_parse++;
10388                         hasbrace = 1;
10389                     }
10390                     if (*RExC_parse == '-') {
10391                         RExC_parse++;
10392                         isrel = 1;
10393                     }
10394                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10395                         if (isrel) RExC_parse--;
10396                         RExC_parse -= 2;
10397                         goto parse_named_seq;
10398                 }   }
10399                 num = atoi(RExC_parse);
10400                 if (isg && num == 0)
10401                     vFAIL("Reference to invalid group 0");
10402                 if (isrel) {
10403                     num = RExC_npar - num;
10404                     if (num < 1)
10405                         vFAIL("Reference to nonexistent or unclosed group");
10406                 }
10407                 if (!isg && num > 9 && num >= RExC_npar)
10408                     /* Probably a character specified in octal, e.g. \35 */
10409                     goto defchar;
10410                 else {
10411                     char * const parse_start = RExC_parse - 1; /* MJD */
10412                     while (isDIGIT(*RExC_parse))
10413                         RExC_parse++;
10414                     if (parse_start == RExC_parse - 1)
10415                         vFAIL("Unterminated \\g... pattern");
10416                     if (hasbrace) {
10417                         if (*RExC_parse != '}')
10418                             vFAIL("Unterminated \\g{...} pattern");
10419                         RExC_parse++;
10420                     }
10421                     if (!SIZE_ONLY) {
10422                         if (num > (I32)RExC_rx->nparens)
10423                             vFAIL("Reference to nonexistent group");
10424                     }
10425                     RExC_sawback = 1;
10426                     ret = reganode(pRExC_state,
10427                                    ((! FOLD)
10428                                      ? REF
10429                                      : (ASCII_FOLD_RESTRICTED)
10430                                        ? REFFA
10431                                        : (AT_LEAST_UNI_SEMANTICS)
10432                                          ? REFFU
10433                                          : (LOC)
10434                                            ? REFFL
10435                                            : REFF),
10436                                     num);
10437                     *flagp |= HASWIDTH;
10438
10439                     /* override incorrect value set in reganode MJD */
10440                     Set_Node_Offset(ret, parse_start+1);
10441                     Set_Node_Cur_Length(ret); /* MJD */
10442                     RExC_parse--;
10443                     nextchar(pRExC_state);
10444                 }
10445             }
10446             break;
10447         case '\0':
10448             if (RExC_parse >= RExC_end)
10449                 FAIL("Trailing \\");
10450             /* FALL THROUGH */
10451         default:
10452             /* Do not generate "unrecognized" warnings here, we fall
10453                back into the quick-grab loop below */
10454             parse_start--;
10455             goto defchar;
10456         }
10457         break;
10458
10459     case '#':
10460         if (RExC_flags & RXf_PMf_EXTENDED) {
10461             if ( reg_skipcomment( pRExC_state ) )
10462                 goto tryagain;
10463         }
10464         /* FALL THROUGH */
10465
10466     default:
10467
10468             parse_start = RExC_parse - 1;
10469
10470             RExC_parse++;
10471
10472         defchar: {
10473             STRLEN len = 0;
10474             UV ender;
10475             char *p;
10476             char *s;
10477 #define MAX_NODE_STRING_SIZE 127
10478             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10479             char *s0;
10480             U8 upper_parse = MAX_NODE_STRING_SIZE;
10481             STRLEN foldlen;
10482             U8 node_type;
10483             bool next_is_quantifier;
10484             char * oldp = NULL;
10485
10486             /* If a folding node contains only code points that don't
10487              * participate in folds, it can be changed into an EXACT node,
10488              * which allows the optimizer more things to look for */
10489             bool maybe_exact;
10490
10491             ender = 0;
10492             node_type = compute_EXACTish(pRExC_state);
10493             ret = reg_node(pRExC_state, node_type);
10494
10495             /* In pass1, folded, we use a temporary buffer instead of the
10496              * actual node, as the node doesn't exist yet */
10497             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10498
10499             s0 = s;
10500
10501         reparse:
10502
10503             /* We do the EXACTFish to EXACT node only if folding, and not if in
10504              * locale, as whether a character folds or not isn't known until
10505              * runtime */
10506             maybe_exact = FOLD && ! LOC;
10507
10508             /* XXX The node can hold up to 255 bytes, yet this only goes to
10509              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10510              * 255 allows us to not have to worry about overflow due to
10511              * converting to utf8 and fold expansion, but that value is
10512              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10513              * split up by this limit into a single one using the real max of
10514              * 255.  Even at 127, this breaks under rare circumstances.  If
10515              * folding, we do not want to split a node at a character that is a
10516              * non-final in a multi-char fold, as an input string could just
10517              * happen to want to match across the node boundary.  The join
10518              * would solve that problem if the join actually happens.  But a
10519              * series of more than two nodes in a row each of 127 would cause
10520              * the first join to succeed to get to 254, but then there wouldn't
10521              * be room for the next one, which could at be one of those split
10522              * multi-char folds.  I don't know of any fool-proof solution.  One
10523              * could back off to end with only a code point that isn't such a
10524              * non-final, but it is possible for there not to be any in the
10525              * entire node. */
10526             for (p = RExC_parse - 1;
10527                  len < upper_parse && p < RExC_end;
10528                  len++)
10529             {
10530                 oldp = p;
10531
10532                 if (RExC_flags & RXf_PMf_EXTENDED)
10533                     p = regwhite( pRExC_state, p );
10534                 switch ((U8)*p) {
10535                 case '^':
10536                 case '$':
10537                 case '.':
10538                 case '[':
10539                 case '(':
10540                 case ')':
10541                 case '|':
10542                     goto loopdone;
10543                 case '\\':
10544                     /* Literal Escapes Switch
10545
10546                        This switch is meant to handle escape sequences that
10547                        resolve to a literal character.
10548
10549                        Every escape sequence that represents something
10550                        else, like an assertion or a char class, is handled
10551                        in the switch marked 'Special Escapes' above in this
10552                        routine, but also has an entry here as anything that
10553                        isn't explicitly mentioned here will be treated as
10554                        an unescaped equivalent literal.
10555                     */
10556
10557                     switch ((U8)*++p) {
10558                     /* These are all the special escapes. */
10559                     case 'A':             /* Start assertion */
10560                     case 'b': case 'B':   /* Word-boundary assertion*/
10561                     case 'C':             /* Single char !DANGEROUS! */
10562                     case 'd': case 'D':   /* digit class */
10563                     case 'g': case 'G':   /* generic-backref, pos assertion */
10564                     case 'h': case 'H':   /* HORIZWS */
10565                     case 'k': case 'K':   /* named backref, keep marker */
10566                     case 'p': case 'P':   /* Unicode property */
10567                               case 'R':   /* LNBREAK */
10568                     case 's': case 'S':   /* space class */
10569                     case 'v': case 'V':   /* VERTWS */
10570                     case 'w': case 'W':   /* word class */
10571                     case 'X':             /* eXtended Unicode "combining character sequence" */
10572                     case 'z': case 'Z':   /* End of line/string assertion */
10573                         --p;
10574                         goto loopdone;
10575
10576                     /* Anything after here is an escape that resolves to a
10577                        literal. (Except digits, which may or may not)
10578                      */
10579                     case 'n':
10580                         ender = '\n';
10581                         p++;
10582                         break;
10583                     case 'N': /* Handle a single-code point named character. */
10584                         /* The options cause it to fail if a multiple code
10585                          * point sequence.  Handle those in the switch() above
10586                          * */
10587                         RExC_parse = p + 1;
10588                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10589                                             flagp, depth, FALSE))
10590                         {
10591                             RExC_parse = p = oldp;
10592                             goto loopdone;
10593                         }
10594                         p = RExC_parse;
10595                         if (ender > 0xff) {
10596                             REQUIRE_UTF8;
10597                         }
10598                         break;
10599                     case 'r':
10600                         ender = '\r';
10601                         p++;
10602                         break;
10603                     case 't':
10604                         ender = '\t';
10605                         p++;
10606                         break;
10607                     case 'f':
10608                         ender = '\f';
10609                         p++;
10610                         break;
10611                     case 'e':
10612                           ender = ASCII_TO_NATIVE('\033');
10613                         p++;
10614                         break;
10615                     case 'a':
10616                           ender = ASCII_TO_NATIVE('\007');
10617                         p++;
10618                         break;
10619                     case 'o':
10620                         {
10621                             STRLEN brace_len = len;
10622                             UV result;
10623                             const char* error_msg;
10624
10625                             bool valid = grok_bslash_o(p,
10626                                                        &result,
10627                                                        &brace_len,
10628                                                        &error_msg,
10629                                                        1);
10630                             p += brace_len;
10631                             if (! valid) {
10632                                 RExC_parse = p; /* going to die anyway; point
10633                                                    to exact spot of failure */
10634                                 vFAIL(error_msg);
10635                             }
10636                             else
10637                             {
10638                                 ender = result;
10639                             }
10640                             if (PL_encoding && ender < 0x100) {
10641                                 goto recode_encoding;
10642                             }
10643                             if (ender > 0xff) {
10644                                 REQUIRE_UTF8;
10645                             }
10646                             break;
10647                         }
10648                     case 'x':
10649                         {
10650                             STRLEN brace_len = len;
10651                             UV result;
10652                             const char* error_msg;
10653
10654                             bool valid = grok_bslash_x(p,
10655                                                        &result,
10656                                                        &brace_len,
10657                                                        &error_msg,
10658                                                        1);
10659                             p += brace_len;
10660                             if (! valid) {
10661                                 RExC_parse = p; /* going to die anyway; point
10662                                                    to exact spot of failure */
10663                                 vFAIL(error_msg);
10664                             }
10665                             else {
10666                                 ender = result;
10667                             }
10668                             if (PL_encoding && ender < 0x100) {
10669                                 goto recode_encoding;
10670                             }
10671                             if (ender > 0xff) {
10672                                 REQUIRE_UTF8;
10673                             }
10674                             break;
10675                         }
10676                     case 'c':
10677                         p++;
10678                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10679                         break;
10680                     case '0': case '1': case '2': case '3':case '4':
10681                     case '5': case '6': case '7':
10682                         if (*p == '0' ||
10683                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10684                         {
10685                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10686                             STRLEN numlen = 3;
10687                             ender = grok_oct(p, &numlen, &flags, NULL);
10688                             if (ender > 0xff) {
10689                                 REQUIRE_UTF8;
10690                             }
10691                             p += numlen;
10692                         }
10693                         else {
10694                             --p;
10695                             goto loopdone;
10696                         }
10697                         if (PL_encoding && ender < 0x100)
10698                             goto recode_encoding;
10699                         break;
10700                     recode_encoding:
10701                         if (! RExC_override_recoding) {
10702                             SV* enc = PL_encoding;
10703                             ender = reg_recode((const char)(U8)ender, &enc);
10704                             if (!enc && SIZE_ONLY)
10705                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10706                             REQUIRE_UTF8;
10707                         }
10708                         break;
10709                     case '\0':
10710                         if (p >= RExC_end)
10711                             FAIL("Trailing \\");
10712                         /* FALL THROUGH */
10713                     default:
10714                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10715                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10716                         }
10717                         goto normal_default;
10718                     }
10719                     break;
10720                 case '{':
10721                     /* Currently we don't warn when the lbrace is at the start
10722                      * of a construct.  This catches it in the middle of a
10723                      * literal string, or when its the first thing after
10724                      * something like "\b" */
10725                     if (! SIZE_ONLY
10726                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10727                     {
10728                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10729                     }
10730                     /*FALLTHROUGH*/
10731                 default:
10732                   normal_default:
10733                     if (UTF8_IS_START(*p) && UTF) {
10734                         STRLEN numlen;
10735                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10736                                                &numlen, UTF8_ALLOW_DEFAULT);
10737                         p += numlen;
10738                     }
10739                     else
10740                         ender = (U8) *p++;
10741                     break;
10742                 } /* End of switch on the literal */
10743
10744                 /* Here, have looked at the literal character and <ender>
10745                  * contains its ordinal, <p> points to the character after it
10746                  */
10747
10748                 if ( RExC_flags & RXf_PMf_EXTENDED)
10749                     p = regwhite( pRExC_state, p );
10750
10751                 /* If the next thing is a quantifier, it applies to this
10752                  * character only, which means that this character has to be in
10753                  * its own node and can't just be appended to the string in an
10754                  * existing node, so if there are already other characters in
10755                  * the node, close the node with just them, and set up to do
10756                  * this character again next time through, when it will be the
10757                  * only thing in its new node */
10758                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10759                 {
10760                     p = oldp;
10761                     goto loopdone;
10762                 }
10763
10764                 if (FOLD) {
10765                     if (UTF
10766                             /* See comments for join_exact() as to why we fold
10767                              * this non-UTF at compile time */
10768                         || (node_type == EXACTFU
10769                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10770                     {
10771
10772
10773                         /* Prime the casefolded buffer.  Locale rules, which
10774                          * apply only to code points < 256, aren't known until
10775                          * execution, so for them, just output the original
10776                          * character using utf8.  If we start to fold non-UTF
10777                          * patterns, be sure to update join_exact() */
10778                         if (LOC && ender < 256) {
10779                             if (UNI_IS_INVARIANT(ender)) {
10780                                 *s = (U8) ender;
10781                                 foldlen = 1;
10782                             } else {
10783                                 *s = UTF8_TWO_BYTE_HI(ender);
10784                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10785                                 foldlen = 2;
10786                             }
10787                         }
10788                         else {
10789                             UV folded = _to_uni_fold_flags(
10790                                            ender,
10791                                            (U8 *) s,
10792                                            &foldlen,
10793                                            FOLD_FLAGS_FULL
10794                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10795                                                     : (ASCII_FOLD_RESTRICTED)
10796                                                       ? FOLD_FLAGS_NOMIX_ASCII
10797                                                       : 0)
10798                                             );
10799
10800                             /* If this node only contains non-folding code
10801                              * points so far, see if this new one is also
10802                              * non-folding */
10803                             if (maybe_exact) {
10804                                 if (folded != ender) {
10805                                     maybe_exact = FALSE;
10806                                 }
10807                                 else {
10808                                     /* Here the fold is the original; we have
10809                                      * to check further to see if anything
10810                                      * folds to it */
10811                                     if (! PL_utf8_foldable) {
10812                                         SV* swash = swash_init("utf8",
10813                                                            "_Perl_Any_Folds",
10814                                                            &PL_sv_undef, 1, 0);
10815                                         PL_utf8_foldable =
10816                                                     _get_swash_invlist(swash);
10817                                         SvREFCNT_dec_NN(swash);
10818                                     }
10819                                     if (_invlist_contains_cp(PL_utf8_foldable,
10820                                                              ender))
10821                                     {
10822                                         maybe_exact = FALSE;
10823                                     }
10824                                 }
10825                             }
10826                             ender = folded;
10827                         }
10828                         s += foldlen;
10829
10830                         /* The loop increments <len> each time, as all but this
10831                          * path (and the one just below for UTF) through it add
10832                          * a single byte to the EXACTish node.  But this one
10833                          * has changed len to be the correct final value, so
10834                          * subtract one to cancel out the increment that
10835                          * follows */
10836                         len += foldlen - 1;
10837                     }
10838                     else {
10839                         *(s++) = ender;
10840                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10841                     }
10842                 }
10843                 else if (UTF) {
10844                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10845                     if (unilen > 0) {
10846                        s   += unilen;
10847                        len += unilen;
10848                     }
10849
10850                     /* See comment just above for - 1 */
10851                     len--;
10852                 }
10853                 else {
10854                     REGC((char)ender, s++);
10855                 }
10856
10857                 if (next_is_quantifier) {
10858
10859                     /* Here, the next input is a quantifier, and to get here,
10860                      * the current character is the only one in the node.
10861                      * Also, here <len> doesn't include the final byte for this
10862                      * character */
10863                     len++;
10864                     goto loopdone;
10865                 }
10866
10867             } /* End of loop through literal characters */
10868
10869             /* Here we have either exhausted the input or ran out of room in
10870              * the node.  (If we encountered a character that can't be in the
10871              * node, transfer is made directly to <loopdone>, and so we
10872              * wouldn't have fallen off the end of the loop.)  In the latter
10873              * case, we artificially have to split the node into two, because
10874              * we just don't have enough space to hold everything.  This
10875              * creates a problem if the final character participates in a
10876              * multi-character fold in the non-final position, as a match that
10877              * should have occurred won't, due to the way nodes are matched,
10878              * and our artificial boundary.  So back off until we find a non-
10879              * problematic character -- one that isn't at the beginning or
10880              * middle of such a fold.  (Either it doesn't participate in any
10881              * folds, or appears only in the final position of all the folds it
10882              * does participate in.)  A better solution with far fewer false
10883              * positives, and that would fill the nodes more completely, would
10884              * be to actually have available all the multi-character folds to
10885              * test against, and to back-off only far enough to be sure that
10886              * this node isn't ending with a partial one.  <upper_parse> is set
10887              * further below (if we need to reparse the node) to include just
10888              * up through that final non-problematic character that this code
10889              * identifies, so when it is set to less than the full node, we can
10890              * skip the rest of this */
10891             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10892
10893                 const STRLEN full_len = len;
10894
10895                 assert(len >= MAX_NODE_STRING_SIZE);
10896
10897                 /* Here, <s> points to the final byte of the final character.
10898                  * Look backwards through the string until find a non-
10899                  * problematic character */
10900
10901                 if (! UTF) {
10902
10903                     /* These two have no multi-char folds to non-UTF characters
10904                      */
10905                     if (ASCII_FOLD_RESTRICTED || LOC) {
10906                         goto loopdone;
10907                     }
10908
10909                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10910                     len = s - s0 + 1;
10911                 }
10912                 else {
10913                     if (!  PL_NonL1NonFinalFold) {
10914                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10915                                         NonL1_Perl_Non_Final_Folds_invlist);
10916                     }
10917
10918                     /* Point to the first byte of the final character */
10919                     s = (char *) utf8_hop((U8 *) s, -1);
10920
10921                     while (s >= s0) {   /* Search backwards until find
10922                                            non-problematic char */
10923                         if (UTF8_IS_INVARIANT(*s)) {
10924
10925                             /* There are no ascii characters that participate
10926                              * in multi-char folds under /aa.  In EBCDIC, the
10927                              * non-ascii invariants are all control characters,
10928                              * so don't ever participate in any folds. */
10929                             if (ASCII_FOLD_RESTRICTED
10930                                 || ! IS_NON_FINAL_FOLD(*s))
10931                             {
10932                                 break;
10933                             }
10934                         }
10935                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10936
10937                             /* No Latin1 characters participate in multi-char
10938                              * folds under /l */
10939                             if (LOC
10940                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10941                                                                 *s, *(s+1))))
10942                             {
10943                                 break;
10944                             }
10945                         }
10946                         else if (! _invlist_contains_cp(
10947                                         PL_NonL1NonFinalFold,
10948                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10949                         {
10950                             break;
10951                         }
10952
10953                         /* Here, the current character is problematic in that
10954                          * it does occur in the non-final position of some
10955                          * fold, so try the character before it, but have to
10956                          * special case the very first byte in the string, so
10957                          * we don't read outside the string */
10958                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10959                     } /* End of loop backwards through the string */
10960
10961                     /* If there were only problematic characters in the string,
10962                      * <s> will point to before s0, in which case the length
10963                      * should be 0, otherwise include the length of the
10964                      * non-problematic character just found */
10965                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10966                 }
10967
10968                 /* Here, have found the final character, if any, that is
10969                  * non-problematic as far as ending the node without splitting
10970                  * it across a potential multi-char fold.  <len> contains the
10971                  * number of bytes in the node up-to and including that
10972                  * character, or is 0 if there is no such character, meaning
10973                  * the whole node contains only problematic characters.  In
10974                  * this case, give up and just take the node as-is.  We can't
10975                  * do any better */
10976                 if (len == 0) {
10977                     len = full_len;
10978                 } else {
10979
10980                     /* Here, the node does contain some characters that aren't
10981                      * problematic.  If one such is the final character in the
10982                      * node, we are done */
10983                     if (len == full_len) {
10984                         goto loopdone;
10985                     }
10986                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10987
10988                         /* If the final character is problematic, but the
10989                          * penultimate is not, back-off that last character to
10990                          * later start a new node with it */
10991                         p = oldp;
10992                         goto loopdone;
10993                     }
10994
10995                     /* Here, the final non-problematic character is earlier
10996                      * in the input than the penultimate character.  What we do
10997                      * is reparse from the beginning, going up only as far as
10998                      * this final ok one, thus guaranteeing that the node ends
10999                      * in an acceptable character.  The reason we reparse is
11000                      * that we know how far in the character is, but we don't
11001                      * know how to correlate its position with the input parse.
11002                      * An alternate implementation would be to build that
11003                      * correlation as we go along during the original parse,
11004                      * but that would entail extra work for every node, whereas
11005                      * this code gets executed only when the string is too
11006                      * large for the node, and the final two characters are
11007                      * problematic, an infrequent occurrence.  Yet another
11008                      * possible strategy would be to save the tail of the
11009                      * string, and the next time regatom is called, initialize
11010                      * with that.  The problem with this is that unless you
11011                      * back off one more character, you won't be guaranteed
11012                      * regatom will get called again, unless regbranch,
11013                      * regpiece ... are also changed.  If you do back off that
11014                      * extra character, so that there is input guaranteed to
11015                      * force calling regatom, you can't handle the case where
11016                      * just the first character in the node is acceptable.  I
11017                      * (khw) decided to try this method which doesn't have that
11018                      * pitfall; if performance issues are found, we can do a
11019                      * combination of the current approach plus that one */
11020                     upper_parse = len;
11021                     len = 0;
11022                     s = s0;
11023                     goto reparse;
11024                 }
11025             }   /* End of verifying node ends with an appropriate char */
11026
11027         loopdone:   /* Jumped to when encounters something that shouldn't be in
11028                        the node */
11029
11030             /* If 'maybe_exact' is still set here, means there are no
11031              * code points in the node that participate in folds */
11032             if (FOLD && maybe_exact) {
11033                 OP(ret) = EXACT;
11034             }
11035
11036             /* I (khw) don't know if you can get here with zero length, but the
11037              * old code handled this situation by creating a zero-length EXACT
11038              * node.  Might as well be NOTHING instead */
11039             if (len == 0) {
11040                 OP(ret) = NOTHING;
11041             }
11042             else{
11043                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11044             }
11045
11046             RExC_parse = p - 1;
11047             Set_Node_Cur_Length(ret); /* MJD */
11048             nextchar(pRExC_state);
11049             {
11050                 /* len is STRLEN which is unsigned, need to copy to signed */
11051                 IV iv = len;
11052                 if (iv < 0)
11053                     vFAIL("Internal disaster");
11054             }
11055
11056         } /* End of label 'defchar:' */
11057         break;
11058     } /* End of giant switch on input character */
11059
11060     return(ret);
11061 }
11062
11063 STATIC char *
11064 S_regwhite( RExC_state_t *pRExC_state, char *p )
11065 {
11066     const char *e = RExC_end;
11067
11068     PERL_ARGS_ASSERT_REGWHITE;
11069
11070     while (p < e) {
11071         if (isSPACE(*p))
11072             ++p;
11073         else if (*p == '#') {
11074             bool ended = 0;
11075             do {
11076                 if (*p++ == '\n') {
11077                     ended = 1;
11078                     break;
11079                 }
11080             } while (p < e);
11081             if (!ended)
11082                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11083         }
11084         else
11085             break;
11086     }
11087     return p;
11088 }
11089
11090 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11091    Character classes ([:foo:]) can also be negated ([:^foo:]).
11092    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11093    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11094    but trigger failures because they are currently unimplemented. */
11095
11096 #define POSIXCC_DONE(c)   ((c) == ':')
11097 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11098 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11099
11100 PERL_STATIC_INLINE I32
11101 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11102 {
11103     dVAR;
11104     I32 namedclass = OOB_NAMEDCLASS;
11105
11106     PERL_ARGS_ASSERT_REGPPOSIXCC;
11107
11108     if (value == '[' && RExC_parse + 1 < RExC_end &&
11109         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11110         POSIXCC(UCHARAT(RExC_parse))) {
11111         const char c = UCHARAT(RExC_parse);
11112         char* const s = RExC_parse++;
11113
11114         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11115             RExC_parse++;
11116         if (RExC_parse == RExC_end)
11117             /* Grandfather lone [:, [=, [. */
11118             RExC_parse = s;
11119         else {
11120             const char* const t = RExC_parse++; /* skip over the c */
11121             assert(*t == c);
11122
11123             if (UCHARAT(RExC_parse) == ']') {
11124                 const char *posixcc = s + 1;
11125                 RExC_parse++; /* skip over the ending ] */
11126
11127                 if (*s == ':') {
11128                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11129                     const I32 skip = t - posixcc;
11130
11131                     /* Initially switch on the length of the name.  */
11132                     switch (skip) {
11133                     case 4:
11134                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11135                             namedclass = ANYOF_WORDCHAR;
11136                         break;
11137                     case 5:
11138                         /* Names all of length 5.  */
11139                         /* alnum alpha ascii blank cntrl digit graph lower
11140                            print punct space upper  */
11141                         /* Offset 4 gives the best switch position.  */
11142                         switch (posixcc[4]) {
11143                         case 'a':
11144                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11145                                 namedclass = ANYOF_ALPHA;
11146                             break;
11147                         case 'e':
11148                             if (memEQ(posixcc, "spac", 4)) /* space */
11149                                 namedclass = ANYOF_PSXSPC;
11150                             break;
11151                         case 'h':
11152                             if (memEQ(posixcc, "grap", 4)) /* graph */
11153                                 namedclass = ANYOF_GRAPH;
11154                             break;
11155                         case 'i':
11156                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11157                                 namedclass = ANYOF_ASCII;
11158                             break;
11159                         case 'k':
11160                             if (memEQ(posixcc, "blan", 4)) /* blank */
11161                                 namedclass = ANYOF_BLANK;
11162                             break;
11163                         case 'l':
11164                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11165                                 namedclass = ANYOF_CNTRL;
11166                             break;
11167                         case 'm':
11168                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11169                                 namedclass = ANYOF_ALPHANUMERIC;
11170                             break;
11171                         case 'r':
11172                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11173                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11174                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11175                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11176                             break;
11177                         case 't':
11178                             if (memEQ(posixcc, "digi", 4)) /* digit */
11179                                 namedclass = ANYOF_DIGIT;
11180                             else if (memEQ(posixcc, "prin", 4)) /* print */
11181                                 namedclass = ANYOF_PRINT;
11182                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11183                                 namedclass = ANYOF_PUNCT;
11184                             break;
11185                         }
11186                         break;
11187                     case 6:
11188                         if (memEQ(posixcc, "xdigit", 6))
11189                             namedclass = ANYOF_XDIGIT;
11190                         break;
11191                     }
11192
11193                     if (namedclass == OOB_NAMEDCLASS)
11194                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11195                                       t - s - 1, s + 1);
11196
11197                     /* The #defines are structured so each complement is +1 to
11198                      * the normal one */
11199                     if (complement) {
11200                         namedclass++;
11201                     }
11202                     assert (posixcc[skip] == ':');
11203                     assert (posixcc[skip+1] == ']');
11204                 } else if (!SIZE_ONLY) {
11205                     /* [[=foo=]] and [[.foo.]] are still future. */
11206
11207                     /* adjust RExC_parse so the warning shows after
11208                        the class closes */
11209                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11210                         RExC_parse++;
11211                     SvREFCNT_dec(free_me);
11212                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11213                 }
11214             } else {
11215                 /* Maternal grandfather:
11216                  * "[:" ending in ":" but not in ":]" */
11217                 RExC_parse = s;
11218             }
11219         }
11220     }
11221
11222     return namedclass;
11223 }
11224
11225
11226 /* The names of properties whose definitions are not known at compile time are
11227  * stored in this SV, after a constant heading.  So if the length has been
11228  * changed since initialization, then there is a run-time definition. */
11229 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11230
11231 STATIC regnode *
11232 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11233 {
11234     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
11235      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11236      * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
11237      * multi-character folds: it will be rewritten following the paradigm of
11238      * this example, where the <multi-fold>s are characters which fold to
11239      * multiple character sequences:
11240      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11241      * gets effectively rewritten as:
11242      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11243      * reg() gets called (recursively) on the rewritten version, and this
11244      * function will return what it constructs.  (Actually the <multi-fold>s
11245      * aren't physically removed from the [abcdefghi], it's just that they are
11246      * ignored in the recursion by means of a flag:
11247      * <RExC_in_multi_char_class>.)
11248      *
11249      * ANYOF nodes contain a bit map for the first 256 characters, with the
11250      * corresponding bit set if that character is in the list.  For characters
11251      * above 255, a range list or swash is used.  There are extra bits for \w,
11252      * etc. in locale ANYOFs, as what these match is not determinable at
11253      * compile time */
11254
11255     dVAR;
11256     UV nextvalue;
11257     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11258     IV range = 0;
11259     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11260     regnode *ret;
11261     STRLEN numlen;
11262     IV namedclass = OOB_NAMEDCLASS;
11263     char *rangebegin = NULL;
11264     bool need_class = 0;
11265     SV *listsv = NULL;
11266     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11267                                       than just initialized.  */
11268     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11269     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11270                                extended beyond the Latin1 range */
11271     UV element_count = 0;   /* Number of distinct elements in the class.
11272                                Optimizations may be possible if this is tiny */
11273     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11274                                        character; used under /i */
11275     UV n;
11276
11277     /* Unicode properties are stored in a swash; this holds the current one
11278      * being parsed.  If this swash is the only above-latin1 component of the
11279      * character class, an optimization is to pass it directly on to the
11280      * execution engine.  Otherwise, it is set to NULL to indicate that there
11281      * are other things in the class that have to be dealt with at execution
11282      * time */
11283     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11284
11285     /* Set if a component of this character class is user-defined; just passed
11286      * on to the engine */
11287     bool has_user_defined_property = FALSE;
11288
11289     /* inversion list of code points this node matches only when the target
11290      * string is in UTF-8.  (Because is under /d) */
11291     SV* depends_list = NULL;
11292
11293     /* inversion list of code points this node matches.  For much of the
11294      * function, it includes only those that match regardless of the utf8ness
11295      * of the target string */
11296     SV* cp_list = NULL;
11297
11298 #ifdef EBCDIC
11299     /* In a range, counts how many 0-2 of the ends of it came from literals,
11300      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11301     UV literal_endpoint = 0;
11302 #endif
11303     bool invert = FALSE;    /* Is this class to be complemented */
11304
11305     /* Is there any thing like \W or [:^digit:] that matches above the legal
11306      * Unicode range? */
11307     bool runtime_posix_matches_above_Unicode = FALSE;
11308
11309     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11310         case we need to change the emitted regop to an EXACT. */
11311     const char * orig_parse = RExC_parse;
11312     const I32 orig_size = RExC_size;
11313     GET_RE_DEBUG_FLAGS_DECL;
11314
11315     PERL_ARGS_ASSERT_REGCLASS;
11316 #ifndef DEBUGGING
11317     PERL_UNUSED_ARG(depth);
11318 #endif
11319
11320     DEBUG_PARSE("clas");
11321
11322     /* Assume we are going to generate an ANYOF node. */
11323     ret = reganode(pRExC_state, ANYOF, 0);
11324
11325     if (!SIZE_ONLY) {
11326         ANYOF_FLAGS(ret) = 0;
11327     }
11328
11329     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11330         RExC_parse++;
11331         invert = TRUE;
11332         RExC_naughty++;
11333     }
11334
11335     if (SIZE_ONLY) {
11336         RExC_size += ANYOF_SKIP;
11337         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11338     }
11339     else {
11340         RExC_emit += ANYOF_SKIP;
11341         if (LOC) {
11342             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11343         }
11344         listsv = newSVpvs("# comment\n");
11345         initial_listsv_len = SvCUR(listsv);
11346     }
11347
11348     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11349
11350     if (!SIZE_ONLY && POSIXCC(nextvalue))
11351     {
11352         const char *s = RExC_parse;
11353         const char  c = *s++;
11354
11355         while (isWORDCHAR(*s))
11356             s++;
11357         if (*s && c == *s && s[1] == ']') {
11358             SAVEFREESV(RExC_rx_sv);
11359             SAVEFREESV(listsv);
11360             ckWARN3reg(s+2,
11361                        "POSIX syntax [%c %c] belongs inside character classes",
11362                        c, c);
11363             (void)ReREFCNT_inc(RExC_rx_sv);
11364             SvREFCNT_inc_simple_void_NN(listsv);
11365         }
11366     }
11367
11368     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11369     if (UCHARAT(RExC_parse) == ']')
11370         goto charclassloop;
11371
11372 parseit:
11373     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11374
11375     charclassloop:
11376
11377         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11378         save_value = value;
11379         save_prevvalue = prevvalue;
11380
11381         if (!range) {
11382             rangebegin = RExC_parse;
11383             element_count++;
11384         }
11385         if (UTF) {
11386             value = utf8n_to_uvchr((U8*)RExC_parse,
11387                                    RExC_end - RExC_parse,
11388                                    &numlen, UTF8_ALLOW_DEFAULT);
11389             RExC_parse += numlen;
11390         }
11391         else
11392             value = UCHARAT(RExC_parse++);
11393
11394         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11395         if (value == '[' && POSIXCC(nextvalue))
11396             namedclass = regpposixcc(pRExC_state, value, listsv);
11397         else if (value == '\\') {
11398             if (UTF) {
11399                 value = utf8n_to_uvchr((U8*)RExC_parse,
11400                                    RExC_end - RExC_parse,
11401                                    &numlen, UTF8_ALLOW_DEFAULT);
11402                 RExC_parse += numlen;
11403             }
11404             else
11405                 value = UCHARAT(RExC_parse++);
11406             /* Some compilers cannot handle switching on 64-bit integer
11407              * values, therefore value cannot be an UV.  Yes, this will
11408              * be a problem later if we want switch on Unicode.
11409              * A similar issue a little bit later when switching on
11410              * namedclass. --jhi */
11411             switch ((I32)value) {
11412             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11413             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11414             case 's':   namedclass = ANYOF_SPACE;       break;
11415             case 'S':   namedclass = ANYOF_NSPACE;      break;
11416             case 'd':   namedclass = ANYOF_DIGIT;       break;
11417             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11418             case 'v':   namedclass = ANYOF_VERTWS;      break;
11419             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11420             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11421             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11422             case 'N':  /* Handle \N{NAME} in class */
11423                 {
11424                     /* We only pay attention to the first char of
11425                     multichar strings being returned. I kinda wonder
11426                     if this makes sense as it does change the behaviour
11427                     from earlier versions, OTOH that behaviour was broken
11428                     as well. */
11429                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11430                                       TRUE /* => charclass */))
11431                     {
11432                         goto parseit;
11433                     }
11434                 }
11435                 break;
11436             case 'p':
11437             case 'P':
11438                 {
11439                 char *e;
11440
11441                 /* This routine will handle any undefined properties */
11442                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11443
11444                 if (RExC_parse >= RExC_end)
11445                     vFAIL2("Empty \\%c{}", (U8)value);
11446                 if (*RExC_parse == '{') {
11447                     const U8 c = (U8)value;
11448                     e = strchr(RExC_parse++, '}');
11449                     if (!e)
11450                         vFAIL2("Missing right brace on \\%c{}", c);
11451                     while (isSPACE(UCHARAT(RExC_parse)))
11452                         RExC_parse++;
11453                     if (e == RExC_parse)
11454                         vFAIL2("Empty \\%c{}", c);
11455                     n = e - RExC_parse;
11456                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11457                         n--;
11458                 }
11459                 else {
11460                     e = RExC_parse;
11461                     n = 1;
11462                 }
11463                 if (!SIZE_ONLY) {
11464                     SV* invlist;
11465                     char* name;
11466
11467                     if (UCHARAT(RExC_parse) == '^') {
11468                          RExC_parse++;
11469                          n--;
11470                          /* toggle.  (The rhs xor gets the single bit that
11471                           * differs between P and p; the other xor inverts just
11472                           * that bit) */
11473                          value ^= 'P' ^ 'p';
11474
11475                          while (isSPACE(UCHARAT(RExC_parse))) {
11476                               RExC_parse++;
11477                               n--;
11478                          }
11479                     }
11480                     /* Try to get the definition of the property into
11481                      * <invlist>.  If /i is in effect, the effective property
11482                      * will have its name be <__NAME_i>.  The design is
11483                      * discussed in commit
11484                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11485                     Newx(name, n + sizeof("_i__\n"), char);
11486
11487                     sprintf(name, "%s%.*s%s\n",
11488                                     (FOLD) ? "__" : "",
11489                                     (int)n,
11490                                     RExC_parse,
11491                                     (FOLD) ? "_i" : ""
11492                     );
11493
11494                     /* Look up the property name, and get its swash and
11495                      * inversion list, if the property is found  */
11496                     if (swash) {
11497                         SvREFCNT_dec_NN(swash);
11498                     }
11499                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11500                                              1, /* binary */
11501                                              0, /* not tr/// */
11502                                              NULL, /* No inversion list */
11503                                              &swash_init_flags
11504                                             );
11505                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11506                         if (swash) {
11507                             SvREFCNT_dec_NN(swash);
11508                             swash = NULL;
11509                         }
11510
11511                         /* Here didn't find it.  It could be a user-defined
11512                          * property that will be available at run-time.  Add it
11513                          * to the list to look up then */
11514                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11515                                         (value == 'p' ? '+' : '!'),
11516                                         name);
11517                         has_user_defined_property = TRUE;
11518
11519                         /* We don't know yet, so have to assume that the
11520                          * property could match something in the Latin1 range,
11521                          * hence something that isn't utf8.  Note that this
11522                          * would cause things in <depends_list> to match
11523                          * inappropriately, except that any \p{}, including
11524                          * this one forces Unicode semantics, which means there
11525                          * is <no depends_list> */
11526                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11527                     }
11528                     else {
11529
11530                         /* Here, did get the swash and its inversion list.  If
11531                          * the swash is from a user-defined property, then this
11532                          * whole character class should be regarded as such */
11533                         has_user_defined_property =
11534                                     (swash_init_flags
11535                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11536
11537                         /* Invert if asking for the complement */
11538                         if (value == 'P') {
11539                             _invlist_union_complement_2nd(properties,
11540                                                           invlist,
11541                                                           &properties);
11542
11543                             /* The swash can't be used as-is, because we've
11544                              * inverted things; delay removing it to here after
11545                              * have copied its invlist above */
11546                             SvREFCNT_dec_NN(swash);
11547                             swash = NULL;
11548                         }
11549                         else {
11550                             _invlist_union(properties, invlist, &properties);
11551                         }
11552                     }
11553                     Safefree(name);
11554                 }
11555                 RExC_parse = e + 1;
11556                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's named */
11557
11558                 /* \p means they want Unicode semantics */
11559                 RExC_uni_semantics = 1;
11560                 }
11561                 break;
11562             case 'n':   value = '\n';                   break;
11563             case 'r':   value = '\r';                   break;
11564             case 't':   value = '\t';                   break;
11565             case 'f':   value = '\f';                   break;
11566             case 'b':   value = '\b';                   break;
11567             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11568             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11569             case 'o':
11570                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11571                 {
11572                     const char* error_msg;
11573                     bool valid = grok_bslash_o(RExC_parse,
11574                                                &value,
11575                                                &numlen,
11576                                                &error_msg,
11577                                                SIZE_ONLY);
11578                     RExC_parse += numlen;
11579                     if (! valid) {
11580                         vFAIL(error_msg);
11581                     }
11582                 }
11583                 if (PL_encoding && value < 0x100) {
11584                     goto recode_encoding;
11585                 }
11586                 break;
11587             case 'x':
11588                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11589                 {
11590                     const char* error_msg;
11591                     bool valid = grok_bslash_x(RExC_parse,
11592                                                &value,
11593                                                &numlen,
11594                                                &error_msg,
11595                                                1);
11596                     RExC_parse += numlen;
11597                     if (! valid) {
11598                         vFAIL(error_msg);
11599                     }
11600                 }
11601                 if (PL_encoding && value < 0x100)
11602                     goto recode_encoding;
11603                 break;
11604             case 'c':
11605                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11606                 break;
11607             case '0': case '1': case '2': case '3': case '4':
11608             case '5': case '6': case '7':
11609                 {
11610                     /* Take 1-3 octal digits */
11611                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11612                     numlen = 3;
11613                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11614                     RExC_parse += numlen;
11615                     if (PL_encoding && value < 0x100)
11616                         goto recode_encoding;
11617                     break;
11618                 }
11619             recode_encoding:
11620                 if (! RExC_override_recoding) {
11621                     SV* enc = PL_encoding;
11622                     value = reg_recode((const char)(U8)value, &enc);
11623                     if (!enc && SIZE_ONLY)
11624                         ckWARNreg(RExC_parse,
11625                                   "Invalid escape in the specified encoding");
11626                     break;
11627                 }
11628             default:
11629                 /* Allow \_ to not give an error */
11630                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
11631                     SAVEFREESV(RExC_rx_sv);
11632                     SAVEFREESV(listsv);
11633                     ckWARN2reg(RExC_parse,
11634                                "Unrecognized escape \\%c in character class passed through",
11635                                (int)value);
11636                     (void)ReREFCNT_inc(RExC_rx_sv);
11637                     SvREFCNT_inc_simple_void_NN(listsv);
11638                 }
11639                 break;
11640             }
11641         } /* end of \blah */
11642 #ifdef EBCDIC
11643         else
11644             literal_endpoint++;
11645 #endif
11646
11647             /* What matches in a locale is not known until runtime.  This
11648              * includes what the Posix classes (like \w, [:space:]) match.
11649              * Room must be reserved (one time per class) to store such
11650              * classes, either if Perl is compiled so that locale nodes always
11651              * should have this space, or if there is such class info to be
11652              * stored.  The space will contain a bit for each named class that
11653              * is to be matched against.  This isn't needed for \p{} and
11654              * pseudo-classes, as they are not affected by locale, and hence
11655              * are dealt with separately */
11656             if (LOC
11657                 && ! need_class
11658                 && (ANYOF_LOCALE == ANYOF_CLASS
11659                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11660             {
11661                 need_class = 1;
11662                 if (SIZE_ONLY) {
11663                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11664                 }
11665                 else {
11666                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11667                     ANYOF_CLASS_ZERO(ret);
11668                 }
11669                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11670             }
11671
11672         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11673
11674             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11675              * literal, as is the character that began the false range, i.e.
11676              * the 'a' in the examples */
11677             if (range) {
11678                 if (!SIZE_ONLY) {
11679                     const int w =
11680                         RExC_parse >= rangebegin ?
11681                         RExC_parse - rangebegin : 0;
11682                     SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11683                     SAVEFREESV(listsv);
11684                     ckWARN4reg(RExC_parse,
11685                                "False [] range \"%*.*s\"",
11686                                w, w, rangebegin);
11687                     (void)ReREFCNT_inc(RExC_rx_sv);
11688                     SvREFCNT_inc_simple_void_NN(listsv);
11689                     cp_list = add_cp_to_invlist(cp_list, '-');
11690                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11691                 }
11692
11693                 range = 0; /* this was not a true range */
11694                 element_count += 2; /* So counts for three values */
11695             }
11696
11697             if (! SIZE_ONLY) {
11698                 U8 classnum = namedclass_to_classnum(namedclass);
11699                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
11700                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
11701
11702                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
11703                          * /l make a difference in what these match.  There
11704                          * would be problems if these characters had folds
11705                          * other than themselves, as cp_list is subject to
11706                          * folding. */
11707                         if (classnum != _CC_VERTSPACE) {
11708                             assert(   namedclass == ANYOF_HORIZWS
11709                                    || namedclass == ANYOF_NHORIZWS);
11710
11711                             /* It turns out that \h is just a synonym for
11712                              * XPosixBlank */
11713                             classnum = _CC_BLANK;
11714                         }
11715
11716                         _invlist_union_maybe_complement_2nd(
11717                                 cp_list,
11718                                 PL_XPosix_ptrs[classnum],
11719                                 namedclass % 2,  /* Complement if odd
11720                                                     (NHORIZWS, NVERTWS) */
11721                                 &cp_list);
11722                     }
11723                 }
11724                 else if (classnum == _CC_ASCII) {
11725 #ifdef HAS_ISASCII
11726                     if (LOC) {
11727                         ANYOF_CLASS_SET(ret, namedclass);
11728                     }
11729                     else
11730 #endif  /* Not isascii(); just use the hard-coded definition for it */
11731                         _invlist_union_maybe_complement_2nd(
11732                                 posixes,
11733                                 PL_ASCII,
11734                                 namedclass % 2, /* Complement if odd (NASCII) */
11735                                 &posixes);
11736                 }
11737                 else {  /* Garden variety class */
11738
11739                     /* The ascii range inversion list */
11740                     SV* ascii_source = PL_Posix_ptrs[classnum];
11741
11742                     /* The full Latin1 range inversion list */
11743                     SV* l1_source = PL_L1Posix_ptrs[classnum];
11744
11745                     /* This code is structured into two major clauses.  The
11746                      * first is for classes whose complete definitions may not
11747                      * already be known.  It not, the Latin1 definition
11748                      * (guaranteed to already known) is used plus code is
11749                      * generated to load the rest at run-time (only if needed).
11750                      * If the complete definition is known, it drops down to
11751                      * the second clause, where the complete definition is
11752                      * known */
11753
11754                     if (classnum < _FIRST_NON_SWASH_CC) {
11755
11756                         /* Here, the class has a swash, which may or not
11757                          * already be loaded */
11758
11759                         /* The name of the property to use to match the full
11760                          * eXtended Unicode range swash for this character
11761                          * class */
11762                         const char *Xname = swash_property_names[classnum];
11763
11764                         if ( !  PL_utf8_swash_ptrs[classnum]) {
11765                             if (namedclass % 2 == 0) { /* A non-complemented
11766                                                           class */
11767                                 /* If not /a matching, there are code points we
11768                                  * don't know at compile time.  Arrange for the
11769                                  * unknown matches to be loaded at run-time, if
11770                                  * needed */
11771                                 if (! AT_LEAST_ASCII_RESTRICTED) {
11772                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
11773                                                                  Xname);
11774                                 }
11775                                 if (LOC) {  /* Under locale, set run-time
11776                                                lookup */
11777                                     ANYOF_CLASS_SET(ret, namedclass);
11778                                 }
11779                                 else {
11780                                     /* Add the current class's code points to
11781                                      * the running total */
11782                                     _invlist_union(posixes,
11783                                                    (AT_LEAST_ASCII_RESTRICTED)
11784                                                         ? ascii_source
11785                                                         : l1_source,
11786                                                    &posixes);
11787                                 }
11788                             }
11789                             else {  /* A complemented class */
11790                                 if (AT_LEAST_ASCII_RESTRICTED) {
11791                                     /* Under /a should match everything above
11792                                      * ASCII, plus the complement of the set's
11793                                      * ASCII matches */
11794                                     _invlist_union_complement_2nd(posixes,
11795                                                                   ascii_source,
11796                                                                   &posixes);
11797                                 }
11798                                 else {
11799                                     /* Arrange for the unknown matches to be
11800                                      * loaded at run-time, if needed */
11801                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
11802                                                                  Xname);
11803                                     runtime_posix_matches_above_Unicode = TRUE;
11804                                     if (LOC) {
11805                                         ANYOF_CLASS_SET(ret, namedclass);
11806                                     }
11807                                     else {
11808
11809                                         /* We want to match everything in
11810                                          * Latin1, except those things that
11811                                          * l1_source matches */
11812                                         SV* scratch_list = NULL;
11813                                         _invlist_subtract(PL_Latin1, l1_source,
11814                                                           &scratch_list);
11815
11816                                         /* Add the list from this class to the
11817                                          * running total */
11818                                         if (! posixes) {
11819                                             posixes = scratch_list;
11820                                         }
11821                                         else {
11822                                             _invlist_union(posixes,
11823                                                            scratch_list,
11824                                                            &posixes);
11825                                             SvREFCNT_dec_NN(scratch_list);
11826                                         }
11827                                         if (DEPENDS_SEMANTICS) {
11828                                             ANYOF_FLAGS(ret)
11829                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
11830                                         }
11831                                     }
11832                                 }
11833                             }
11834                             goto namedclass_done;
11835                         }
11836
11837                         /* Here, there is a swash loaded for the class.  If no
11838                          * inversion list for it yet, get it */
11839                         if (! PL_XPosix_ptrs[classnum]) {
11840                             PL_XPosix_ptrs[classnum]
11841                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
11842                         }
11843                     }
11844
11845                     /* Here there is an inversion list already loaded for the
11846                      * entire class */
11847
11848                     if (namedclass % 2 == 0) {  /* A non-complemented class,
11849                                                    like ANYOF_PUNCT */
11850                         if (! LOC) {
11851                             /* For non-locale, just add it to any existing list
11852                              * */
11853                             _invlist_union(posixes,
11854                                            (AT_LEAST_ASCII_RESTRICTED)
11855                                                ? ascii_source
11856                                                : PL_XPosix_ptrs[classnum],
11857                                            &posixes);
11858                         }
11859                         else {  /* Locale */
11860                             SV* scratch_list = NULL;
11861
11862                             /* For above Latin1 code points, we use the full
11863                              * Unicode range */
11864                             _invlist_intersection(PL_AboveLatin1,
11865                                                   PL_XPosix_ptrs[classnum],
11866                                                   &scratch_list);
11867                             /* And set the output to it, adding instead if
11868                              * there already is an output.  Checking if
11869                              * 'posixes' is NULL first saves an extra clone.
11870                              * Its reference count will be decremented at the
11871                              * next union, etc, or if this is the only
11872                              * instance, at the end of the routine */
11873                             if (! posixes) {
11874                                 posixes = scratch_list;
11875                             }
11876                             else {
11877                                 _invlist_union(posixes, scratch_list, &posixes);
11878                                 SvREFCNT_dec_NN(scratch_list);
11879                             }
11880
11881 #ifndef HAS_ISBLANK
11882                             if (namedclass != ANYOF_BLANK) {
11883 #endif
11884                                 /* Set this class in the node for runtime
11885                                  * matching */
11886                                 ANYOF_CLASS_SET(ret, namedclass);
11887 #ifndef HAS_ISBLANK
11888                             }
11889                             else {
11890                                 /* No isblank(), use the hard-coded ASCII-range
11891                                  * blanks, adding them to the running total. */
11892
11893                                 _invlist_union(posixes, ascii_source, &posixes);
11894                             }
11895 #endif
11896                         }
11897                     }
11898                     else {  /* A complemented class, like ANYOF_NPUNCT */
11899                         if (! LOC) {
11900                             _invlist_union_complement_2nd(
11901                                                 posixes,
11902                                                 (AT_LEAST_ASCII_RESTRICTED)
11903                                                     ? ascii_source
11904                                                     : PL_XPosix_ptrs[classnum],
11905                                                 &posixes);
11906                             /* Under /d, everything in the upper half of the
11907                              * Latin1 range matches this complement */
11908                             if (DEPENDS_SEMANTICS) {
11909                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11910                             }
11911                         }
11912                         else {  /* Locale */
11913                             SV* scratch_list = NULL;
11914                             _invlist_subtract(PL_AboveLatin1,
11915                                               PL_XPosix_ptrs[classnum],
11916                                               &scratch_list);
11917                             if (! posixes) {
11918                                 posixes = scratch_list;
11919                             }
11920                             else {
11921                                 _invlist_union(posixes, scratch_list, &posixes);
11922                                 SvREFCNT_dec_NN(scratch_list);
11923                             }
11924 #ifndef HAS_ISBLANK
11925                             if (namedclass != ANYOF_NBLANK) {
11926 #endif
11927                                 ANYOF_CLASS_SET(ret, namedclass);
11928 #ifndef HAS_ISBLANK
11929                             }
11930                             else {
11931                                 /* Get the list of all code points in Latin1
11932                                  * that are not ASCII blanks, and add them to
11933                                  * the running total */
11934                                 _invlist_subtract(PL_Latin1, ascii_source,
11935                                                   &scratch_list);
11936                                 _invlist_union(posixes, scratch_list, &posixes);
11937                                 SvREFCNT_dec_NN(scratch_list);
11938                             }
11939 #endif
11940                         }
11941                     }
11942                 }
11943               namedclass_done:
11944                 continue;   /* Go get next character */
11945             }
11946         } /* end of namedclass \blah */
11947
11948         if (range) {
11949             if (prevvalue > value) /* b-a */ {
11950                 const int w = RExC_parse - rangebegin;
11951                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11952                 range = 0; /* not a valid range */
11953             }
11954         }
11955         else {
11956             prevvalue = value; /* save the beginning of the potential range */
11957             if (RExC_parse+1 < RExC_end
11958                 && *RExC_parse == '-'
11959                 && RExC_parse[1] != ']')
11960             {
11961                 RExC_parse++;
11962
11963                 /* a bad range like \w-, [:word:]- ? */
11964                 if (namedclass > OOB_NAMEDCLASS) {
11965                     if (ckWARN(WARN_REGEXP)) {
11966                         const int w =
11967                             RExC_parse >= rangebegin ?
11968                             RExC_parse - rangebegin : 0;
11969                         vWARN4(RExC_parse,
11970                                "False [] range \"%*.*s\"",
11971                                w, w, rangebegin);
11972                     }
11973                     if (!SIZE_ONLY) {
11974                         cp_list = add_cp_to_invlist(cp_list, '-');
11975                     }
11976                     element_count++;
11977                 } else
11978                     range = 1;  /* yeah, it's a range! */
11979                 continue;       /* but do it the next time */
11980             }
11981         }
11982
11983         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
11984          * if not */
11985
11986         /* non-Latin1 code point implies unicode semantics.  Must be set in
11987          * pass1 so is there for the whole of pass 2 */
11988         if (value > 255) {
11989             RExC_uni_semantics = 1;
11990         }
11991
11992         /* Ready to process either the single value, or the completed range.
11993          * For single-valued non-inverted ranges, we consider the possibility
11994          * of multi-char folds.  (We made a conscious decision to not do this
11995          * for the other cases because it can often lead to non-intuitive
11996          * results.  For example, you have the peculiar case that:
11997          *  "s s" =~ /^[^\xDF]+$/i => Y
11998          *  "ss"  =~ /^[^\xDF]+$/i => N
11999          *
12000          * See [perl #89750] */
12001         if (FOLD && ! invert && value == prevvalue) {
12002             if (value == LATIN_SMALL_LETTER_SHARP_S
12003                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12004                                                         value)))
12005             {
12006                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12007
12008                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12009                 STRLEN foldlen;
12010
12011                 UV folded = _to_uni_fold_flags(
12012                                 value,
12013                                 foldbuf,
12014                                 &foldlen,
12015                                 FOLD_FLAGS_FULL
12016                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12017                                             : (ASCII_FOLD_RESTRICTED)
12018                                               ? FOLD_FLAGS_NOMIX_ASCII
12019                                               : 0)
12020                                 );
12021
12022                 /* Here, <folded> should be the first character of the
12023                  * multi-char fold of <value>, with <foldbuf> containing the
12024                  * whole thing.  But, if this fold is not allowed (because of
12025                  * the flags), <fold> will be the same as <value>, and should
12026                  * be processed like any other character, so skip the special
12027                  * handling */
12028                 if (folded != value) {
12029
12030                     /* Skip if we are recursed, currently parsing the class
12031                      * again.  Otherwise add this character to the list of
12032                      * multi-char folds. */
12033                     if (! RExC_in_multi_char_class) {
12034                         AV** this_array_ptr;
12035                         AV* this_array;
12036                         STRLEN cp_count = utf8_length(foldbuf,
12037                                                       foldbuf + foldlen);
12038                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12039
12040                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12041
12042
12043                         if (! multi_char_matches) {
12044                             multi_char_matches = newAV();
12045                         }
12046
12047                         /* <multi_char_matches> is actually an array of arrays.
12048                          * There will be one or two top-level elements: [2],
12049                          * and/or [3].  The [2] element is an array, each
12050                          * element thereof is a character which folds to two
12051                          * characters; likewise for [3].  (Unicode guarantees a
12052                          * maximum of 3 characters in any fold.)  When we
12053                          * rewrite the character class below, we will do so
12054                          * such that the longest folds are written first, so
12055                          * that it prefers the longest matching strings first.
12056                          * This is done even if it turns out that any
12057                          * quantifier is non-greedy, out of programmer
12058                          * laziness.  Tom Christiansen has agreed that this is
12059                          * ok.  This makes the test for the ligature 'ffi' come
12060                          * before the test for 'ff' */
12061                         if (av_exists(multi_char_matches, cp_count)) {
12062                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12063                                                              cp_count, FALSE);
12064                             this_array = *this_array_ptr;
12065                         }
12066                         else {
12067                             this_array = newAV();
12068                             av_store(multi_char_matches, cp_count,
12069                                      (SV*) this_array);
12070                         }
12071                         av_push(this_array, multi_fold);
12072                     }
12073
12074                     /* This element should not be processed further in this
12075                      * class */
12076                     element_count--;
12077                     value = save_value;
12078                     prevvalue = save_prevvalue;
12079                     continue;
12080                 }
12081             }
12082         }
12083
12084         /* Deal with this element of the class */
12085         if (! SIZE_ONLY) {
12086 #ifndef EBCDIC
12087             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12088 #else
12089             UV* this_range = _new_invlist(1);
12090             _append_range_to_invlist(this_range, prevvalue, value);
12091
12092             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12093              * If this range was specified using something like 'i-j', we want
12094              * to include only the 'i' and the 'j', and not anything in
12095              * between, so exclude non-ASCII, non-alphabetics from it.
12096              * However, if the range was specified with something like
12097              * [\x89-\x91] or [\x89-j], all code points within it should be
12098              * included.  literal_endpoint==2 means both ends of the range used
12099              * a literal character, not \x{foo} */
12100             if (literal_endpoint == 2
12101                 && (prevvalue >= 'a' && value <= 'z')
12102                     || (prevvalue >= 'A' && value <= 'Z'))
12103             {
12104                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12105                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12106             }
12107             _invlist_union(cp_list, this_range, &cp_list);
12108             literal_endpoint = 0;
12109 #endif
12110         }
12111
12112         range = 0; /* this range (if it was one) is done now */
12113     } /* End of loop through all the text within the brackets */
12114
12115     /* If anything in the class expands to more than one character, we have to
12116      * deal with them by building up a substitute parse string, and recursively
12117      * calling reg() on it, instead of proceeding */
12118     if (multi_char_matches) {
12119         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12120         I32 cp_count;
12121         STRLEN len;
12122         char *save_end = RExC_end;
12123         char *save_parse = RExC_parse;
12124         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12125                                        a "|" */
12126         I32 reg_flags;
12127
12128         assert(! invert);
12129 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12130            because too confusing */
12131         if (invert) {
12132             sv_catpv(substitute_parse, "(?:");
12133         }
12134 #endif
12135
12136         /* Look at the longest folds first */
12137         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12138
12139             if (av_exists(multi_char_matches, cp_count)) {
12140                 AV** this_array_ptr;
12141                 SV* this_sequence;
12142
12143                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12144                                                  cp_count, FALSE);
12145                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12146                                                                 &PL_sv_undef)
12147                 {
12148                     if (! first_time) {
12149                         sv_catpv(substitute_parse, "|");
12150                     }
12151                     first_time = FALSE;
12152
12153                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12154                 }
12155             }
12156         }
12157
12158         /* If the character class contains anything else besides these
12159          * multi-character folds, have to include it in recursive parsing */
12160         if (element_count) {
12161             sv_catpv(substitute_parse, "|[");
12162             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12163             sv_catpv(substitute_parse, "]");
12164         }
12165
12166         sv_catpv(substitute_parse, ")");
12167 #if 0
12168         if (invert) {
12169             /* This is a way to get the parse to skip forward a whole named
12170              * sequence instead of matching the 2nd character when it fails the
12171              * first */
12172             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12173         }
12174 #endif
12175
12176         RExC_parse = SvPV(substitute_parse, len);
12177         RExC_end = RExC_parse + len;
12178         RExC_in_multi_char_class = 1;
12179         RExC_emit = (regnode *)orig_emit;
12180
12181         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12182
12183         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12184
12185         RExC_parse = save_parse;
12186         RExC_end = save_end;
12187         RExC_in_multi_char_class = 0;
12188         SvREFCNT_dec_NN(multi_char_matches);
12189         SvREFCNT_dec_NN(listsv);
12190         return ret;
12191     }
12192
12193     /* If the character class contains only a single element, it may be
12194      * optimizable into another node type which is smaller and runs faster.
12195      * Check if this is the case for this class */
12196     if (element_count == 1) {
12197         U8 op = END;
12198         U8 arg = 0;
12199
12200         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12201                                               [:digit:] or \p{foo} */
12202
12203             /* All named classes are mapped into POSIXish nodes, with its FLAG
12204              * argument giving which class it is */
12205             switch ((I32)namedclass) {
12206                 case ANYOF_UNIPROP:
12207                     break;
12208
12209                 /* These don't depend on the charset modifiers.  They always
12210                  * match under /u rules */
12211                 case ANYOF_NHORIZWS:
12212                 case ANYOF_HORIZWS:
12213                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12214                     /* FALLTHROUGH */
12215
12216                 case ANYOF_NVERTWS:
12217                 case ANYOF_VERTWS:
12218                     op = POSIXU;
12219                     goto join_posix;
12220
12221                 /* The actual POSIXish node for all the rest depends on the
12222                  * charset modifier.  The ones in the first set depend only on
12223                  * ASCII or, if available on this platform, locale */
12224                 case ANYOF_ASCII:
12225                 case ANYOF_NASCII:
12226 #ifdef HAS_ISASCII
12227                     op = (LOC) ? POSIXL : POSIXA;
12228 #else
12229                     op = POSIXA;
12230 #endif
12231                     goto join_posix;
12232
12233                 case ANYOF_NCASED:
12234                 case ANYOF_LOWER:
12235                 case ANYOF_NLOWER:
12236                 case ANYOF_UPPER:
12237                 case ANYOF_NUPPER:
12238                     /* under /a could be alpha */
12239                     if (FOLD) {
12240                         if (ASCII_RESTRICTED) {
12241                             namedclass = ANYOF_ALPHA + (namedclass % 2);
12242                         }
12243                         else if (! LOC) {
12244                             break;
12245                         }
12246                     }
12247                     /* FALLTHROUGH */
12248
12249                 /* The rest have more possibilities depending on the charset.  We
12250                  * take advantage of the enum ordering of the charset modifiers to
12251                  * get the exact node type, */
12252                 default:
12253                     op = POSIXD + get_regex_charset(RExC_flags);
12254                     if (op > POSIXA) { /* /aa is same as /a */
12255                         op = POSIXA;
12256                     }
12257 #ifndef HAS_ISBLANK
12258                     if (op == POSIXL
12259                         && (namedclass == ANYOF_BLANK
12260                             || namedclass == ANYOF_NBLANK))
12261                     {
12262                         op = POSIXA;
12263                     }
12264 #endif
12265
12266                 join_posix:
12267                     /* The odd numbered ones are the complements of the
12268                      * next-lower even number one */
12269                     if (namedclass % 2 == 1) {
12270                         invert = ! invert;
12271                         namedclass--;
12272                     }
12273                     arg = namedclass_to_classnum(namedclass);
12274                     break;
12275             }
12276         }
12277         else if (value == prevvalue) {
12278
12279             /* Here, the class consists of just a single code point */
12280
12281             if (invert) {
12282                 if (! LOC && value == '\n') {
12283                     op = REG_ANY; /* Optimize [^\n] */
12284                     *flagp |= HASWIDTH|SIMPLE;
12285                     RExC_naughty++;
12286                 }
12287             }
12288             else if (value < 256 || UTF) {
12289
12290                 /* Optimize a single value into an EXACTish node, but not if it
12291                  * would require converting the pattern to UTF-8. */
12292                 op = compute_EXACTish(pRExC_state);
12293             }
12294         } /* Otherwise is a range */
12295         else if (! LOC) {   /* locale could vary these */
12296             if (prevvalue == '0') {
12297                 if (value == '9') {
12298                     arg = _CC_DIGIT;
12299                     op = POSIXA;
12300                 }
12301             }
12302         }
12303
12304         /* Here, we have changed <op> away from its initial value iff we found
12305          * an optimization */
12306         if (op != END) {
12307
12308             /* Throw away this ANYOF regnode, and emit the calculated one,
12309              * which should correspond to the beginning, not current, state of
12310              * the parse */
12311             const char * cur_parse = RExC_parse;
12312             RExC_parse = (char *)orig_parse;
12313             if ( SIZE_ONLY) {
12314                 if (! LOC) {
12315
12316                     /* To get locale nodes to not use the full ANYOF size would
12317                      * require moving the code above that writes the portions
12318                      * of it that aren't in other nodes to after this point.
12319                      * e.g.  ANYOF_CLASS_SET */
12320                     RExC_size = orig_size;
12321                 }
12322             }
12323             else {
12324                 RExC_emit = (regnode *)orig_emit;
12325                 if (PL_regkind[op] == POSIXD) {
12326                     if (invert) {
12327                         op += NPOSIXD - POSIXD;
12328                     }
12329                 }
12330             }
12331
12332             ret = reg_node(pRExC_state, op);
12333
12334             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12335                 if (! SIZE_ONLY) {
12336                     FLAGS(ret) = arg;
12337                 }
12338                 *flagp |= HASWIDTH|SIMPLE;
12339             }
12340             else if (PL_regkind[op] == EXACT) {
12341                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12342             }
12343
12344             RExC_parse = (char *) cur_parse;
12345
12346             SvREFCNT_dec(posixes);
12347             SvREFCNT_dec_NN(listsv);
12348             SvREFCNT_dec(cp_list);
12349             return ret;
12350         }
12351     }
12352
12353     if (SIZE_ONLY)
12354         return ret;
12355     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12356
12357     /* If folding, we calculate all characters that could fold to or from the
12358      * ones already on the list */
12359     if (FOLD && cp_list) {
12360         UV start, end;  /* End points of code point ranges */
12361
12362         SV* fold_intersection = NULL;
12363
12364         /* If the highest code point is within Latin1, we can use the
12365          * compiled-in Alphas list, and not have to go out to disk.  This
12366          * yields two false positives, the masculine and feminine ordinal
12367          * indicators, which are weeded out below using the
12368          * IS_IN_SOME_FOLD_L1() macro */
12369         if (invlist_highest(cp_list) < 256) {
12370             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, &fold_intersection);
12371         }
12372         else {
12373
12374             /* Here, there are non-Latin1 code points, so we will have to go
12375              * fetch the list of all the characters that participate in folds
12376              */
12377             if (! PL_utf8_foldable) {
12378                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12379                                        &PL_sv_undef, 1, 0);
12380                 PL_utf8_foldable = _get_swash_invlist(swash);
12381                 SvREFCNT_dec_NN(swash);
12382             }
12383
12384             /* This is a hash that for a particular fold gives all characters
12385              * that are involved in it */
12386             if (! PL_utf8_foldclosures) {
12387
12388                 /* If we were unable to find any folds, then we likely won't be
12389                  * able to find the closures.  So just create an empty list.
12390                  * Folding will effectively be restricted to the non-Unicode
12391                  * rules hard-coded into Perl.  (This case happens legitimately
12392                  * during compilation of Perl itself before the Unicode tables
12393                  * are generated) */
12394                 if (_invlist_len(PL_utf8_foldable) == 0) {
12395                     PL_utf8_foldclosures = newHV();
12396                 }
12397                 else {
12398                     /* If the folds haven't been read in, call a fold function
12399                      * to force that */
12400                     if (! PL_utf8_tofold) {
12401                         U8 dummy[UTF8_MAXBYTES+1];
12402
12403                         /* This string is just a short named one above \xff */
12404                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12405                         assert(PL_utf8_tofold); /* Verify that worked */
12406                     }
12407                     PL_utf8_foldclosures =
12408                                     _swash_inversion_hash(PL_utf8_tofold);
12409                 }
12410             }
12411
12412             /* Only the characters in this class that participate in folds need
12413              * be checked.  Get the intersection of this class and all the
12414              * possible characters that are foldable.  This can quickly narrow
12415              * down a large class */
12416             _invlist_intersection(PL_utf8_foldable, cp_list,
12417                                   &fold_intersection);
12418         }
12419
12420         /* Now look at the foldable characters in this class individually */
12421         invlist_iterinit(fold_intersection);
12422         while (invlist_iternext(fold_intersection, &start, &end)) {
12423             UV j;
12424
12425             /* Locale folding for Latin1 characters is deferred until runtime */
12426             if (LOC && start < 256) {
12427                 start = 256;
12428             }
12429
12430             /* Look at every character in the range */
12431             for (j = start; j <= end; j++) {
12432
12433                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12434                 STRLEN foldlen;
12435                 SV** listp;
12436
12437                 if (j < 256) {
12438
12439                     /* We have the latin1 folding rules hard-coded here so that
12440                      * an innocent-looking character class, like /[ks]/i won't
12441                      * have to go out to disk to find the possible matches.
12442                      * XXX It would be better to generate these via regen, in
12443                      * case a new version of the Unicode standard adds new
12444                      * mappings, though that is not really likely, and may be
12445                      * caught by the default: case of the switch below. */
12446
12447                     if (IS_IN_SOME_FOLD_L1(j)) {
12448
12449                         /* ASCII is always matched; non-ASCII is matched only
12450                          * under Unicode rules */
12451                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12452                             cp_list =
12453                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12454                         }
12455                         else {
12456                             depends_list =
12457                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12458                         }
12459                     }
12460
12461                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12462                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12463                     {
12464                         /* Certain Latin1 characters have matches outside
12465                          * Latin1.  To get here, <j> is one of those
12466                          * characters.   None of these matches is valid for
12467                          * ASCII characters under /aa, which is why the 'if'
12468                          * just above excludes those.  These matches only
12469                          * happen when the target string is utf8.  The code
12470                          * below adds the single fold closures for <j> to the
12471                          * inversion list. */
12472                         switch (j) {
12473                             case 'k':
12474                             case 'K':
12475                                 cp_list =
12476                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12477                                 break;
12478                             case 's':
12479                             case 'S':
12480                                 cp_list = add_cp_to_invlist(cp_list,
12481                                                     LATIN_SMALL_LETTER_LONG_S);
12482                                 break;
12483                             case MICRO_SIGN:
12484                                 cp_list = add_cp_to_invlist(cp_list,
12485                                                     GREEK_CAPITAL_LETTER_MU);
12486                                 cp_list = add_cp_to_invlist(cp_list,
12487                                                     GREEK_SMALL_LETTER_MU);
12488                                 break;
12489                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12490                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12491                                 cp_list =
12492                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12493                                 break;
12494                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12495                                 cp_list = add_cp_to_invlist(cp_list,
12496                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12497                                 break;
12498                             case LATIN_SMALL_LETTER_SHARP_S:
12499                                 cp_list = add_cp_to_invlist(cp_list,
12500                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12501                                 break;
12502                             case 'F': case 'f':
12503                             case 'I': case 'i':
12504                             case 'L': case 'l':
12505                             case 'T': case 't':
12506                             case 'A': case 'a':
12507                             case 'H': case 'h':
12508                             case 'J': case 'j':
12509                             case 'N': case 'n':
12510                             case 'W': case 'w':
12511                             case 'Y': case 'y':
12512                                 /* These all are targets of multi-character
12513                                  * folds from code points that require UTF8 to
12514                                  * express, so they can't match unless the
12515                                  * target string is in UTF-8, so no action here
12516                                  * is necessary, as regexec.c properly handles
12517                                  * the general case for UTF-8 matching and
12518                                  * multi-char folds */
12519                                 break;
12520                             default:
12521                                 /* Use deprecated warning to increase the
12522                                  * chances of this being output */
12523                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12524                                 break;
12525                         }
12526                     }
12527                     continue;
12528                 }
12529
12530                 /* Here is an above Latin1 character.  We don't have the rules
12531                  * hard-coded for it.  First, get its fold.  This is the simple
12532                  * fold, as the multi-character folds have been handled earlier
12533                  * and separated out */
12534                 _to_uni_fold_flags(j, foldbuf, &foldlen,
12535                                                ((LOC)
12536                                                ? FOLD_FLAGS_LOCALE
12537                                                : (ASCII_FOLD_RESTRICTED)
12538                                                   ? FOLD_FLAGS_NOMIX_ASCII
12539                                                   : 0));
12540
12541                 /* Single character fold of above Latin1.  Add everything in
12542                  * its fold closure to the list that this node should match.
12543                  * The fold closures data structure is a hash with the keys
12544                  * being the UTF-8 of every character that is folded to, like
12545                  * 'k', and the values each an array of all code points that
12546                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
12547                  * Multi-character folds are not included */
12548                 if ((listp = hv_fetch(PL_utf8_foldclosures,
12549                                       (char *) foldbuf, foldlen, FALSE)))
12550                 {
12551                     AV* list = (AV*) *listp;
12552                     IV k;
12553                     for (k = 0; k <= av_len(list); k++) {
12554                         SV** c_p = av_fetch(list, k, FALSE);
12555                         UV c;
12556                         if (c_p == NULL) {
12557                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12558                         }
12559                         c = SvUV(*c_p);
12560
12561                         /* /aa doesn't allow folds between ASCII and non-; /l
12562                          * doesn't allow them between above and below 256 */
12563                         if ((ASCII_FOLD_RESTRICTED
12564                                   && (isASCII(c) != isASCII(j)))
12565                             || (LOC && ((c < 256) != (j < 256))))
12566                         {
12567                             continue;
12568                         }
12569
12570                         /* Folds involving non-ascii Latin1 characters
12571                          * under /d are added to a separate list */
12572                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12573                         {
12574                             cp_list = add_cp_to_invlist(cp_list, c);
12575                         }
12576                         else {
12577                           depends_list = add_cp_to_invlist(depends_list, c);
12578                         }
12579                     }
12580                 }
12581             }
12582         }
12583         SvREFCNT_dec_NN(fold_intersection);
12584     }
12585
12586     /* And combine the result (if any) with any inversion list from posix
12587      * classes.  The lists are kept separate up to now because we don't want to
12588      * fold the classes (folding of those is automatically handled by the swash
12589      * fetching code) */
12590     if (posixes) {
12591         if (! DEPENDS_SEMANTICS) {
12592             if (cp_list) {
12593                 _invlist_union(cp_list, posixes, &cp_list);
12594                 SvREFCNT_dec_NN(posixes);
12595             }
12596             else {
12597                 cp_list = posixes;
12598             }
12599         }
12600         else {
12601             /* Under /d, we put into a separate list the Latin1 things that
12602              * match only when the target string is utf8 */
12603             SV* nonascii_but_latin1_properties = NULL;
12604             _invlist_intersection(posixes, PL_Latin1,
12605                                   &nonascii_but_latin1_properties);
12606             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12607                               &nonascii_but_latin1_properties);
12608             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12609                               &posixes);
12610             if (cp_list) {
12611                 _invlist_union(cp_list, posixes, &cp_list);
12612                 SvREFCNT_dec_NN(posixes);
12613             }
12614             else {
12615                 cp_list = posixes;
12616             }
12617
12618             if (depends_list) {
12619                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12620                                &depends_list);
12621                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
12622             }
12623             else {
12624                 depends_list = nonascii_but_latin1_properties;
12625             }
12626         }
12627     }
12628
12629     /* And combine the result (if any) with any inversion list from properties.
12630      * The lists are kept separate up to now so that we can distinguish the two
12631      * in regards to matching above-Unicode.  A run-time warning is generated
12632      * if a Unicode property is matched against a non-Unicode code point. But,
12633      * we allow user-defined properties to match anything, without any warning,
12634      * and we also suppress the warning if there is a portion of the character
12635      * class that isn't a Unicode property, and which matches above Unicode, \W
12636      * or [\x{110000}] for example.
12637      * (Note that in this case, unlike the Posix one above, there is no
12638      * <depends_list>, because having a Unicode property forces Unicode
12639      * semantics */
12640     if (properties) {
12641         bool warn_super = ! has_user_defined_property;
12642         if (cp_list) {
12643
12644             /* If it matters to the final outcome, see if a non-property
12645              * component of the class matches above Unicode.  If so, the
12646              * warning gets suppressed.  This is true even if just a single
12647              * such code point is specified, as though not strictly correct if
12648              * another such code point is matched against, the fact that they
12649              * are using above-Unicode code points indicates they should know
12650              * the issues involved */
12651             if (warn_super) {
12652                 bool non_prop_matches_above_Unicode =
12653                             runtime_posix_matches_above_Unicode
12654                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12655                 if (invert) {
12656                     non_prop_matches_above_Unicode =
12657                                             !  non_prop_matches_above_Unicode;
12658                 }
12659                 warn_super = ! non_prop_matches_above_Unicode;
12660             }
12661
12662             _invlist_union(properties, cp_list, &cp_list);
12663             SvREFCNT_dec_NN(properties);
12664         }
12665         else {
12666             cp_list = properties;
12667         }
12668
12669         if (warn_super) {
12670             OP(ret) = ANYOF_WARN_SUPER;
12671         }
12672     }
12673
12674     /* Here, we have calculated what code points should be in the character
12675      * class.
12676      *
12677      * Now we can see about various optimizations.  Fold calculation (which we
12678      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12679      * would invert to include K, which under /i would match k, which it
12680      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12681      * folded until runtime */
12682
12683     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12684      * at compile time.  Besides not inverting folded locale now, we can't
12685      * invert if there are things such as \w, which aren't known until runtime
12686      * */
12687     if (invert
12688         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12689         && ! depends_list
12690         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12691     {
12692         _invlist_invert(cp_list);
12693
12694         /* Any swash can't be used as-is, because we've inverted things */
12695         if (swash) {
12696             SvREFCNT_dec_NN(swash);
12697             swash = NULL;
12698         }
12699
12700         /* Clear the invert flag since have just done it here */
12701         invert = FALSE;
12702     }
12703
12704     /* If we didn't do folding, it's because some information isn't available
12705      * until runtime; set the run-time fold flag for these.  (We don't have to
12706      * worry about properties folding, as that is taken care of by the swash
12707      * fetching) */
12708     if (FOLD && LOC)
12709     {
12710        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12711     }
12712
12713     /* Some character classes are equivalent to other nodes.  Such nodes take
12714      * up less room and generally fewer operations to execute than ANYOF nodes.
12715      * Above, we checked for and optimized into some such equivalents for
12716      * certain common classes that are easy to test.  Getting to this point in
12717      * the code means that the class didn't get optimized there.  Since this
12718      * code is only executed in Pass 2, it is too late to save space--it has
12719      * been allocated in Pass 1, and currently isn't given back.  But turning
12720      * things into an EXACTish node can allow the optimizer to join it to any
12721      * adjacent such nodes.  And if the class is equivalent to things like /./,
12722      * expensive run-time swashes can be avoided.  Now that we have more
12723      * complete information, we can find things necessarily missed by the
12724      * earlier code.  I (khw) am not sure how much to look for here.  It would
12725      * be easy, but perhaps too slow, to check any candidates against all the
12726      * node types they could possibly match using _invlistEQ(). */
12727
12728     if (cp_list
12729         && ! invert
12730         && ! depends_list
12731         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12732         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12733     {
12734         UV start, end;
12735         U8 op = END;  /* The optimzation node-type */
12736         const char * cur_parse= RExC_parse;
12737
12738         invlist_iterinit(cp_list);
12739         if (! invlist_iternext(cp_list, &start, &end)) {
12740
12741             /* Here, the list is empty.  This happens, for example, when a
12742              * Unicode property is the only thing in the character class, and
12743              * it doesn't match anything.  (perluniprops.pod notes such
12744              * properties) */
12745             op = OPFAIL;
12746             *flagp |= HASWIDTH|SIMPLE;
12747         }
12748         else if (start == end) {    /* The range is a single code point */
12749             if (! invlist_iternext(cp_list, &start, &end)
12750
12751                     /* Don't do this optimization if it would require changing
12752                      * the pattern to UTF-8 */
12753                 && (start < 256 || UTF))
12754             {
12755                 /* Here, the list contains a single code point.  Can optimize
12756                  * into an EXACT node */
12757
12758                 value = start;
12759
12760                 if (! FOLD) {
12761                     op = EXACT;
12762                 }
12763                 else if (LOC) {
12764
12765                     /* A locale node under folding with one code point can be
12766                      * an EXACTFL, as its fold won't be calculated until
12767                      * runtime */
12768                     op = EXACTFL;
12769                 }
12770                 else {
12771
12772                     /* Here, we are generally folding, but there is only one
12773                      * code point to match.  If we have to, we use an EXACT
12774                      * node, but it would be better for joining with adjacent
12775                      * nodes in the optimization pass if we used the same
12776                      * EXACTFish node that any such are likely to be.  We can
12777                      * do this iff the code point doesn't participate in any
12778                      * folds.  For example, an EXACTF of a colon is the same as
12779                      * an EXACT one, since nothing folds to or from a colon. */
12780                     if (value < 256) {
12781                         if (IS_IN_SOME_FOLD_L1(value)) {
12782                             op = EXACT;
12783                         }
12784                     }
12785                     else {
12786                         if (! PL_utf8_foldable) {
12787                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12788                                                 &PL_sv_undef, 1, 0);
12789                             PL_utf8_foldable = _get_swash_invlist(swash);
12790                             SvREFCNT_dec_NN(swash);
12791                         }
12792                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12793                             op = EXACT;
12794                         }
12795                     }
12796
12797                     /* If we haven't found the node type, above, it means we
12798                      * can use the prevailing one */
12799                     if (op == END) {
12800                         op = compute_EXACTish(pRExC_state);
12801                     }
12802                 }
12803             }
12804         }
12805         else if (start == 0) {
12806             if (end == UV_MAX) {
12807                 op = SANY;
12808                 *flagp |= HASWIDTH|SIMPLE;
12809                 RExC_naughty++;
12810             }
12811             else if (end == '\n' - 1
12812                     && invlist_iternext(cp_list, &start, &end)
12813                     && start == '\n' + 1 && end == UV_MAX)
12814             {
12815                 op = REG_ANY;
12816                 *flagp |= HASWIDTH|SIMPLE;
12817                 RExC_naughty++;
12818             }
12819         }
12820         invlist_iterfinish(cp_list);
12821
12822         if (op != END) {
12823             RExC_parse = (char *)orig_parse;
12824             RExC_emit = (regnode *)orig_emit;
12825
12826             ret = reg_node(pRExC_state, op);
12827
12828             RExC_parse = (char *)cur_parse;
12829
12830             if (PL_regkind[op] == EXACT) {
12831                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12832             }
12833
12834             SvREFCNT_dec_NN(cp_list);
12835             SvREFCNT_dec_NN(listsv);
12836             return ret;
12837         }
12838     }
12839
12840     /* Here, <cp_list> contains all the code points we can determine at
12841      * compile time that match under all conditions.  Go through it, and
12842      * for things that belong in the bitmap, put them there, and delete from
12843      * <cp_list>.  While we are at it, see if everything above 255 is in the
12844      * list, and if so, set a flag to speed up execution */
12845     ANYOF_BITMAP_ZERO(ret);
12846     if (cp_list) {
12847
12848         /* This gets set if we actually need to modify things */
12849         bool change_invlist = FALSE;
12850
12851         UV start, end;
12852
12853         /* Start looking through <cp_list> */
12854         invlist_iterinit(cp_list);
12855         while (invlist_iternext(cp_list, &start, &end)) {
12856             UV high;
12857             int i;
12858
12859             if (end == UV_MAX && start <= 256) {
12860                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12861             }
12862
12863             /* Quit if are above what we should change */
12864             if (start > 255) {
12865                 break;
12866             }
12867
12868             change_invlist = TRUE;
12869
12870             /* Set all the bits in the range, up to the max that we are doing */
12871             high = (end < 255) ? end : 255;
12872             for (i = start; i <= (int) high; i++) {
12873                 if (! ANYOF_BITMAP_TEST(ret, i)) {
12874                     ANYOF_BITMAP_SET(ret, i);
12875                     prevvalue = value;
12876                     value = i;
12877                 }
12878             }
12879         }
12880         invlist_iterfinish(cp_list);
12881
12882         /* Done with loop; remove any code points that are in the bitmap from
12883          * <cp_list> */
12884         if (change_invlist) {
12885             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12886         }
12887
12888         /* If have completely emptied it, remove it completely */
12889         if (_invlist_len(cp_list) == 0) {
12890             SvREFCNT_dec_NN(cp_list);
12891             cp_list = NULL;
12892         }
12893     }
12894
12895     if (invert) {
12896         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12897     }
12898
12899     /* Here, the bitmap has been populated with all the Latin1 code points that
12900      * always match.  Can now add to the overall list those that match only
12901      * when the target string is UTF-8 (<depends_list>). */
12902     if (depends_list) {
12903         if (cp_list) {
12904             _invlist_union(cp_list, depends_list, &cp_list);
12905             SvREFCNT_dec_NN(depends_list);
12906         }
12907         else {
12908             cp_list = depends_list;
12909         }
12910     }
12911
12912     /* If there is a swash and more than one element, we can't use the swash in
12913      * the optimization below. */
12914     if (swash && element_count > 1) {
12915         SvREFCNT_dec_NN(swash);
12916         swash = NULL;
12917     }
12918
12919     if (! cp_list
12920         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12921     {
12922         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12923         SvREFCNT_dec_NN(listsv);
12924     }
12925     else {
12926         /* av[0] stores the character class description in its textual form:
12927          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12928          *       appropriate swash, and is also useful for dumping the regnode.
12929          * av[1] if NULL, is a placeholder to later contain the swash computed
12930          *       from av[0].  But if no further computation need be done, the
12931          *       swash is stored there now.
12932          * av[2] stores the cp_list inversion list for use in addition or
12933          *       instead of av[0]; used only if av[1] is NULL
12934          * av[3] is set if any component of the class is from a user-defined
12935          *       property; used only if av[1] is NULL */
12936         AV * const av = newAV();
12937         SV *rv;
12938
12939         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12940                         ? listsv
12941                         : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
12942         if (swash) {
12943             av_store(av, 1, swash);
12944             SvREFCNT_dec_NN(cp_list);
12945         }
12946         else {
12947             av_store(av, 1, NULL);
12948             if (cp_list) {
12949                 av_store(av, 2, cp_list);
12950                 av_store(av, 3, newSVuv(has_user_defined_property));
12951             }
12952         }
12953
12954         rv = newRV_noinc(MUTABLE_SV(av));
12955         n = add_data(pRExC_state, 1, "s");
12956         RExC_rxi->data->data[n] = (void*)rv;
12957         ARG_SET(ret, n);
12958     }
12959
12960     *flagp |= HASWIDTH|SIMPLE;
12961     return ret;
12962 }
12963 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12964
12965
12966 /* reg_skipcomment()
12967
12968    Absorbs an /x style # comments from the input stream.
12969    Returns true if there is more text remaining in the stream.
12970    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12971    terminates the pattern without including a newline.
12972
12973    Note its the callers responsibility to ensure that we are
12974    actually in /x mode
12975
12976 */
12977
12978 STATIC bool
12979 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12980 {
12981     bool ended = 0;
12982
12983     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12984
12985     while (RExC_parse < RExC_end)
12986         if (*RExC_parse++ == '\n') {
12987             ended = 1;
12988             break;
12989         }
12990     if (!ended) {
12991         /* we ran off the end of the pattern without ending
12992            the comment, so we have to add an \n when wrapping */
12993         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12994         return 0;
12995     } else
12996         return 1;
12997 }
12998
12999 /* nextchar()
13000
13001    Advances the parse position, and optionally absorbs
13002    "whitespace" from the inputstream.
13003
13004    Without /x "whitespace" means (?#...) style comments only,
13005    with /x this means (?#...) and # comments and whitespace proper.
13006
13007    Returns the RExC_parse point from BEFORE the scan occurs.
13008
13009    This is the /x friendly way of saying RExC_parse++.
13010 */
13011
13012 STATIC char*
13013 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13014 {
13015     char* const retval = RExC_parse++;
13016
13017     PERL_ARGS_ASSERT_NEXTCHAR;
13018
13019     for (;;) {
13020         if (RExC_end - RExC_parse >= 3
13021             && *RExC_parse == '('
13022             && RExC_parse[1] == '?'
13023             && RExC_parse[2] == '#')
13024         {
13025             while (*RExC_parse != ')') {
13026                 if (RExC_parse == RExC_end)
13027                     FAIL("Sequence (?#... not terminated");
13028                 RExC_parse++;
13029             }
13030             RExC_parse++;
13031             continue;
13032         }
13033         if (RExC_flags & RXf_PMf_EXTENDED) {
13034             if (isSPACE(*RExC_parse)) {
13035                 RExC_parse++;
13036                 continue;
13037             }
13038             else if (*RExC_parse == '#') {
13039                 if ( reg_skipcomment( pRExC_state ) )
13040                     continue;
13041             }
13042         }
13043         return retval;
13044     }
13045 }
13046
13047 /*
13048 - reg_node - emit a node
13049 */
13050 STATIC regnode *                        /* Location. */
13051 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13052 {
13053     dVAR;
13054     regnode *ptr;
13055     regnode * const ret = RExC_emit;
13056     GET_RE_DEBUG_FLAGS_DECL;
13057
13058     PERL_ARGS_ASSERT_REG_NODE;
13059
13060     if (SIZE_ONLY) {
13061         SIZE_ALIGN(RExC_size);
13062         RExC_size += 1;
13063         return(ret);
13064     }
13065     if (RExC_emit >= RExC_emit_bound)
13066         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13067                    op, RExC_emit, RExC_emit_bound);
13068
13069     NODE_ALIGN_FILL(ret);
13070     ptr = ret;
13071     FILL_ADVANCE_NODE(ptr, op);
13072 #ifdef RE_TRACK_PATTERN_OFFSETS
13073     if (RExC_offsets) {         /* MJD */
13074         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13075               "reg_node", __LINE__,
13076               PL_reg_name[op],
13077               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13078                 ? "Overwriting end of array!\n" : "OK",
13079               (UV)(RExC_emit - RExC_emit_start),
13080               (UV)(RExC_parse - RExC_start),
13081               (UV)RExC_offsets[0]));
13082         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13083     }
13084 #endif
13085     RExC_emit = ptr;
13086     return(ret);
13087 }
13088
13089 /*
13090 - reganode - emit a node with an argument
13091 */
13092 STATIC regnode *                        /* Location. */
13093 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13094 {
13095     dVAR;
13096     regnode *ptr;
13097     regnode * const ret = RExC_emit;
13098     GET_RE_DEBUG_FLAGS_DECL;
13099
13100     PERL_ARGS_ASSERT_REGANODE;
13101
13102     if (SIZE_ONLY) {
13103         SIZE_ALIGN(RExC_size);
13104         RExC_size += 2;
13105         /*
13106            We can't do this:
13107
13108            assert(2==regarglen[op]+1);
13109
13110            Anything larger than this has to allocate the extra amount.
13111            If we changed this to be:
13112
13113            RExC_size += (1 + regarglen[op]);
13114
13115            then it wouldn't matter. Its not clear what side effect
13116            might come from that so its not done so far.
13117            -- dmq
13118         */
13119         return(ret);
13120     }
13121     if (RExC_emit >= RExC_emit_bound)
13122         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13123                    op, RExC_emit, RExC_emit_bound);
13124
13125     NODE_ALIGN_FILL(ret);
13126     ptr = ret;
13127     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13128 #ifdef RE_TRACK_PATTERN_OFFSETS
13129     if (RExC_offsets) {         /* MJD */
13130         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13131               "reganode",
13132               __LINE__,
13133               PL_reg_name[op],
13134               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13135               "Overwriting end of array!\n" : "OK",
13136               (UV)(RExC_emit - RExC_emit_start),
13137               (UV)(RExC_parse - RExC_start),
13138               (UV)RExC_offsets[0]));
13139         Set_Cur_Node_Offset;
13140     }
13141 #endif
13142     RExC_emit = ptr;
13143     return(ret);
13144 }
13145
13146 /*
13147 - reguni - emit (if appropriate) a Unicode character
13148 */
13149 STATIC STRLEN
13150 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13151 {
13152     dVAR;
13153
13154     PERL_ARGS_ASSERT_REGUNI;
13155
13156     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13157 }
13158
13159 /*
13160 - reginsert - insert an operator in front of already-emitted operand
13161 *
13162 * Means relocating the operand.
13163 */
13164 STATIC void
13165 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13166 {
13167     dVAR;
13168     regnode *src;
13169     regnode *dst;
13170     regnode *place;
13171     const int offset = regarglen[(U8)op];
13172     const int size = NODE_STEP_REGNODE + offset;
13173     GET_RE_DEBUG_FLAGS_DECL;
13174
13175     PERL_ARGS_ASSERT_REGINSERT;
13176     PERL_UNUSED_ARG(depth);
13177 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13178     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13179     if (SIZE_ONLY) {
13180         RExC_size += size;
13181         return;
13182     }
13183
13184     src = RExC_emit;
13185     RExC_emit += size;
13186     dst = RExC_emit;
13187     if (RExC_open_parens) {
13188         int paren;
13189         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13190         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13191             if ( RExC_open_parens[paren] >= opnd ) {
13192                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13193                 RExC_open_parens[paren] += size;
13194             } else {
13195                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13196             }
13197             if ( RExC_close_parens[paren] >= opnd ) {
13198                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13199                 RExC_close_parens[paren] += size;
13200             } else {
13201                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13202             }
13203         }
13204     }
13205
13206     while (src > opnd) {
13207         StructCopy(--src, --dst, regnode);
13208 #ifdef RE_TRACK_PATTERN_OFFSETS
13209         if (RExC_offsets) {     /* MJD 20010112 */
13210             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13211                   "reg_insert",
13212                   __LINE__,
13213                   PL_reg_name[op],
13214                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13215                     ? "Overwriting end of array!\n" : "OK",
13216                   (UV)(src - RExC_emit_start),
13217                   (UV)(dst - RExC_emit_start),
13218                   (UV)RExC_offsets[0]));
13219             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13220             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13221         }
13222 #endif
13223     }
13224
13225
13226     place = opnd;               /* Op node, where operand used to be. */
13227 #ifdef RE_TRACK_PATTERN_OFFSETS
13228     if (RExC_offsets) {         /* MJD */
13229         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13230               "reginsert",
13231               __LINE__,
13232               PL_reg_name[op],
13233               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13234               ? "Overwriting end of array!\n" : "OK",
13235               (UV)(place - RExC_emit_start),
13236               (UV)(RExC_parse - RExC_start),
13237               (UV)RExC_offsets[0]));
13238         Set_Node_Offset(place, RExC_parse);
13239         Set_Node_Length(place, 1);
13240     }
13241 #endif
13242     src = NEXTOPER(place);
13243     FILL_ADVANCE_NODE(place, op);
13244     Zero(src, offset, regnode);
13245 }
13246
13247 /*
13248 - regtail - set the next-pointer at the end of a node chain of p to val.
13249 - SEE ALSO: regtail_study
13250 */
13251 /* TODO: All three parms should be const */
13252 STATIC void
13253 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13254 {
13255     dVAR;
13256     regnode *scan;
13257     GET_RE_DEBUG_FLAGS_DECL;
13258
13259     PERL_ARGS_ASSERT_REGTAIL;
13260 #ifndef DEBUGGING
13261     PERL_UNUSED_ARG(depth);
13262 #endif
13263
13264     if (SIZE_ONLY)
13265         return;
13266
13267     /* Find last node. */
13268     scan = p;
13269     for (;;) {
13270         regnode * const temp = regnext(scan);
13271         DEBUG_PARSE_r({
13272             SV * const mysv=sv_newmortal();
13273             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13274             regprop(RExC_rx, mysv, scan);
13275             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13276                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13277                     (temp == NULL ? "->" : ""),
13278                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13279             );
13280         });
13281         if (temp == NULL)
13282             break;
13283         scan = temp;
13284     }
13285
13286     if (reg_off_by_arg[OP(scan)]) {
13287         ARG_SET(scan, val - scan);
13288     }
13289     else {
13290         NEXT_OFF(scan) = val - scan;
13291     }
13292 }
13293
13294 #ifdef DEBUGGING
13295 /*
13296 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13297 - Look for optimizable sequences at the same time.
13298 - currently only looks for EXACT chains.
13299
13300 This is experimental code. The idea is to use this routine to perform
13301 in place optimizations on branches and groups as they are constructed,
13302 with the long term intention of removing optimization from study_chunk so
13303 that it is purely analytical.
13304
13305 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13306 to control which is which.
13307
13308 */
13309 /* TODO: All four parms should be const */
13310
13311 STATIC U8
13312 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13313 {
13314     dVAR;
13315     regnode *scan;
13316     U8 exact = PSEUDO;
13317 #ifdef EXPERIMENTAL_INPLACESCAN
13318     I32 min = 0;
13319 #endif
13320     GET_RE_DEBUG_FLAGS_DECL;
13321
13322     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13323
13324
13325     if (SIZE_ONLY)
13326         return exact;
13327
13328     /* Find last node. */
13329
13330     scan = p;
13331     for (;;) {
13332         regnode * const temp = regnext(scan);
13333 #ifdef EXPERIMENTAL_INPLACESCAN
13334         if (PL_regkind[OP(scan)] == EXACT) {
13335             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13336             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13337                 return EXACT;
13338         }
13339 #endif
13340         if ( exact ) {
13341             switch (OP(scan)) {
13342                 case EXACT:
13343                 case EXACTF:
13344                 case EXACTFA:
13345                 case EXACTFU:
13346                 case EXACTFU_SS:
13347                 case EXACTFU_TRICKYFOLD:
13348                 case EXACTFL:
13349                         if( exact == PSEUDO )
13350                             exact= OP(scan);
13351                         else if ( exact != OP(scan) )
13352                             exact= 0;
13353                 case NOTHING:
13354                     break;
13355                 default:
13356                     exact= 0;
13357             }
13358         }
13359         DEBUG_PARSE_r({
13360             SV * const mysv=sv_newmortal();
13361             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13362             regprop(RExC_rx, mysv, scan);
13363             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13364                 SvPV_nolen_const(mysv),
13365                 REG_NODE_NUM(scan),
13366                 PL_reg_name[exact]);
13367         });
13368         if (temp == NULL)
13369             break;
13370         scan = temp;
13371     }
13372     DEBUG_PARSE_r({
13373         SV * const mysv_val=sv_newmortal();
13374         DEBUG_PARSE_MSG("");
13375         regprop(RExC_rx, mysv_val, val);
13376         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13377                       SvPV_nolen_const(mysv_val),
13378                       (IV)REG_NODE_NUM(val),
13379                       (IV)(val - scan)
13380         );
13381     });
13382     if (reg_off_by_arg[OP(scan)]) {
13383         ARG_SET(scan, val - scan);
13384     }
13385     else {
13386         NEXT_OFF(scan) = val - scan;
13387     }
13388
13389     return exact;
13390 }
13391 #endif
13392
13393 /*
13394  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13395  */
13396 #ifdef DEBUGGING
13397 static void
13398 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13399 {
13400     int bit;
13401     int set=0;
13402     regex_charset cs;
13403
13404     for (bit=0; bit<32; bit++) {
13405         if (flags & (1<<bit)) {
13406             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13407                 continue;
13408             }
13409             if (!set++ && lead)
13410                 PerlIO_printf(Perl_debug_log, "%s",lead);
13411             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13412         }
13413     }
13414     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13415             if (!set++ && lead) {
13416                 PerlIO_printf(Perl_debug_log, "%s",lead);
13417             }
13418             switch (cs) {
13419                 case REGEX_UNICODE_CHARSET:
13420                     PerlIO_printf(Perl_debug_log, "UNICODE");
13421                     break;
13422                 case REGEX_LOCALE_CHARSET:
13423                     PerlIO_printf(Perl_debug_log, "LOCALE");
13424                     break;
13425                 case REGEX_ASCII_RESTRICTED_CHARSET:
13426                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13427                     break;
13428                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13429                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13430                     break;
13431                 default:
13432                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13433                     break;
13434             }
13435     }
13436     if (lead)  {
13437         if (set)
13438             PerlIO_printf(Perl_debug_log, "\n");
13439         else
13440             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13441     }
13442 }
13443 #endif
13444
13445 void
13446 Perl_regdump(pTHX_ const regexp *r)
13447 {
13448 #ifdef DEBUGGING
13449     dVAR;
13450     SV * const sv = sv_newmortal();
13451     SV *dsv= sv_newmortal();
13452     RXi_GET_DECL(r,ri);
13453     GET_RE_DEBUG_FLAGS_DECL;
13454
13455     PERL_ARGS_ASSERT_REGDUMP;
13456
13457     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13458
13459     /* Header fields of interest. */
13460     if (r->anchored_substr) {
13461         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13462             RE_SV_DUMPLEN(r->anchored_substr), 30);
13463         PerlIO_printf(Perl_debug_log,
13464                       "anchored %s%s at %"IVdf" ",
13465                       s, RE_SV_TAIL(r->anchored_substr),
13466                       (IV)r->anchored_offset);
13467     } else if (r->anchored_utf8) {
13468         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13469             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13470         PerlIO_printf(Perl_debug_log,
13471                       "anchored utf8 %s%s at %"IVdf" ",
13472                       s, RE_SV_TAIL(r->anchored_utf8),
13473                       (IV)r->anchored_offset);
13474     }
13475     if (r->float_substr) {
13476         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13477             RE_SV_DUMPLEN(r->float_substr), 30);
13478         PerlIO_printf(Perl_debug_log,
13479                       "floating %s%s at %"IVdf"..%"UVuf" ",
13480                       s, RE_SV_TAIL(r->float_substr),
13481                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13482     } else if (r->float_utf8) {
13483         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13484             RE_SV_DUMPLEN(r->float_utf8), 30);
13485         PerlIO_printf(Perl_debug_log,
13486                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13487                       s, RE_SV_TAIL(r->float_utf8),
13488                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13489     }
13490     if (r->check_substr || r->check_utf8)
13491         PerlIO_printf(Perl_debug_log,
13492                       (const char *)
13493                       (r->check_substr == r->float_substr
13494                        && r->check_utf8 == r->float_utf8
13495                        ? "(checking floating" : "(checking anchored"));
13496     if (r->extflags & RXf_NOSCAN)
13497         PerlIO_printf(Perl_debug_log, " noscan");
13498     if (r->extflags & RXf_CHECK_ALL)
13499         PerlIO_printf(Perl_debug_log, " isall");
13500     if (r->check_substr || r->check_utf8)
13501         PerlIO_printf(Perl_debug_log, ") ");
13502
13503     if (ri->regstclass) {
13504         regprop(r, sv, ri->regstclass);
13505         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13506     }
13507     if (r->extflags & RXf_ANCH) {
13508         PerlIO_printf(Perl_debug_log, "anchored");
13509         if (r->extflags & RXf_ANCH_BOL)
13510             PerlIO_printf(Perl_debug_log, "(BOL)");
13511         if (r->extflags & RXf_ANCH_MBOL)
13512             PerlIO_printf(Perl_debug_log, "(MBOL)");
13513         if (r->extflags & RXf_ANCH_SBOL)
13514             PerlIO_printf(Perl_debug_log, "(SBOL)");
13515         if (r->extflags & RXf_ANCH_GPOS)
13516             PerlIO_printf(Perl_debug_log, "(GPOS)");
13517         PerlIO_putc(Perl_debug_log, ' ');
13518     }
13519     if (r->extflags & RXf_GPOS_SEEN)
13520         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13521     if (r->intflags & PREGf_SKIP)
13522         PerlIO_printf(Perl_debug_log, "plus ");
13523     if (r->intflags & PREGf_IMPLICIT)
13524         PerlIO_printf(Perl_debug_log, "implicit ");
13525     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13526     if (r->extflags & RXf_EVAL_SEEN)
13527         PerlIO_printf(Perl_debug_log, "with eval ");
13528     PerlIO_printf(Perl_debug_log, "\n");
13529     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13530 #else
13531     PERL_ARGS_ASSERT_REGDUMP;
13532     PERL_UNUSED_CONTEXT;
13533     PERL_UNUSED_ARG(r);
13534 #endif  /* DEBUGGING */
13535 }
13536
13537 /*
13538 - regprop - printable representation of opcode
13539 */
13540 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13541 STMT_START { \
13542         if (do_sep) {                           \
13543             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13544             if (flags & ANYOF_INVERT)           \
13545                 /*make sure the invert info is in each */ \
13546                 sv_catpvs(sv, "^");             \
13547             do_sep = 0;                         \
13548         }                                       \
13549 } STMT_END
13550
13551 void
13552 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13553 {
13554 #ifdef DEBUGGING
13555     dVAR;
13556     int k;
13557
13558     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13559     static const char * const anyofs[] = {
13560 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
13561     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
13562     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
13563     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
13564     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
13565     || _CC_VERTSPACE != 16
13566   #error Need to adjust order of anyofs[]
13567 #endif
13568         "[\\w]",
13569         "[\\W]",
13570         "[\\d]",
13571         "[\\D]",
13572         "[:alpha:]",
13573         "[:^alpha:]",
13574         "[:lower:]",
13575         "[:^lower:]",
13576         "[:upper:]",
13577         "[:^upper:]",
13578         "[:punct:]",
13579         "[:^punct:]",
13580         "[:print:]",
13581         "[:^print:]",
13582         "[:alnum:]",
13583         "[:^alnum:]",
13584         "[:graph:]",
13585         "[:^graph:]",
13586         "[:cased:]",
13587         "[:^cased:]",
13588         "[\\s]",
13589         "[\\S]",
13590         "[:blank:]",
13591         "[:^blank:]",
13592         "[:xdigit:]",
13593         "[:^xdigit:]",
13594         "[:space:]",
13595         "[:^space:]",
13596         "[:cntrl:]",
13597         "[:^cntrl:]",
13598         "[:ascii:]",
13599         "[:^ascii:]",
13600         "[\\v]",
13601         "[\\V]"
13602     };
13603     RXi_GET_DECL(prog,progi);
13604     GET_RE_DEBUG_FLAGS_DECL;
13605
13606     PERL_ARGS_ASSERT_REGPROP;
13607
13608     sv_setpvs(sv, "");
13609
13610     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13611         /* It would be nice to FAIL() here, but this may be called from
13612            regexec.c, and it would be hard to supply pRExC_state. */
13613         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13614     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13615
13616     k = PL_regkind[OP(o)];
13617
13618     if (k == EXACT) {
13619         sv_catpvs(sv, " ");
13620         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13621          * is a crude hack but it may be the best for now since
13622          * we have no flag "this EXACTish node was UTF-8"
13623          * --jhi */
13624         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13625                   PERL_PV_ESCAPE_UNI_DETECT |
13626                   PERL_PV_ESCAPE_NONASCII   |
13627                   PERL_PV_PRETTY_ELLIPSES   |
13628                   PERL_PV_PRETTY_LTGT       |
13629                   PERL_PV_PRETTY_NOCLEAR
13630                   );
13631     } else if (k == TRIE) {
13632         /* print the details of the trie in dumpuntil instead, as
13633          * progi->data isn't available here */
13634         const char op = OP(o);
13635         const U32 n = ARG(o);
13636         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13637                (reg_ac_data *)progi->data->data[n] :
13638                NULL;
13639         const reg_trie_data * const trie
13640             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13641
13642         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13643         DEBUG_TRIE_COMPILE_r(
13644             Perl_sv_catpvf(aTHX_ sv,
13645                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13646                 (UV)trie->startstate,
13647                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13648                 (UV)trie->wordcount,
13649                 (UV)trie->minlen,
13650                 (UV)trie->maxlen,
13651                 (UV)TRIE_CHARCOUNT(trie),
13652                 (UV)trie->uniquecharcount
13653             )
13654         );
13655         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13656             int i;
13657             int rangestart = -1;
13658             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13659             sv_catpvs(sv, "[");
13660             for (i = 0; i <= 256; i++) {
13661                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13662                     if (rangestart == -1)
13663                         rangestart = i;
13664                 } else if (rangestart != -1) {
13665                     if (i <= rangestart + 3)
13666                         for (; rangestart < i; rangestart++)
13667                             put_byte(sv, rangestart);
13668                     else {
13669                         put_byte(sv, rangestart);
13670                         sv_catpvs(sv, "-");
13671                         put_byte(sv, i - 1);
13672                     }
13673                     rangestart = -1;
13674                 }
13675             }
13676             sv_catpvs(sv, "]");
13677         }
13678
13679     } else if (k == CURLY) {
13680         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13681             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13682         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13683     }
13684     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13685         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13686     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13687         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13688         if ( RXp_PAREN_NAMES(prog) ) {
13689             if ( k != REF || (OP(o) < NREF)) {
13690                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13691                 SV **name= av_fetch(list, ARG(o), 0 );
13692                 if (name)
13693                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13694             }
13695             else {
13696                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13697                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13698                 I32 *nums=(I32*)SvPVX(sv_dat);
13699                 SV **name= av_fetch(list, nums[0], 0 );
13700                 I32 n;
13701                 if (name) {
13702                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13703                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13704                                     (n ? "," : ""), (IV)nums[n]);
13705                     }
13706                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13707                 }
13708             }
13709         }
13710     } else if (k == GOSUB)
13711         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13712     else if (k == VERB) {
13713         if (!o->flags)
13714             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13715                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13716     } else if (k == LOGICAL)
13717         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13718     else if (k == ANYOF) {
13719         int i, rangestart = -1;
13720         const U8 flags = ANYOF_FLAGS(o);
13721         int do_sep = 0;
13722
13723
13724         if (flags & ANYOF_LOCALE)
13725             sv_catpvs(sv, "{loc}");
13726         if (flags & ANYOF_LOC_FOLD)
13727             sv_catpvs(sv, "{i}");
13728         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13729         if (flags & ANYOF_INVERT)
13730             sv_catpvs(sv, "^");
13731
13732         /* output what the standard cp 0-255 bitmap matches */
13733         for (i = 0; i <= 256; i++) {
13734             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13735                 if (rangestart == -1)
13736                     rangestart = i;
13737             } else if (rangestart != -1) {
13738                 if (i <= rangestart + 3)
13739                     for (; rangestart < i; rangestart++)
13740                         put_byte(sv, rangestart);
13741                 else {
13742                     put_byte(sv, rangestart);
13743                     sv_catpvs(sv, "-");
13744                     put_byte(sv, i - 1);
13745                 }
13746                 do_sep = 1;
13747                 rangestart = -1;
13748             }
13749         }
13750
13751         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13752         /* output any special charclass tests (used entirely under use locale) */
13753         if (ANYOF_CLASS_TEST_ANY_SET(o))
13754             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13755                 if (ANYOF_CLASS_TEST(o,i)) {
13756                     sv_catpv(sv, anyofs[i]);
13757                     do_sep = 1;
13758                 }
13759
13760         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13761
13762         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13763             sv_catpvs(sv, "{non-utf8-latin1-all}");
13764         }
13765
13766         /* output information about the unicode matching */
13767         if (flags & ANYOF_UNICODE_ALL)
13768             sv_catpvs(sv, "{unicode_all}");
13769         else if (ANYOF_NONBITMAP(o))
13770             sv_catpvs(sv, "{unicode}");
13771         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13772             sv_catpvs(sv, "{outside bitmap}");
13773
13774         if (ANYOF_NONBITMAP(o)) {
13775             SV *lv; /* Set if there is something outside the bit map */
13776             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13777             bool byte_output = FALSE;   /* If something in the bitmap has been
13778                                            output */
13779
13780             if (lv && lv != &PL_sv_undef) {
13781                 if (sw) {
13782                     U8 s[UTF8_MAXBYTES_CASE+1];
13783
13784                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13785                         uvchr_to_utf8(s, i);
13786
13787                         if (i < 256
13788                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13789                                                                things already
13790                                                                output as part
13791                                                                of the bitmap */
13792                             && swash_fetch(sw, s, TRUE))
13793                         {
13794                             if (rangestart == -1)
13795                                 rangestart = i;
13796                         } else if (rangestart != -1) {
13797                             byte_output = TRUE;
13798                             if (i <= rangestart + 3)
13799                                 for (; rangestart < i; rangestart++) {
13800                                     put_byte(sv, rangestart);
13801                                 }
13802                             else {
13803                                 put_byte(sv, rangestart);
13804                                 sv_catpvs(sv, "-");
13805                                 put_byte(sv, i-1);
13806                             }
13807                             rangestart = -1;
13808                         }
13809                     }
13810                 }
13811
13812                 {
13813                     char *s = savesvpv(lv);
13814                     char * const origs = s;
13815
13816                     while (*s && *s != '\n')
13817                         s++;
13818
13819                     if (*s == '\n') {
13820                         const char * const t = ++s;
13821
13822                         if (byte_output) {
13823                             sv_catpvs(sv, " ");
13824                         }
13825
13826                         while (*s) {
13827                             if (*s == '\n') {
13828
13829                                 /* Truncate very long output */
13830                                 if (s - origs > 256) {
13831                                     Perl_sv_catpvf(aTHX_ sv,
13832                                                    "%.*s...",
13833                                                    (int) (s - origs - 1),
13834                                                    t);
13835                                     goto out_dump;
13836                                 }
13837                                 *s = ' ';
13838                             }
13839                             else if (*s == '\t') {
13840                                 *s = '-';
13841                             }
13842                             s++;
13843                         }
13844                         if (s[-1] == ' ')
13845                             s[-1] = 0;
13846
13847                         sv_catpv(sv, t);
13848                     }
13849
13850                 out_dump:
13851
13852                     Safefree(origs);
13853                 }
13854                 SvREFCNT_dec_NN(lv);
13855             }
13856         }
13857
13858         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13859     }
13860     else if (k == POSIXD || k == NPOSIXD) {
13861         U8 index = FLAGS(o) * 2;
13862         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13863             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13864         }
13865         else {
13866             sv_catpv(sv, anyofs[index]);
13867         }
13868     }
13869     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13870         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13871 #else
13872     PERL_UNUSED_CONTEXT;
13873     PERL_UNUSED_ARG(sv);
13874     PERL_UNUSED_ARG(o);
13875     PERL_UNUSED_ARG(prog);
13876 #endif  /* DEBUGGING */
13877 }
13878
13879 SV *
13880 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13881 {                               /* Assume that RE_INTUIT is set */
13882     dVAR;
13883     struct regexp *const prog = ReANY(r);
13884     GET_RE_DEBUG_FLAGS_DECL;
13885
13886     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13887     PERL_UNUSED_CONTEXT;
13888
13889     DEBUG_COMPILE_r(
13890         {
13891             const char * const s = SvPV_nolen_const(prog->check_substr
13892                       ? prog->check_substr : prog->check_utf8);
13893
13894             if (!PL_colorset) reginitcolors();
13895             PerlIO_printf(Perl_debug_log,
13896                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13897                       PL_colors[4],
13898                       prog->check_substr ? "" : "utf8 ",
13899                       PL_colors[5],PL_colors[0],
13900                       s,
13901                       PL_colors[1],
13902                       (strlen(s) > 60 ? "..." : ""));
13903         } );
13904
13905     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13906 }
13907
13908 /*
13909    pregfree()
13910
13911    handles refcounting and freeing the perl core regexp structure. When
13912    it is necessary to actually free the structure the first thing it
13913    does is call the 'free' method of the regexp_engine associated to
13914    the regexp, allowing the handling of the void *pprivate; member
13915    first. (This routine is not overridable by extensions, which is why
13916    the extensions free is called first.)
13917
13918    See regdupe and regdupe_internal if you change anything here.
13919 */
13920 #ifndef PERL_IN_XSUB_RE
13921 void
13922 Perl_pregfree(pTHX_ REGEXP *r)
13923 {
13924     SvREFCNT_dec(r);
13925 }
13926
13927 void
13928 Perl_pregfree2(pTHX_ REGEXP *rx)
13929 {
13930     dVAR;
13931     struct regexp *const r = ReANY(rx);
13932     GET_RE_DEBUG_FLAGS_DECL;
13933
13934     PERL_ARGS_ASSERT_PREGFREE2;
13935
13936     if (r->mother_re) {
13937         ReREFCNT_dec(r->mother_re);
13938     } else {
13939         CALLREGFREE_PVT(rx); /* free the private data */
13940         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13941         Safefree(r->xpv_len_u.xpvlenu_pv);
13942     }
13943     if (r->substrs) {
13944         SvREFCNT_dec(r->anchored_substr);
13945         SvREFCNT_dec(r->anchored_utf8);
13946         SvREFCNT_dec(r->float_substr);
13947         SvREFCNT_dec(r->float_utf8);
13948         Safefree(r->substrs);
13949     }
13950     RX_MATCH_COPY_FREE(rx);
13951 #ifdef PERL_ANY_COW
13952     SvREFCNT_dec(r->saved_copy);
13953 #endif
13954     Safefree(r->offs);
13955     SvREFCNT_dec(r->qr_anoncv);
13956     rx->sv_u.svu_rx = 0;
13957 }
13958
13959 /*  reg_temp_copy()
13960
13961     This is a hacky workaround to the structural issue of match results
13962     being stored in the regexp structure which is in turn stored in
13963     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13964     could be PL_curpm in multiple contexts, and could require multiple
13965     result sets being associated with the pattern simultaneously, such
13966     as when doing a recursive match with (??{$qr})
13967
13968     The solution is to make a lightweight copy of the regexp structure
13969     when a qr// is returned from the code executed by (??{$qr}) this
13970     lightweight copy doesn't actually own any of its data except for
13971     the starp/end and the actual regexp structure itself.
13972
13973 */
13974
13975
13976 REGEXP *
13977 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13978 {
13979     struct regexp *ret;
13980     struct regexp *const r = ReANY(rx);
13981     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
13982
13983     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13984
13985     if (!ret_x)
13986         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13987     else {
13988         SvOK_off((SV *)ret_x);
13989         if (islv) {
13990             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
13991                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
13992                made both spots point to the same regexp body.) */
13993             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13994             assert(!SvPVX(ret_x));
13995             ret_x->sv_u.svu_rx = temp->sv_any;
13996             temp->sv_any = NULL;
13997             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13998             SvREFCNT_dec_NN(temp);
13999             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14000                ing below will not set it. */
14001             SvCUR_set(ret_x, SvCUR(rx));
14002         }
14003     }
14004     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14005        sv_force_normal(sv) is called.  */
14006     SvFAKE_on(ret_x);
14007     ret = ReANY(ret_x);
14008
14009     SvFLAGS(ret_x) |= SvUTF8(rx);
14010     /* We share the same string buffer as the original regexp, on which we
14011        hold a reference count, incremented when mother_re is set below.
14012        The string pointer is copied here, being part of the regexp struct.
14013      */
14014     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14015            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14016     if (r->offs) {
14017         const I32 npar = r->nparens+1;
14018         Newx(ret->offs, npar, regexp_paren_pair);
14019         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14020     }
14021     if (r->substrs) {
14022         Newx(ret->substrs, 1, struct reg_substr_data);
14023         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14024
14025         SvREFCNT_inc_void(ret->anchored_substr);
14026         SvREFCNT_inc_void(ret->anchored_utf8);
14027         SvREFCNT_inc_void(ret->float_substr);
14028         SvREFCNT_inc_void(ret->float_utf8);
14029
14030         /* check_substr and check_utf8, if non-NULL, point to either their
14031            anchored or float namesakes, and don't hold a second reference.  */
14032     }
14033     RX_MATCH_COPIED_off(ret_x);
14034 #ifdef PERL_ANY_COW
14035     ret->saved_copy = NULL;
14036 #endif
14037     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14038     SvREFCNT_inc_void(ret->qr_anoncv);
14039
14040     return ret_x;
14041 }
14042 #endif
14043
14044 /* regfree_internal()
14045
14046    Free the private data in a regexp. This is overloadable by
14047    extensions. Perl takes care of the regexp structure in pregfree(),
14048    this covers the *pprivate pointer which technically perl doesn't
14049    know about, however of course we have to handle the
14050    regexp_internal structure when no extension is in use.
14051
14052    Note this is called before freeing anything in the regexp
14053    structure.
14054  */
14055
14056 void
14057 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14058 {
14059     dVAR;
14060     struct regexp *const r = ReANY(rx);
14061     RXi_GET_DECL(r,ri);
14062     GET_RE_DEBUG_FLAGS_DECL;
14063
14064     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14065
14066     DEBUG_COMPILE_r({
14067         if (!PL_colorset)
14068             reginitcolors();
14069         {
14070             SV *dsv= sv_newmortal();
14071             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14072                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14073             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14074                 PL_colors[4],PL_colors[5],s);
14075         }
14076     });
14077 #ifdef RE_TRACK_PATTERN_OFFSETS
14078     if (ri->u.offsets)
14079         Safefree(ri->u.offsets);             /* 20010421 MJD */
14080 #endif
14081     if (ri->code_blocks) {
14082         int n;
14083         for (n = 0; n < ri->num_code_blocks; n++)
14084             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14085         Safefree(ri->code_blocks);
14086     }
14087
14088     if (ri->data) {
14089         int n = ri->data->count;
14090
14091         while (--n >= 0) {
14092           /* If you add a ->what type here, update the comment in regcomp.h */
14093             switch (ri->data->what[n]) {
14094             case 'a':
14095             case 'r':
14096             case 's':
14097             case 'S':
14098             case 'u':
14099                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14100                 break;
14101             case 'f':
14102                 Safefree(ri->data->data[n]);
14103                 break;
14104             case 'l':
14105             case 'L':
14106                 break;
14107             case 'T':
14108                 { /* Aho Corasick add-on structure for a trie node.
14109                      Used in stclass optimization only */
14110                     U32 refcount;
14111                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14112                     OP_REFCNT_LOCK;
14113                     refcount = --aho->refcount;
14114                     OP_REFCNT_UNLOCK;
14115                     if ( !refcount ) {
14116                         PerlMemShared_free(aho->states);
14117                         PerlMemShared_free(aho->fail);
14118                          /* do this last!!!! */
14119                         PerlMemShared_free(ri->data->data[n]);
14120                         PerlMemShared_free(ri->regstclass);
14121                     }
14122                 }
14123                 break;
14124             case 't':
14125                 {
14126                     /* trie structure. */
14127                     U32 refcount;
14128                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14129                     OP_REFCNT_LOCK;
14130                     refcount = --trie->refcount;
14131                     OP_REFCNT_UNLOCK;
14132                     if ( !refcount ) {
14133                         PerlMemShared_free(trie->charmap);
14134                         PerlMemShared_free(trie->states);
14135                         PerlMemShared_free(trie->trans);
14136                         if (trie->bitmap)
14137                             PerlMemShared_free(trie->bitmap);
14138                         if (trie->jump)
14139                             PerlMemShared_free(trie->jump);
14140                         PerlMemShared_free(trie->wordinfo);
14141                         /* do this last!!!! */
14142                         PerlMemShared_free(ri->data->data[n]);
14143                     }
14144                 }
14145                 break;
14146             default:
14147                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14148             }
14149         }
14150         Safefree(ri->data->what);
14151         Safefree(ri->data);
14152     }
14153
14154     Safefree(ri);
14155 }
14156
14157 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14158 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14159 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14160
14161 /*
14162    re_dup - duplicate a regexp.
14163
14164    This routine is expected to clone a given regexp structure. It is only
14165    compiled under USE_ITHREADS.
14166
14167    After all of the core data stored in struct regexp is duplicated
14168    the regexp_engine.dupe method is used to copy any private data
14169    stored in the *pprivate pointer. This allows extensions to handle
14170    any duplication it needs to do.
14171
14172    See pregfree() and regfree_internal() if you change anything here.
14173 */
14174 #if defined(USE_ITHREADS)
14175 #ifndef PERL_IN_XSUB_RE
14176 void
14177 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14178 {
14179     dVAR;
14180     I32 npar;
14181     const struct regexp *r = ReANY(sstr);
14182     struct regexp *ret = ReANY(dstr);
14183
14184     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14185
14186     npar = r->nparens+1;
14187     Newx(ret->offs, npar, regexp_paren_pair);
14188     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14189     if(ret->swap) {
14190         /* no need to copy these */
14191         Newx(ret->swap, npar, regexp_paren_pair);
14192     }
14193
14194     if (ret->substrs) {
14195         /* Do it this way to avoid reading from *r after the StructCopy().
14196            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14197            cache, it doesn't matter.  */
14198         const bool anchored = r->check_substr
14199             ? r->check_substr == r->anchored_substr
14200             : r->check_utf8 == r->anchored_utf8;
14201         Newx(ret->substrs, 1, struct reg_substr_data);
14202         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14203
14204         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14205         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14206         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14207         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14208
14209         /* check_substr and check_utf8, if non-NULL, point to either their
14210            anchored or float namesakes, and don't hold a second reference.  */
14211
14212         if (ret->check_substr) {
14213             if (anchored) {
14214                 assert(r->check_utf8 == r->anchored_utf8);
14215                 ret->check_substr = ret->anchored_substr;
14216                 ret->check_utf8 = ret->anchored_utf8;
14217             } else {
14218                 assert(r->check_substr == r->float_substr);
14219                 assert(r->check_utf8 == r->float_utf8);
14220                 ret->check_substr = ret->float_substr;
14221                 ret->check_utf8 = ret->float_utf8;
14222             }
14223         } else if (ret->check_utf8) {
14224             if (anchored) {
14225                 ret->check_utf8 = ret->anchored_utf8;
14226             } else {
14227                 ret->check_utf8 = ret->float_utf8;
14228             }
14229         }
14230     }
14231
14232     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14233     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14234
14235     if (ret->pprivate)
14236         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14237
14238     if (RX_MATCH_COPIED(dstr))
14239         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14240     else
14241         ret->subbeg = NULL;
14242 #ifdef PERL_ANY_COW
14243     ret->saved_copy = NULL;
14244 #endif
14245
14246     /* Whether mother_re be set or no, we need to copy the string.  We
14247        cannot refrain from copying it when the storage points directly to
14248        our mother regexp, because that's
14249                1: a buffer in a different thread
14250                2: something we no longer hold a reference on
14251                so we need to copy it locally.  */
14252     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14253     ret->mother_re   = NULL;
14254     ret->gofs = 0;
14255 }
14256 #endif /* PERL_IN_XSUB_RE */
14257
14258 /*
14259    regdupe_internal()
14260
14261    This is the internal complement to regdupe() which is used to copy
14262    the structure pointed to by the *pprivate pointer in the regexp.
14263    This is the core version of the extension overridable cloning hook.
14264    The regexp structure being duplicated will be copied by perl prior
14265    to this and will be provided as the regexp *r argument, however
14266    with the /old/ structures pprivate pointer value. Thus this routine
14267    may override any copying normally done by perl.
14268
14269    It returns a pointer to the new regexp_internal structure.
14270 */
14271
14272 void *
14273 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14274 {
14275     dVAR;
14276     struct regexp *const r = ReANY(rx);
14277     regexp_internal *reti;
14278     int len;
14279     RXi_GET_DECL(r,ri);
14280
14281     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14282
14283     len = ProgLen(ri);
14284
14285     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14286     Copy(ri->program, reti->program, len+1, regnode);
14287
14288     reti->num_code_blocks = ri->num_code_blocks;
14289     if (ri->code_blocks) {
14290         int n;
14291         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14292                 struct reg_code_block);
14293         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14294                 struct reg_code_block);
14295         for (n = 0; n < ri->num_code_blocks; n++)
14296              reti->code_blocks[n].src_regex = (REGEXP*)
14297                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14298     }
14299     else
14300         reti->code_blocks = NULL;
14301
14302     reti->regstclass = NULL;
14303
14304     if (ri->data) {
14305         struct reg_data *d;
14306         const int count = ri->data->count;
14307         int i;
14308
14309         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14310                 char, struct reg_data);
14311         Newx(d->what, count, U8);
14312
14313         d->count = count;
14314         for (i = 0; i < count; i++) {
14315             d->what[i] = ri->data->what[i];
14316             switch (d->what[i]) {
14317                 /* see also regcomp.h and regfree_internal() */
14318             case 'a': /* actually an AV, but the dup function is identical.  */
14319             case 'r':
14320             case 's':
14321             case 'S':
14322             case 'u': /* actually an HV, but the dup function is identical.  */
14323                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14324                 break;
14325             case 'f':
14326                 /* This is cheating. */
14327                 Newx(d->data[i], 1, struct regnode_charclass_class);
14328                 StructCopy(ri->data->data[i], d->data[i],
14329                             struct regnode_charclass_class);
14330                 reti->regstclass = (regnode*)d->data[i];
14331                 break;
14332             case 'T':
14333                 /* Trie stclasses are readonly and can thus be shared
14334                  * without duplication. We free the stclass in pregfree
14335                  * when the corresponding reg_ac_data struct is freed.
14336                  */
14337                 reti->regstclass= ri->regstclass;
14338                 /* Fall through */
14339             case 't':
14340                 OP_REFCNT_LOCK;
14341                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14342                 OP_REFCNT_UNLOCK;
14343                 /* Fall through */
14344             case 'l':
14345             case 'L':
14346                 d->data[i] = ri->data->data[i];
14347                 break;
14348             default:
14349                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14350             }
14351         }
14352
14353         reti->data = d;
14354     }
14355     else
14356         reti->data = NULL;
14357
14358     reti->name_list_idx = ri->name_list_idx;
14359
14360 #ifdef RE_TRACK_PATTERN_OFFSETS
14361     if (ri->u.offsets) {
14362         Newx(reti->u.offsets, 2*len+1, U32);
14363         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14364     }
14365 #else
14366     SetProgLen(reti,len);
14367 #endif
14368
14369     return (void*)reti;
14370 }
14371
14372 #endif    /* USE_ITHREADS */
14373
14374 #ifndef PERL_IN_XSUB_RE
14375
14376 /*
14377  - regnext - dig the "next" pointer out of a node
14378  */
14379 regnode *
14380 Perl_regnext(pTHX_ regnode *p)
14381 {
14382     dVAR;
14383     I32 offset;
14384
14385     if (!p)
14386         return(NULL);
14387
14388     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14389         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14390     }
14391
14392     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14393     if (offset == 0)
14394         return(NULL);
14395
14396     return(p+offset);
14397 }
14398 #endif
14399
14400 STATIC void
14401 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14402 {
14403     va_list args;
14404     STRLEN l1 = strlen(pat1);
14405     STRLEN l2 = strlen(pat2);
14406     char buf[512];
14407     SV *msv;
14408     const char *message;
14409
14410     PERL_ARGS_ASSERT_RE_CROAK2;
14411
14412     if (l1 > 510)
14413         l1 = 510;
14414     if (l1 + l2 > 510)
14415         l2 = 510 - l1;
14416     Copy(pat1, buf, l1 , char);
14417     Copy(pat2, buf + l1, l2 , char);
14418     buf[l1 + l2] = '\n';
14419     buf[l1 + l2 + 1] = '\0';
14420 #ifdef I_STDARG
14421     /* ANSI variant takes additional second argument */
14422     va_start(args, pat2);
14423 #else
14424     va_start(args);
14425 #endif
14426     msv = vmess(buf, &args);
14427     va_end(args);
14428     message = SvPV_const(msv,l1);
14429     if (l1 > 512)
14430         l1 = 512;
14431     Copy(message, buf, l1 , char);
14432     buf[l1-1] = '\0';                   /* Overwrite \n */
14433     Perl_croak(aTHX_ "%s", buf);
14434 }
14435
14436 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14437
14438 #ifndef PERL_IN_XSUB_RE
14439 void
14440 Perl_save_re_context(pTHX)
14441 {
14442     dVAR;
14443
14444     struct re_save_state *state;
14445
14446     SAVEVPTR(PL_curcop);
14447     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14448
14449     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14450     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14451     SSPUSHUV(SAVEt_RE_STATE);
14452
14453     Copy(&PL_reg_state, state, 1, struct re_save_state);
14454
14455     PL_reg_oldsaved = NULL;
14456     PL_reg_oldsavedlen = 0;
14457     PL_reg_oldsavedoffset = 0;
14458     PL_reg_oldsavedcoffset = 0;
14459     PL_reg_maxiter = 0;
14460     PL_reg_leftiter = 0;
14461     PL_reg_poscache = NULL;
14462     PL_reg_poscache_size = 0;
14463 #ifdef PERL_ANY_COW
14464     PL_nrs = NULL;
14465 #endif
14466
14467     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14468     if (PL_curpm) {
14469         const REGEXP * const rx = PM_GETRE(PL_curpm);
14470         if (rx) {
14471             U32 i;
14472             for (i = 1; i <= RX_NPARENS(rx); i++) {
14473                 char digits[TYPE_CHARS(long)];
14474                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14475                 GV *const *const gvp
14476                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14477
14478                 if (gvp) {
14479                     GV * const gv = *gvp;
14480                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14481                         save_scalar(gv);
14482                 }
14483             }
14484         }
14485     }
14486 }
14487 #endif
14488
14489 #ifdef DEBUGGING
14490
14491 STATIC void
14492 S_put_byte(pTHX_ SV *sv, int c)
14493 {
14494     PERL_ARGS_ASSERT_PUT_BYTE;
14495
14496     /* Our definition of isPRINT() ignores locales, so only bytes that are
14497        not part of UTF-8 are considered printable. I assume that the same
14498        holds for UTF-EBCDIC.
14499        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14500        which Wikipedia says:
14501
14502        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14503        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14504        identical, to the ASCII delete (DEL) or rubout control character.
14505        ) So the old condition can be simplified to !isPRINT(c)  */
14506     if (!isPRINT(c)) {
14507         if (c < 256) {
14508             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14509         }
14510         else {
14511             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14512         }
14513     }
14514     else {
14515         const char string = c;
14516         if (c == '-' || c == ']' || c == '\\' || c == '^')
14517             sv_catpvs(sv, "\\");
14518         sv_catpvn(sv, &string, 1);
14519     }
14520 }
14521
14522
14523 #define CLEAR_OPTSTART \
14524     if (optstart) STMT_START { \
14525             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14526             optstart=NULL; \
14527     } STMT_END
14528
14529 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14530
14531 STATIC const regnode *
14532 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14533             const regnode *last, const regnode *plast,
14534             SV* sv, I32 indent, U32 depth)
14535 {
14536     dVAR;
14537     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14538     const regnode *next;
14539     const regnode *optstart= NULL;
14540
14541     RXi_GET_DECL(r,ri);
14542     GET_RE_DEBUG_FLAGS_DECL;
14543
14544     PERL_ARGS_ASSERT_DUMPUNTIL;
14545
14546 #ifdef DEBUG_DUMPUNTIL
14547     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14548         last ? last-start : 0,plast ? plast-start : 0);
14549 #endif
14550
14551     if (plast && plast < last)
14552         last= plast;
14553
14554     while (PL_regkind[op] != END && (!last || node < last)) {
14555         /* While that wasn't END last time... */
14556         NODE_ALIGN(node);
14557         op = OP(node);
14558         if (op == CLOSE || op == WHILEM)
14559             indent--;
14560         next = regnext((regnode *)node);
14561
14562         /* Where, what. */
14563         if (OP(node) == OPTIMIZED) {
14564             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14565                 optstart = node;
14566             else
14567                 goto after_print;
14568         } else
14569             CLEAR_OPTSTART;
14570
14571         regprop(r, sv, node);
14572         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14573                       (int)(2*indent + 1), "", SvPVX_const(sv));
14574
14575         if (OP(node) != OPTIMIZED) {
14576             if (next == NULL)           /* Next ptr. */
14577                 PerlIO_printf(Perl_debug_log, " (0)");
14578             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14579                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14580             else
14581                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14582             (void)PerlIO_putc(Perl_debug_log, '\n');
14583         }
14584
14585       after_print:
14586         if (PL_regkind[(U8)op] == BRANCHJ) {
14587             assert(next);
14588             {
14589                 const regnode *nnode = (OP(next) == LONGJMP
14590                                        ? regnext((regnode *)next)
14591                                        : next);
14592                 if (last && nnode > last)
14593                     nnode = last;
14594                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14595             }
14596         }
14597         else if (PL_regkind[(U8)op] == BRANCH) {
14598             assert(next);
14599             DUMPUNTIL(NEXTOPER(node), next);
14600         }
14601         else if ( PL_regkind[(U8)op]  == TRIE ) {
14602             const regnode *this_trie = node;
14603             const char op = OP(node);
14604             const U32 n = ARG(node);
14605             const reg_ac_data * const ac = op>=AHOCORASICK ?
14606                (reg_ac_data *)ri->data->data[n] :
14607                NULL;
14608             const reg_trie_data * const trie =
14609                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14610 #ifdef DEBUGGING
14611             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14612 #endif
14613             const regnode *nextbranch= NULL;
14614             I32 word_idx;
14615             sv_setpvs(sv, "");
14616             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14617                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14618
14619                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14620                    (int)(2*(indent+3)), "",
14621                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14622                             PL_colors[0], PL_colors[1],
14623                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14624                             PERL_PV_PRETTY_ELLIPSES    |
14625                             PERL_PV_PRETTY_LTGT
14626                             )
14627                             : "???"
14628                 );
14629                 if (trie->jump) {
14630                     U16 dist= trie->jump[word_idx+1];
14631                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14632                                   (UV)((dist ? this_trie + dist : next) - start));
14633                     if (dist) {
14634                         if (!nextbranch)
14635                             nextbranch= this_trie + trie->jump[0];
14636                         DUMPUNTIL(this_trie + dist, nextbranch);
14637                     }
14638                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14639                         nextbranch= regnext((regnode *)nextbranch);
14640                 } else {
14641                     PerlIO_printf(Perl_debug_log, "\n");
14642                 }
14643             }
14644             if (last && next > last)
14645                 node= last;
14646             else
14647                 node= next;
14648         }
14649         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14650             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14651                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14652         }
14653         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14654             assert(next);
14655             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14656         }
14657         else if ( op == PLUS || op == STAR) {
14658             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14659         }
14660         else if (PL_regkind[(U8)op] == ANYOF) {
14661             /* arglen 1 + class block */
14662             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14663                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14664             node = NEXTOPER(node);
14665         }
14666         else if (PL_regkind[(U8)op] == EXACT) {
14667             /* Literal string, where present. */
14668             node += NODE_SZ_STR(node) - 1;
14669             node = NEXTOPER(node);
14670         }
14671         else {
14672             node = NEXTOPER(node);
14673             node += regarglen[(U8)op];
14674         }
14675         if (op == CURLYX || op == OPEN)
14676             indent++;
14677     }
14678     CLEAR_OPTSTART;
14679 #ifdef DEBUG_DUMPUNTIL
14680     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14681 #endif
14682     return node;
14683 }
14684
14685 #endif  /* DEBUGGING */
14686
14687 /*
14688  * Local variables:
14689  * c-indentation-style: bsd
14690  * c-basic-offset: 4
14691  * indent-tabs-mode: nil
14692  * End:
14693  *
14694  * ex: set ts=8 sts=4 sw=4 et:
14695  */