regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 235 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 236
 237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 238
 239 /* whether trie related optimizations are enabled */
 240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 241 #define TRIE_STUDY_OPT
 242 #define FULL_TRIE_STUDY
 243 #define TRIE_STCLASS
 244 #endif
 245
 246
 247
 248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 249 #define PBITVAL(paren) (1 << ((paren) & 7))
 250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 253
 254 /* If not already in utf8, do a longjmp back to the beginning */
 255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 256 #define REQUIRE_UTF8    STMT_START {                                       \
 257                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 258                         } STMT_END
 259
 260 /* This converts the named class defined in regcomp.h to its equivalent class
 261  * number defined in handy.h. */
 262 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 263 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 264
 265 /* About scan_data_t.
 266
 267   During optimisation we recurse through the regexp program performing
 268   various inplace (keyhole style) optimisations. In addition study_chunk
 269   and scan_commit populate this data structure with information about
 270   what strings MUST appear in the pattern. We look for the longest
 271   string that must appear at a fixed location, and we look for the
 272   longest string that may appear at a floating location. So for instance
 273   in the pattern:
 274
 275     /FOO[xX]A.*B[xX]BAR/
 276
 277   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 278   strings (because they follow a .* construct). study_chunk will identify
 279   both FOO and BAR as being the longest fixed and floating strings respectively.
 280
 281   The strings can be composites, for instance
 282
 283      /(f)(o)(o)/
 284
 285   will result in a composite fixed substring 'foo'.
 286
 287   For each string some basic information is maintained:
 288
 289   - offset or min_offset
 290     This is the position the string must appear at, or not before.
 291     It also implicitly (when combined with minlenp) tells us how many
 292     characters must match before the string we are searching for.
 293     Likewise when combined with minlenp and the length of the string it
 294     tells us how many characters must appear after the string we have
 295     found.
 296
 297   - max_offset
 298     Only used for floating strings. This is the rightmost point that
 299     the string can appear at. If set to I32 max it indicates that the
 300     string can occur infinitely far to the right.
 301
 302   - minlenp
 303     A pointer to the minimum number of characters of the pattern that the
 304     string was found inside. This is important as in the case of positive
 305     lookahead or positive lookbehind we can have multiple patterns
 306     involved. Consider
 307
 308     /(?=FOO).*F/
 309
 310     The minimum length of the pattern overall is 3, the minimum length
 311     of the lookahead part is 3, but the minimum length of the part that
 312     will actually match is 1. So 'FOO's minimum length is 3, but the
 313     minimum length for the F is 1. This is important as the minimum length
 314     is used to determine offsets in front of and behind the string being
 315     looked for.  Since strings can be composites this is the length of the
 316     pattern at the time it was committed with a scan_commit. Note that
 317     the length is calculated by study_chunk, so that the minimum lengths
 318     are not known until the full pattern has been compiled, thus the
 319     pointer to the value.
 320
 321   - lookbehind
 322
 323     In the case of lookbehind the string being searched for can be
 324     offset past the start point of the final matching string.
 325     If this value was just blithely removed from the min_offset it would
 326     invalidate some of the calculations for how many chars must match
 327     before or after (as they are derived from min_offset and minlen and
 328     the length of the string being searched for).
 329     When the final pattern is compiled and the data is moved from the
 330     scan_data_t structure into the regexp structure the information
 331     about lookbehind is factored in, with the information that would
 332     have been lost precalculated in the end_shift field for the
 333     associated string.
 334
 335   The fields pos_min and pos_delta are used to store the minimum offset
 336   and the delta to the maximum offset at the current point in the pattern.
 337
 338 */
 339
 340 typedef struct scan_data_t {
 341     /*I32 len_min;      unused */
 342     /*I32 len_delta;    unused */
 343     I32 pos_min;
 344     I32 pos_delta;
 345     SV *last_found;
 346     I32 last_end;           /* min value, <0 unless valid. */
 347     I32 last_start_min;
 348     I32 last_start_max;
 349     SV **longest;           /* Either &l_fixed, or &l_float. */
 350     SV *longest_fixed;      /* longest fixed string found in pattern */
 351     I32 offset_fixed;       /* offset where it starts */
 352     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 353     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 354     SV *longest_float;      /* longest floating string found in pattern */
 355     I32 offset_float_min;   /* earliest point in string it can appear */
 356     I32 offset_float_max;   /* latest point in string it can appear */
 357     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_float;   /* is the position of the string modified by LB */
 359     I32 flags;
 360     I32 whilem_c;
 361     I32 *last_closep;
 362     struct regnode_charclass_class *start_class;
 363 } scan_data_t;
 364
 365 /*
 366  * Forward declarations for pregcomp()'s friends.
 367  */
 368
 369 static const scan_data_t zero_scan_data =
 370   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 371
 372 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 373 #define SF_BEFORE_SEOL          0x0001
 374 #define SF_BEFORE_MEOL          0x0002
 375 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 376 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 377
 378 #ifdef NO_UNARY_PLUS
 379 #  define SF_FIX_SHIFT_EOL      (0+2)
 380 #  define SF_FL_SHIFT_EOL               (0+4)
 381 #else
 382 #  define SF_FIX_SHIFT_EOL      (+2)
 383 #  define SF_FL_SHIFT_EOL               (+4)
 384 #endif
 385
 386 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 387 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 388
 389 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 390 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 391 #define SF_IS_INF               0x0040
 392 #define SF_HAS_PAR              0x0080
 393 #define SF_IN_PAR               0x0100
 394 #define SF_HAS_EVAL             0x0200
 395 #define SCF_DO_SUBSTR           0x0400
 396 #define SCF_DO_STCLASS_AND      0x0800
 397 #define SCF_DO_STCLASS_OR       0x1000
 398 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 399 #define SCF_WHILEM_VISITED_POS  0x2000
 400
 401 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 402 #define SCF_SEEN_ACCEPT         0x8000
 403
 404 #define UTF cBOOL(RExC_utf8)
 405
 406 /* The enums for all these are ordered so things work out correctly */
 407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 414
 415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 416
 417 #define OOB_NAMEDCLASS          -1
 418
 419 /* There is no code point that is out-of-bounds, so this is problematic.  But
 420  * its only current use is to initialize a variable that is always set before
 421  * looked at. */
 422 #define OOB_UNICODE             0xDEADBEEF
 423
 424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 426
 427
 428 /* length of regex to show in messages that don't mark a position within */
 429 #define RegexLengthToShowInErrorMessages 127
 430
 431 /*
 432  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 433  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 434  * op/pragma/warn/regcomp.
 435  */
 436 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 437 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 438
 439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 443  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 444  * "...".
 445  */
 446 #define _FAIL(code) STMT_START {                                        \
 447     const char *ellipses = "";                                          \
 448     IV len = RExC_end - RExC_precomp;                                   \
 449                                                                         \
 450     if (!SIZE_ONLY)                                                     \
 451         SAVEFREESV(RExC_rx_sv);                                         \
 452     if (len > RegexLengthToShowInErrorMessages) {                       \
 453         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 454         len = RegexLengthToShowInErrorMessages - 10;                    \
 455         ellipses = "...";                                               \
 456     }                                                                   \
 457     code;                                                               \
 458 } STMT_END
 459
 460 #define FAIL(msg) _FAIL(                            \
 461     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 462             msg, (int)len, RExC_precomp, ellipses))
 463
 464 #define FAIL2(msg,arg) _FAIL(                       \
 465     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 466             arg, (int)len, RExC_precomp, ellipses))
 467
 468 /*
 469  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 470  */
 471 #define Simple_vFAIL(m) STMT_START {                                    \
 472     const IV offset = RExC_parse - RExC_precomp;                        \
 473     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 474             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 475 } STMT_END
 476
 477 /*
 478  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 479  */
 480 #define vFAIL(m) STMT_START {                           \
 481     if (!SIZE_ONLY)                                     \
 482         SAVEFREESV(RExC_rx_sv);                         \
 483     Simple_vFAIL(m);                                    \
 484 } STMT_END
 485
 486 /*
 487  * Like Simple_vFAIL(), but accepts two arguments.
 488  */
 489 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 490     const IV offset = RExC_parse - RExC_precomp;                        \
 491     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 492             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 493 } STMT_END
 494
 495 /*
 496  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 497  */
 498 #define vFAIL2(m,a1) STMT_START {                       \
 499     if (!SIZE_ONLY)                                     \
 500         SAVEFREESV(RExC_rx_sv);                         \
 501     Simple_vFAIL2(m, a1);                               \
 502 } STMT_END
 503
 504
 505 /*
 506  * Like Simple_vFAIL(), but accepts three arguments.
 507  */
 508 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 509     const IV offset = RExC_parse - RExC_precomp;                \
 510     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 512 } STMT_END
 513
 514 /*
 515  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 516  */
 517 #define vFAIL3(m,a1,a2) STMT_START {                    \
 518     if (!SIZE_ONLY)                                     \
 519         SAVEFREESV(RExC_rx_sv);                         \
 520     Simple_vFAIL3(m, a1, a2);                           \
 521 } STMT_END
 522
 523 /*
 524  * Like Simple_vFAIL(), but accepts four arguments.
 525  */
 526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 527     const IV offset = RExC_parse - RExC_precomp;                \
 528     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 529             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 530 } STMT_END
 531
 532 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 533     if (!SIZE_ONLY)                                     \
 534         SAVEFREESV(RExC_rx_sv);                         \
 535     Simple_vFAIL4(m, a1, a2, a3);                       \
 536 } STMT_END
 537
 538 /* m is not necessarily a "literal string", in this macro */
 539 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 542             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 543 } STMT_END
 544
 545 #define ckWARNreg(loc,m) STMT_START {                                   \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 549 } STMT_END
 550
 551 #define vWARN_dep(loc, m) STMT_START {                                  \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 554             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 555 } STMT_END
 556
 557 #define ckWARNdep(loc,m) STMT_START {                                   \
 558     const IV offset = loc - RExC_precomp;                               \
 559     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 560             m REPORT_LOCATION,                                          \
 561             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 562 } STMT_END
 563
 564 #define ckWARNregdep(loc,m) STMT_START {                                \
 565     const IV offset = loc - RExC_precomp;                               \
 566     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 567             m REPORT_LOCATION,                                          \
 568             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 569 } STMT_END
 570
 571 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 574             m REPORT_LOCATION,                                          \
 575             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 576 } STMT_END
 577
 578 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 579     const IV offset = loc - RExC_precomp;                               \
 580     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 581             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 582 } STMT_END
 583
 584 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 585     const IV offset = loc - RExC_precomp;                               \
 586     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 587             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 588 } STMT_END
 589
 590 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 591     const IV offset = loc - RExC_precomp;                               \
 592     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 593             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 594 } STMT_END
 595
 596 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 597     const IV offset = loc - RExC_precomp;                               \
 598     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 599             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 600 } STMT_END
 601
 602 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 603     const IV offset = loc - RExC_precomp;                               \
 604     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 605             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 606 } STMT_END
 607
 608 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 611             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 612 } STMT_END
 613
 614
 615 /* Allow for side effects in s */
 616 #define REGC(c,s) STMT_START {                  \
 617     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 618 } STMT_END
 619
 620 /* Macros for recording node offsets.   20001227 mjd@plover.com
 621  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 622  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 623  * Element 0 holds the number n.
 624  * Position is 1 indexed.
 625  */
 626 #ifndef RE_TRACK_PATTERN_OFFSETS
 627 #define Set_Node_Offset_To_R(node,byte)
 628 #define Set_Node_Offset(node,byte)
 629 #define Set_Cur_Node_Offset
 630 #define Set_Node_Length_To_R(node,len)
 631 #define Set_Node_Length(node,len)
 632 #define Set_Node_Cur_Length(node)
 633 #define Node_Offset(n)
 634 #define Node_Length(n)
 635 #define Set_Node_Offset_Length(node,offset,len)
 636 #define ProgLen(ri) ri->u.proglen
 637 #define SetProgLen(ri,x) ri->u.proglen = x
 638 #else
 639 #define ProgLen(ri) ri->u.offsets[0]
 640 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 641 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 642     if (! SIZE_ONLY) {                                                  \
 643         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 644                     __LINE__, (int)(node), (int)(byte)));               \
 645         if((node) < 0) {                                                \
 646             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 647         } else {                                                        \
 648             RExC_offsets[2*(node)-1] = (byte);                          \
 649         }                                                               \
 650     }                                                                   \
 651 } STMT_END
 652
 653 #define Set_Node_Offset(node,byte) \
 654     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 655 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 656
 657 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 658     if (! SIZE_ONLY) {                                                  \
 659         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 660                 __LINE__, (int)(node), (int)(len)));                    \
 661         if((node) < 0) {                                                \
 662             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 663         } else {                                                        \
 664             RExC_offsets[2*(node)] = (len);                             \
 665         }                                                               \
 666     }                                                                   \
 667 } STMT_END
 668
 669 #define Set_Node_Length(node,len) \
 670     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 671 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 672 #define Set_Node_Cur_Length(node) \
 673     Set_Node_Length(node, RExC_parse - parse_start)
 674
 675 /* Get offsets and lengths */
 676 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 677 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 678
 679 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 680     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 681     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 682 } STMT_END
 683 #endif
 684
 685 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 686 #define EXPERIMENTAL_INPLACESCAN
 687 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 688
 689 #define DEBUG_STUDYDATA(str,data,depth)                              \
 690 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 691     PerlIO_printf(Perl_debug_log,                                    \
 692         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 693         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 694         (int)(depth)*2, "",                                          \
 695         (IV)((data)->pos_min),                                       \
 696         (IV)((data)->pos_delta),                                     \
 697         (UV)((data)->flags),                                         \
 698         (IV)((data)->whilem_c),                                      \
 699         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 700         is_inf ? "INF " : ""                                         \
 701     );                                                               \
 702     if ((data)->last_found)                                          \
 703         PerlIO_printf(Perl_debug_log,                                \
 704             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 705             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 706             SvPVX_const((data)->last_found),                         \
 707             (IV)((data)->last_end),                                  \
 708             (IV)((data)->last_start_min),                            \
 709             (IV)((data)->last_start_max),                            \
 710             ((data)->longest &&                                      \
 711              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 712             SvPVX_const((data)->longest_fixed),                      \
 713             (IV)((data)->offset_fixed),                              \
 714             ((data)->longest &&                                      \
 715              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 716             SvPVX_const((data)->longest_float),                      \
 717             (IV)((data)->offset_float_min),                          \
 718             (IV)((data)->offset_float_max)                           \
 719         );                                                           \
 720     PerlIO_printf(Perl_debug_log,"\n");                              \
 721 });
 722
 723 /* Mark that we cannot extend a found fixed substring at this point.
 724    Update the longest found anchored substring and the longest found
 725    floating substrings if needed. */
 726
 727 STATIC void
 728 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 729 {
 730     const STRLEN l = CHR_SVLEN(data->last_found);
 731     const STRLEN old_l = CHR_SVLEN(*data->longest);
 732     GET_RE_DEBUG_FLAGS_DECL;
 733
 734     PERL_ARGS_ASSERT_SCAN_COMMIT;
 735
 736     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 737         SvSetMagicSV(*data->longest, data->last_found);
 738         if (*data->longest == data->longest_fixed) {
 739             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 740             if (data->flags & SF_BEFORE_EOL)
 741                 data->flags
 742                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 743             else
 744                 data->flags &= ~SF_FIX_BEFORE_EOL;
 745             data->minlen_fixed=minlenp;
 746             data->lookbehind_fixed=0;
 747         }
 748         else { /* *data->longest == data->longest_float */
 749             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 750             data->offset_float_max = (l
 751                                       ? data->last_start_max
 752                                       : data->pos_min + data->pos_delta);
 753             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 754                 data->offset_float_max = I32_MAX;
 755             if (data->flags & SF_BEFORE_EOL)
 756                 data->flags
 757                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 758             else
 759                 data->flags &= ~SF_FL_BEFORE_EOL;
 760             data->minlen_float=minlenp;
 761             data->lookbehind_float=0;
 762         }
 763     }
 764     SvCUR_set(data->last_found, 0);
 765     {
 766         SV * const sv = data->last_found;
 767         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 768             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 769             if (mg)
 770                 mg->mg_len = 0;
 771         }
 772     }
 773     data->last_end = -1;
 774     data->flags &= ~SF_BEFORE_EOL;
 775     DEBUG_STUDYDATA("commit: ",data,0);
 776 }
 777
 778 /* These macros set, clear and test whether the synthetic start class ('ssc',
 779  * given by the parameter) matches an empty string (EOS).  This uses the
 780  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 781  * stands alone, so there is never a next_off, so this field is otherwise
 782  * unused.  The EOS information is used only for compilation, but theoretically
 783  * it could be passed on to the execution code.  This could be used to store
 784  * more than one bit of information, but only this one is currently used. */
 785 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 786 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 787 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 788
 789 /* Can match anything (initialization) */
 790 STATIC void
 791 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 792 {
 793     PERL_ARGS_ASSERT_CL_ANYTHING;
 794
 795     ANYOF_BITMAP_SETALL(cl);
 796     cl->flags = ANYOF_UNICODE_ALL;
 797     SET_SSC_EOS(cl);
 798
 799     /* If any portion of the regex is to operate under locale rules,
 800      * initialization includes it.  The reason this isn't done for all regexes
 801      * is that the optimizer was written under the assumption that locale was
 802      * all-or-nothing.  Given the complexity and lack of documentation in the
 803      * optimizer, and that there are inadequate test cases for locale, so many
 804      * parts of it may not work properly, it is safest to avoid locale unless
 805      * necessary. */
 806     if (RExC_contains_locale) {
 807         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 808         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 809     }
 810     else {
 811         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 812     }
 813 }
 814
 815 /* Can match anything (initialization) */
 816 STATIC int
 817 S_cl_is_anything(const struct regnode_charclass_class *cl)
 818 {
 819     int value;
 820
 821     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 822
 823     for (value = 0; value < ANYOF_MAX; value += 2)
 824         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 825             return 1;
 826     if (!(cl->flags & ANYOF_UNICODE_ALL))
 827         return 0;
 828     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 829         return 0;
 830     return 1;
 831 }
 832
 833 /* Can match anything (initialization) */
 834 STATIC void
 835 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 836 {
 837     PERL_ARGS_ASSERT_CL_INIT;
 838
 839     Zero(cl, 1, struct regnode_charclass_class);
 840     cl->type = ANYOF;
 841     cl_anything(pRExC_state, cl);
 842     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 843 }
 844
 845 /* These two functions currently do the exact same thing */
 846 #define cl_init_zero            S_cl_init
 847
 848 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 849  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 850  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 851 STATIC void
 852 S_cl_and(struct regnode_charclass_class *cl,
 853         const struct regnode_charclass_class *and_with)
 854 {
 855     PERL_ARGS_ASSERT_CL_AND;
 856
 857     assert(PL_regkind[and_with->type] == ANYOF);
 858
 859     /* I (khw) am not sure all these restrictions are necessary XXX */
 860     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 861         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 862         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 863         && !(and_with->flags & ANYOF_LOC_FOLD)
 864         && !(cl->flags & ANYOF_LOC_FOLD)) {
 865         int i;
 866
 867         if (and_with->flags & ANYOF_INVERT)
 868             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 869                 cl->bitmap[i] &= ~and_with->bitmap[i];
 870         else
 871             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 872                 cl->bitmap[i] &= and_with->bitmap[i];
 873     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 874
 875     if (and_with->flags & ANYOF_INVERT) {
 876
 877         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 878          * aren't affected by the inversion.  Those that are affected are
 879          * handled individually below */
 880         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 881         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 882         cl->flags |= affected_flags;
 883
 884         /* We currently don't know how to deal with things that aren't in the
 885          * bitmap, but we know that the intersection is no greater than what
 886          * is already in cl, so let there be false positives that get sorted
 887          * out after the synthetic start class succeeds, and the node is
 888          * matched for real. */
 889
 890         /* The inversion of these two flags indicate that the resulting
 891          * intersection doesn't have them */
 892         if (and_with->flags & ANYOF_UNICODE_ALL) {
 893             cl->flags &= ~ANYOF_UNICODE_ALL;
 894         }
 895         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 896             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 897         }
 898     }
 899     else {   /* and'd node is not inverted */
 900         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 901
 902         if (! ANYOF_NONBITMAP(and_with)) {
 903
 904             /* Here 'and_with' doesn't match anything outside the bitmap
 905              * (except possibly ANYOF_UNICODE_ALL), which means the
 906              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 907              * which case we don't know what the intersection is, but it's no
 908              * greater than what cl already has, so can just leave it alone,
 909              * with possible false positives */
 910             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 911                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 912                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 913             }
 914         }
 915         else if (! ANYOF_NONBITMAP(cl)) {
 916
 917             /* Here, 'and_with' does match something outside the bitmap, and cl
 918              * doesn't have a list of things to match outside the bitmap.  If
 919              * cl can match all code points above 255, the intersection will
 920              * be those above-255 code points that 'and_with' matches.  If cl
 921              * can't match all Unicode code points, it means that it can't
 922              * match anything outside the bitmap (since the 'if' that got us
 923              * into this block tested for that), so we leave the bitmap empty.
 924              */
 925             if (cl->flags & ANYOF_UNICODE_ALL) {
 926                 ARG_SET(cl, ARG(and_with));
 927
 928                 /* and_with's ARG may match things that don't require UTF8.
 929                  * And now cl's will too, in spite of this being an 'and'.  See
 930                  * the comments below about the kludge */
 931                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 932             }
 933         }
 934         else {
 935             /* Here, both 'and_with' and cl match something outside the
 936              * bitmap.  Currently we do not do the intersection, so just match
 937              * whatever cl had at the beginning.  */
 938         }
 939
 940
 941         /* Take the intersection of the two sets of flags.  However, the
 942          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 943          * kludge around the fact that this flag is not treated like the others
 944          * which are initialized in cl_anything().  The way the optimizer works
 945          * is that the synthetic start class (SSC) is initialized to match
 946          * anything, and then the first time a real node is encountered, its
 947          * values are AND'd with the SSC's with the result being the values of
 948          * the real node.  However, there are paths through the optimizer where
 949          * the AND never gets called, so those initialized bits are set
 950          * inappropriately, which is not usually a big deal, as they just cause
 951          * false positives in the SSC, which will just mean a probably
 952          * imperceptible slow down in execution.  However this bit has a
 953          * higher false positive consequence in that it can cause utf8.pm,
 954          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 955          * bigger slowdown and also causes significant extra memory to be used.
 956          * In order to prevent this, the code now takes a different tack.  The
 957          * bit isn't set unless some part of the regular expression needs it,
 958          * but once set it won't get cleared.  This means that these extra
 959          * modules won't get loaded unless there was some path through the
 960          * pattern that would have required them anyway, and  so any false
 961          * positives that occur by not ANDing them out when they could be
 962          * aren't as severe as they would be if we treated this bit like all
 963          * the others */
 964         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 965                                       & ANYOF_NONBITMAP_NON_UTF8;
 966         cl->flags &= and_with->flags;
 967         cl->flags |= outside_bitmap_but_not_utf8;
 968     }
 969 }
 970
 971 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 972  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 973  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 974 STATIC void
 975 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 976 {
 977     PERL_ARGS_ASSERT_CL_OR;
 978
 979     if (or_with->flags & ANYOF_INVERT) {
 980
 981         /* Here, the or'd node is to be inverted.  This means we take the
 982          * complement of everything not in the bitmap, but currently we don't
 983          * know what that is, so give up and match anything */
 984         if (ANYOF_NONBITMAP(or_with)) {
 985             cl_anything(pRExC_state, cl);
 986         }
 987         /* We do not use
 988          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 989          *   <= (B1 | !B2) | (CL1 | !CL2)
 990          * which is wasteful if CL2 is small, but we ignore CL2:
 991          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 992          * XXXX Can we handle case-fold?  Unclear:
 993          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 994          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 995          */
 996         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 997              && !(or_with->flags & ANYOF_LOC_FOLD)
 998              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 999             int i;
1000
1001             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1002                 cl->bitmap[i] |= ~or_with->bitmap[i];
1003         } /* XXXX: logic is complicated otherwise */
1004         else {
1005             cl_anything(pRExC_state, cl);
1006         }
1007
1008         /* And, we can just take the union of the flags that aren't affected
1009          * by the inversion */
1010         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1011
1012         /* For the remaining flags:
1013             ANYOF_UNICODE_ALL and inverted means to not match anything above
1014                     255, which means that the union with cl should just be
1015                     what cl has in it, so can ignore this flag
1016             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1017                     is 127-255 to match them, but then invert that, so the
1018                     union with cl should just be what cl has in it, so can
1019                     ignore this flag
1020          */
1021     } else {    /* 'or_with' is not inverted */
1022         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1023         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024              && (!(or_with->flags & ANYOF_LOC_FOLD)
1025                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1026             int i;
1027
1028             /* OR char bitmap and class bitmap separately */
1029             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1030                 cl->bitmap[i] |= or_with->bitmap[i];
1031             ANYOF_CLASS_OR(or_with, cl);
1032         }
1033         else { /* XXXX: logic is complicated, leave it along for a moment. */
1034             cl_anything(pRExC_state, cl);
1035         }
1036
1037         if (ANYOF_NONBITMAP(or_with)) {
1038
1039             /* Use the added node's outside-the-bit-map match if there isn't a
1040              * conflict.  If there is a conflict (both nodes match something
1041              * outside the bitmap, but what they match outside is not the same
1042              * pointer, and hence not easily compared until XXX we extend
1043              * inversion lists this far), give up and allow the start class to
1044              * match everything outside the bitmap.  If that stuff is all above
1045              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1046             if (! ANYOF_NONBITMAP(cl)) {
1047                 ARG_SET(cl, ARG(or_with));
1048             }
1049             else if (ARG(cl) != ARG(or_with)) {
1050
1051                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1052                     cl_anything(pRExC_state, cl);
1053                 }
1054                 else {
1055                     cl->flags |= ANYOF_UNICODE_ALL;
1056                 }
1057             }
1058         }
1059
1060         /* Take the union */
1061         cl->flags |= or_with->flags;
1062     }
1063 }
1064
1065 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1066 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1067 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1068 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1069
1070
1071 #ifdef DEBUGGING
1072 /*
1073    dump_trie(trie,widecharmap,revcharmap)
1074    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1075    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1076
1077    These routines dump out a trie in a somewhat readable format.
1078    The _interim_ variants are used for debugging the interim
1079    tables that are used to generate the final compressed
1080    representation which is what dump_trie expects.
1081
1082    Part of the reason for their existence is to provide a form
1083    of documentation as to how the different representations function.
1084
1085 */
1086
1087 /*
1088   Dumps the final compressed table form of the trie to Perl_debug_log.
1089   Used for debugging make_trie().
1090 */
1091
1092 STATIC void
1093 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1094             AV *revcharmap, U32 depth)
1095 {
1096     U32 state;
1097     SV *sv=sv_newmortal();
1098     int colwidth= widecharmap ? 6 : 4;
1099     U16 word;
1100     GET_RE_DEBUG_FLAGS_DECL;
1101
1102     PERL_ARGS_ASSERT_DUMP_TRIE;
1103
1104     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1105         (int)depth * 2 + 2,"",
1106         "Match","Base","Ofs" );
1107
1108     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1109         SV ** const tmp = av_fetch( revcharmap, state, 0);
1110         if ( tmp ) {
1111             PerlIO_printf( Perl_debug_log, "%*s",
1112                 colwidth,
1113                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1114                             PL_colors[0], PL_colors[1],
1115                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1116                             PERL_PV_ESCAPE_FIRSTCHAR
1117                 )
1118             );
1119         }
1120     }
1121     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1122         (int)depth * 2 + 2,"");
1123
1124     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1125         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1126     PerlIO_printf( Perl_debug_log, "\n");
1127
1128     for( state = 1 ; state < trie->statecount ; state++ ) {
1129         const U32 base = trie->states[ state ].trans.base;
1130
1131         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1132
1133         if ( trie->states[ state ].wordnum ) {
1134             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1135         } else {
1136             PerlIO_printf( Perl_debug_log, "%6s", "" );
1137         }
1138
1139         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1140
1141         if ( base ) {
1142             U32 ofs = 0;
1143
1144             while( ( base + ofs  < trie->uniquecharcount ) ||
1145                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1146                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1147                     ofs++;
1148
1149             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1150
1151             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1152                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1153                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1154                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1155                 {
1156                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1157                     colwidth,
1158                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1159                 } else {
1160                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1161                 }
1162             }
1163
1164             PerlIO_printf( Perl_debug_log, "]");
1165
1166         }
1167         PerlIO_printf( Perl_debug_log, "\n" );
1168     }
1169     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1170     for (word=1; word <= trie->wordcount; word++) {
1171         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1172             (int)word, (int)(trie->wordinfo[word].prev),
1173             (int)(trie->wordinfo[word].len));
1174     }
1175     PerlIO_printf(Perl_debug_log, "\n" );
1176 }
1177 /*
1178   Dumps a fully constructed but uncompressed trie in list form.
1179   List tries normally only are used for construction when the number of
1180   possible chars (trie->uniquecharcount) is very high.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1185                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                          U32 depth)
1187 {
1188     U32 state;
1189     SV *sv=sv_newmortal();
1190     int colwidth= widecharmap ? 6 : 4;
1191     GET_RE_DEBUG_FLAGS_DECL;
1192
1193     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1194
1195     /* print out the table precompression.  */
1196     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1197         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1198         "------:-----+-----------------\n" );
1199
1200     for( state=1 ; state < next_alloc ; state ++ ) {
1201         U16 charid;
1202
1203         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1204             (int)depth * 2 + 2,"", (UV)state  );
1205         if ( ! trie->states[ state ].wordnum ) {
1206             PerlIO_printf( Perl_debug_log, "%5s| ","");
1207         } else {
1208             PerlIO_printf( Perl_debug_log, "W%4x| ",
1209                 trie->states[ state ].wordnum
1210             );
1211         }
1212         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1213             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1214             if ( tmp ) {
1215                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1216                     colwidth,
1217                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1218                             PL_colors[0], PL_colors[1],
1219                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1220                             PERL_PV_ESCAPE_FIRSTCHAR
1221                     ) ,
1222                     TRIE_LIST_ITEM(state,charid).forid,
1223                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1224                 );
1225                 if (!(charid % 10))
1226                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1227                         (int)((depth * 2) + 14), "");
1228             }
1229         }
1230         PerlIO_printf( Perl_debug_log, "\n");
1231     }
1232 }
1233
1234 /*
1235   Dumps a fully constructed but uncompressed trie in table form.
1236   This is the normal DFA style state transition table, with a few
1237   twists to facilitate compression later.
1238   Used for debugging make_trie().
1239 */
1240 STATIC void
1241 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1242                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1243                           U32 depth)
1244 {
1245     U32 state;
1246     U16 charid;
1247     SV *sv=sv_newmortal();
1248     int colwidth= widecharmap ? 6 : 4;
1249     GET_RE_DEBUG_FLAGS_DECL;
1250
1251     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1252
1253     /*
1254        print out the table precompression so that we can do a visual check
1255        that they are identical.
1256      */
1257
1258     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1259
1260     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1261         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1262         if ( tmp ) {
1263             PerlIO_printf( Perl_debug_log, "%*s",
1264                 colwidth,
1265                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1266                             PL_colors[0], PL_colors[1],
1267                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1268                             PERL_PV_ESCAPE_FIRSTCHAR
1269                 )
1270             );
1271         }
1272     }
1273
1274     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1275
1276     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1277         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1278     }
1279
1280     PerlIO_printf( Perl_debug_log, "\n" );
1281
1282     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1283
1284         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1285             (int)depth * 2 + 2,"",
1286             (UV)TRIE_NODENUM( state ) );
1287
1288         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1289             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1290             if (v)
1291                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1292             else
1293                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1294         }
1295         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1296             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1297         } else {
1298             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1299             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1300         }
1301     }
1302 }
1303
1304 #endif
1305
1306
1307 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1308   startbranch: the first branch in the whole branch sequence
1309   first      : start branch of sequence of branch-exact nodes.
1310                May be the same as startbranch
1311   last       : Thing following the last branch.
1312                May be the same as tail.
1313   tail       : item following the branch sequence
1314   count      : words in the sequence
1315   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1316   depth      : indent depth
1317
1318 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1319
1320 A trie is an N'ary tree where the branches are determined by digital
1321 decomposition of the key. IE, at the root node you look up the 1st character and
1322 follow that branch repeat until you find the end of the branches. Nodes can be
1323 marked as "accepting" meaning they represent a complete word. Eg:
1324
1325   /he|she|his|hers/
1326
1327 would convert into the following structure. Numbers represent states, letters
1328 following numbers represent valid transitions on the letter from that state, if
1329 the number is in square brackets it represents an accepting state, otherwise it
1330 will be in parenthesis.
1331
1332       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1333       |    |
1334       |   (2)
1335       |    |
1336      (1)   +-i->(6)-+-s->[7]
1337       |
1338       +-s->(3)-+-h->(4)-+-e->[5]
1339
1340       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1341
1342 This shows that when matching against the string 'hers' we will begin at state 1
1343 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1344 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1345 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1346 single traverse. We store a mapping from accepting to state to which word was
1347 matched, and then when we have multiple possibilities we try to complete the
1348 rest of the regex in the order in which they occured in the alternation.
1349
1350 The only prior NFA like behaviour that would be changed by the TRIE support is
1351 the silent ignoring of duplicate alternations which are of the form:
1352
1353  / (DUPE|DUPE) X? (?{ ... }) Y /x
1354
1355 Thus EVAL blocks following a trie may be called a different number of times with
1356 and without the optimisation. With the optimisations dupes will be silently
1357 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1358 the following demonstrates:
1359
1360  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1361
1362 which prints out 'word' three times, but
1363
1364  'words'=~/(word|word|word)(?{ print $1 })S/
1365
1366 which doesnt print it out at all. This is due to other optimisations kicking in.
1367
1368 Example of what happens on a structural level:
1369
1370 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1371
1372    1: CURLYM[1] {1,32767}(18)
1373    5:   BRANCH(8)
1374    6:     EXACT <ac>(16)
1375    8:   BRANCH(11)
1376    9:     EXACT <ad>(16)
1377   11:   BRANCH(14)
1378   12:     EXACT <ab>(16)
1379   16:   SUCCEED(0)
1380   17:   NOTHING(18)
1381   18: END(0)
1382
1383 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1384 and should turn into:
1385
1386    1: CURLYM[1] {1,32767}(18)
1387    5:   TRIE(16)
1388         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1389           <ac>
1390           <ad>
1391           <ab>
1392   16:   SUCCEED(0)
1393   17:   NOTHING(18)
1394   18: END(0)
1395
1396 Cases where tail != last would be like /(?foo|bar)baz/:
1397
1398    1: BRANCH(4)
1399    2:   EXACT <foo>(8)
1400    4: BRANCH(7)
1401    5:   EXACT <bar>(8)
1402    7: TAIL(8)
1403    8: EXACT <baz>(10)
1404   10: END(0)
1405
1406 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1407 and would end up looking like:
1408
1409     1: TRIE(8)
1410       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1411         <foo>
1412         <bar>
1413    7: TAIL(8)
1414    8: EXACT <baz>(10)
1415   10: END(0)
1416
1417     d = uvuni_to_utf8_flags(d, uv, 0);
1418
1419 is the recommended Unicode-aware way of saying
1420
1421     *(d++) = uv;
1422 */
1423
1424 #define TRIE_STORE_REVCHAR(val)                                            \
1425     STMT_START {                                                           \
1426         if (UTF) {                                                         \
1427             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1428             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1429             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1430             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1431             SvPOK_on(zlopp);                                               \
1432             SvUTF8_on(zlopp);                                              \
1433             av_push(revcharmap, zlopp);                                    \
1434         } else {                                                           \
1435             char ooooff = (char)val;                                           \
1436             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1437         }                                                                  \
1438         } STMT_END
1439
1440 #define TRIE_READ_CHAR STMT_START {                                                     \
1441     wordlen++;                                                                          \
1442     if ( UTF ) {                                                                        \
1443         /* if it is UTF then it is either already folded, or does not need folding */   \
1444         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1445     }                                                                                   \
1446     else if (folder == PL_fold_latin1) {                                                \
1447         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1448         if ( foldlen > 0 ) {                                                            \
1449            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1450            foldlen -= len;                                                              \
1451            scan += len;                                                                 \
1452            len = 0;                                                                     \
1453         } else {                                                                        \
1454             len = 1;                                                                    \
1455             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1456             skiplen = UNISKIP(uvc);                                                     \
1457             foldlen -= skiplen;                                                         \
1458             scan = foldbuf + skiplen;                                                   \
1459         }                                                                               \
1460     } else {                                                                            \
1461         /* raw data, will be folded later if needed */                                  \
1462         uvc = (U32)*uc;                                                                 \
1463         len = 1;                                                                        \
1464     }                                                                                   \
1465 } STMT_END
1466
1467
1468
1469 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1470     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1471         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1472         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1473     }                                                           \
1474     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1475     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1476     TRIE_LIST_CUR( state )++;                                   \
1477 } STMT_END
1478
1479 #define TRIE_LIST_NEW(state) STMT_START {                       \
1480     Newxz( trie->states[ state ].trans.list,               \
1481         4, reg_trie_trans_le );                                 \
1482      TRIE_LIST_CUR( state ) = 1;                                \
1483      TRIE_LIST_LEN( state ) = 4;                                \
1484 } STMT_END
1485
1486 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1487     U16 dupe= trie->states[ state ].wordnum;                    \
1488     regnode * const noper_next = regnext( noper );              \
1489                                                                 \
1490     DEBUG_r({                                                   \
1491         /* store the word for dumping */                        \
1492         SV* tmp;                                                \
1493         if (OP(noper) != NOTHING)                               \
1494             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1495         else                                                    \
1496             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1497         av_push( trie_words, tmp );                             \
1498     });                                                         \
1499                                                                 \
1500     curword++;                                                  \
1501     trie->wordinfo[curword].prev   = 0;                         \
1502     trie->wordinfo[curword].len    = wordlen;                   \
1503     trie->wordinfo[curword].accept = state;                     \
1504                                                                 \
1505     if ( noper_next < tail ) {                                  \
1506         if (!trie->jump)                                        \
1507             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1508         trie->jump[curword] = (U16)(noper_next - convert);      \
1509         if (!jumper)                                            \
1510             jumper = noper_next;                                \
1511         if (!nextbranch)                                        \
1512             nextbranch= regnext(cur);                           \
1513     }                                                           \
1514                                                                 \
1515     if ( dupe ) {                                               \
1516         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1517         /* chain, so that when the bits of chain are later    */\
1518         /* linked together, the dups appear in the chain      */\
1519         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1520         trie->wordinfo[dupe].prev = curword;                    \
1521     } else {                                                    \
1522         /* we haven't inserted this word yet.                */ \
1523         trie->states[ state ].wordnum = curword;                \
1524     }                                                           \
1525 } STMT_END
1526
1527
1528 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1529      ( ( base + charid >=  ucharcount                                   \
1530          && base + charid < ubound                                      \
1531          && state == trie->trans[ base - ucharcount + charid ].check    \
1532          && trie->trans[ base - ucharcount + charid ].next )            \
1533            ? trie->trans[ base - ucharcount + charid ].next             \
1534            : ( state==1 ? special : 0 )                                 \
1535       )
1536
1537 #define MADE_TRIE       1
1538 #define MADE_JUMP_TRIE  2
1539 #define MADE_EXACT_TRIE 4
1540
1541 STATIC I32
1542 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1543 {
1544     dVAR;
1545     /* first pass, loop through and scan words */
1546     reg_trie_data *trie;
1547     HV *widecharmap = NULL;
1548     AV *revcharmap = newAV();
1549     regnode *cur;
1550     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1551     STRLEN len = 0;
1552     UV uvc = 0;
1553     U16 curword = 0;
1554     U32 next_alloc = 0;
1555     regnode *jumper = NULL;
1556     regnode *nextbranch = NULL;
1557     regnode *convert = NULL;
1558     U32 *prev_states; /* temp array mapping each state to previous one */
1559     /* we just use folder as a flag in utf8 */
1560     const U8 * folder = NULL;
1561
1562 #ifdef DEBUGGING
1563     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1564     AV *trie_words = NULL;
1565     /* along with revcharmap, this only used during construction but both are
1566      * useful during debugging so we store them in the struct when debugging.
1567      */
1568 #else
1569     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1570     STRLEN trie_charcount=0;
1571 #endif
1572     SV *re_trie_maxbuff;
1573     GET_RE_DEBUG_FLAGS_DECL;
1574
1575     PERL_ARGS_ASSERT_MAKE_TRIE;
1576 #ifndef DEBUGGING
1577     PERL_UNUSED_ARG(depth);
1578 #endif
1579
1580     switch (flags) {
1581         case EXACT: break;
1582         case EXACTFA:
1583         case EXACTFU_SS:
1584         case EXACTFU_TRICKYFOLD:
1585         case EXACTFU: folder = PL_fold_latin1; break;
1586         case EXACTF:  folder = PL_fold; break;
1587         case EXACTFL: folder = PL_fold_locale; break;
1588         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1589     }
1590
1591     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1592     trie->refcount = 1;
1593     trie->startstate = 1;
1594     trie->wordcount = word_count;
1595     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1596     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1597     if (flags == EXACT)
1598         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1599     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1600                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1601
1602     DEBUG_r({
1603         trie_words = newAV();
1604     });
1605
1606     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1607     if (!SvIOK(re_trie_maxbuff)) {
1608         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1609     }
1610     DEBUG_TRIE_COMPILE_r({
1611                 PerlIO_printf( Perl_debug_log,
1612                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1613                   (int)depth * 2 + 2, "",
1614                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1615                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1616                   (int)depth);
1617     });
1618
1619    /* Find the node we are going to overwrite */
1620     if ( first == startbranch && OP( last ) != BRANCH ) {
1621         /* whole branch chain */
1622         convert = first;
1623     } else {
1624         /* branch sub-chain */
1625         convert = NEXTOPER( first );
1626     }
1627
1628     /*  -- First loop and Setup --
1629
1630        We first traverse the branches and scan each word to determine if it
1631        contains widechars, and how many unique chars there are, this is
1632        important as we have to build a table with at least as many columns as we
1633        have unique chars.
1634
1635        We use an array of integers to represent the character codes 0..255
1636        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1637        native representation of the character value as the key and IV's for the
1638        coded index.
1639
1640        *TODO* If we keep track of how many times each character is used we can
1641        remap the columns so that the table compression later on is more
1642        efficient in terms of memory by ensuring the most common value is in the
1643        middle and the least common are on the outside.  IMO this would be better
1644        than a most to least common mapping as theres a decent chance the most
1645        common letter will share a node with the least common, meaning the node
1646        will not be compressible. With a middle is most common approach the worst
1647        case is when we have the least common nodes twice.
1648
1649      */
1650
1651     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1652         regnode *noper = NEXTOPER( cur );
1653         const U8 *uc = (U8*)STRING( noper );
1654         const U8 *e  = uc + STR_LEN( noper );
1655         STRLEN foldlen = 0;
1656         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1657         STRLEN skiplen = 0;
1658         const U8 *scan = (U8*)NULL;
1659         U32 wordlen      = 0;         /* required init */
1660         STRLEN chars = 0;
1661         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1662
1663         if (OP(noper) == NOTHING) {
1664             regnode *noper_next= regnext(noper);
1665             if (noper_next != tail && OP(noper_next) == flags) {
1666                 noper = noper_next;
1667                 uc= (U8*)STRING(noper);
1668                 e= uc + STR_LEN(noper);
1669                 trie->minlen= STR_LEN(noper);
1670             } else {
1671                 trie->minlen= 0;
1672                 continue;
1673             }
1674         }
1675
1676         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1677             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1678                                           regardless of encoding */
1679             if (OP( noper ) == EXACTFU_SS) {
1680                 /* false positives are ok, so just set this */
1681                 TRIE_BITMAP_SET(trie,0xDF);
1682             }
1683         }
1684         for ( ; uc < e ; uc += len ) {
1685             TRIE_CHARCOUNT(trie)++;
1686             TRIE_READ_CHAR;
1687             chars++;
1688             if ( uvc < 256 ) {
1689                 if ( folder ) {
1690                     U8 folded= folder[ (U8) uvc ];
1691                     if ( !trie->charmap[ folded ] ) {
1692                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1693                         TRIE_STORE_REVCHAR( folded );
1694                     }
1695                 }
1696                 if ( !trie->charmap[ uvc ] ) {
1697                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1698                     TRIE_STORE_REVCHAR( uvc );
1699                 }
1700                 if ( set_bit ) {
1701                     /* store the codepoint in the bitmap, and its folded
1702                      * equivalent. */
1703                     TRIE_BITMAP_SET(trie, uvc);
1704
1705                     /* store the folded codepoint */
1706                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1707
1708                     if ( !UTF ) {
1709                         /* store first byte of utf8 representation of
1710                            variant codepoints */
1711                         if (! UNI_IS_INVARIANT(uvc)) {
1712                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1713                         }
1714                     }
1715                     set_bit = 0; /* We've done our bit :-) */
1716                 }
1717             } else {
1718                 SV** svpp;
1719                 if ( !widecharmap )
1720                     widecharmap = newHV();
1721
1722                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1723
1724                 if ( !svpp )
1725                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1726
1727                 if ( !SvTRUE( *svpp ) ) {
1728                     sv_setiv( *svpp, ++trie->uniquecharcount );
1729                     TRIE_STORE_REVCHAR(uvc);
1730                 }
1731             }
1732         }
1733         if( cur == first ) {
1734             trie->minlen = chars;
1735             trie->maxlen = chars;
1736         } else if (chars < trie->minlen) {
1737             trie->minlen = chars;
1738         } else if (chars > trie->maxlen) {
1739             trie->maxlen = chars;
1740         }
1741         if (OP( noper ) == EXACTFU_SS) {
1742             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1743             if (trie->minlen > 1)
1744                 trie->minlen= 1;
1745         }
1746         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1747             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1748              *                - We assume that any such sequence might match a 2 byte string */
1749             if (trie->minlen > 2 )
1750                 trie->minlen= 2;
1751         }
1752
1753     } /* end first pass */
1754     DEBUG_TRIE_COMPILE_r(
1755         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1756                 (int)depth * 2 + 2,"",
1757                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1758                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1759                 (int)trie->minlen, (int)trie->maxlen )
1760     );
1761
1762     /*
1763         We now know what we are dealing with in terms of unique chars and
1764         string sizes so we can calculate how much memory a naive
1765         representation using a flat table  will take. If it's over a reasonable
1766         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1767         conservative but potentially much slower representation using an array
1768         of lists.
1769
1770         At the end we convert both representations into the same compressed
1771         form that will be used in regexec.c for matching with. The latter
1772         is a form that cannot be used to construct with but has memory
1773         properties similar to the list form and access properties similar
1774         to the table form making it both suitable for fast searches and
1775         small enough that its feasable to store for the duration of a program.
1776
1777         See the comment in the code where the compressed table is produced
1778         inplace from the flat tabe representation for an explanation of how
1779         the compression works.
1780
1781     */
1782
1783
1784     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1785     prev_states[1] = 0;
1786
1787     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1788         /*
1789             Second Pass -- Array Of Lists Representation
1790
1791             Each state will be represented by a list of charid:state records
1792             (reg_trie_trans_le) the first such element holds the CUR and LEN
1793             points of the allocated array. (See defines above).
1794
1795             We build the initial structure using the lists, and then convert
1796             it into the compressed table form which allows faster lookups
1797             (but cant be modified once converted).
1798         */
1799
1800         STRLEN transcount = 1;
1801
1802         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1803             "%*sCompiling trie using list compiler\n",
1804             (int)depth * 2 + 2, ""));
1805
1806         trie->states = (reg_trie_state *)
1807             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1808                                   sizeof(reg_trie_state) );
1809         TRIE_LIST_NEW(1);
1810         next_alloc = 2;
1811
1812         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1813
1814             regnode *noper   = NEXTOPER( cur );
1815             U8 *uc           = (U8*)STRING( noper );
1816             const U8 *e      = uc + STR_LEN( noper );
1817             U32 state        = 1;         /* required init */
1818             U16 charid       = 0;         /* sanity init */
1819             U8 *scan         = (U8*)NULL; /* sanity init */
1820             STRLEN foldlen   = 0;         /* required init */
1821             U32 wordlen      = 0;         /* required init */
1822             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1823             STRLEN skiplen   = 0;
1824
1825             if (OP(noper) == NOTHING) {
1826                 regnode *noper_next= regnext(noper);
1827                 if (noper_next != tail && OP(noper_next) == flags) {
1828                     noper = noper_next;
1829                     uc= (U8*)STRING(noper);
1830                     e= uc + STR_LEN(noper);
1831                 }
1832             }
1833
1834             if (OP(noper) != NOTHING) {
1835                 for ( ; uc < e ; uc += len ) {
1836
1837                     TRIE_READ_CHAR;
1838
1839                     if ( uvc < 256 ) {
1840                         charid = trie->charmap[ uvc ];
1841                     } else {
1842                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1843                         if ( !svpp ) {
1844                             charid = 0;
1845                         } else {
1846                             charid=(U16)SvIV( *svpp );
1847                         }
1848                     }
1849                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1850                     if ( charid ) {
1851
1852                         U16 check;
1853                         U32 newstate = 0;
1854
1855                         charid--;
1856                         if ( !trie->states[ state ].trans.list ) {
1857                             TRIE_LIST_NEW( state );
1858                         }
1859                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1860                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1861                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1862                                 break;
1863                             }
1864                         }
1865                         if ( ! newstate ) {
1866                             newstate = next_alloc++;
1867                             prev_states[newstate] = state;
1868                             TRIE_LIST_PUSH( state, charid, newstate );
1869                             transcount++;
1870                         }
1871                         state = newstate;
1872                     } else {
1873                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1874                     }
1875                 }
1876             }
1877             TRIE_HANDLE_WORD(state);
1878
1879         } /* end second pass */
1880
1881         /* next alloc is the NEXT state to be allocated */
1882         trie->statecount = next_alloc;
1883         trie->states = (reg_trie_state *)
1884             PerlMemShared_realloc( trie->states,
1885                                    next_alloc
1886                                    * sizeof(reg_trie_state) );
1887
1888         /* and now dump it out before we compress it */
1889         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1890                                                          revcharmap, next_alloc,
1891                                                          depth+1)
1892         );
1893
1894         trie->trans = (reg_trie_trans *)
1895             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1896         {
1897             U32 state;
1898             U32 tp = 0;
1899             U32 zp = 0;
1900
1901
1902             for( state=1 ; state < next_alloc ; state ++ ) {
1903                 U32 base=0;
1904
1905                 /*
1906                 DEBUG_TRIE_COMPILE_MORE_r(
1907                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1908                 );
1909                 */
1910
1911                 if (trie->states[state].trans.list) {
1912                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1913                     U16 maxid=minid;
1914                     U16 idx;
1915
1916                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1917                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1918                         if ( forid < minid ) {
1919                             minid=forid;
1920                         } else if ( forid > maxid ) {
1921                             maxid=forid;
1922                         }
1923                     }
1924                     if ( transcount < tp + maxid - minid + 1) {
1925                         transcount *= 2;
1926                         trie->trans = (reg_trie_trans *)
1927                             PerlMemShared_realloc( trie->trans,
1928                                                      transcount
1929                                                      * sizeof(reg_trie_trans) );
1930                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1931                     }
1932                     base = trie->uniquecharcount + tp - minid;
1933                     if ( maxid == minid ) {
1934                         U32 set = 0;
1935                         for ( ; zp < tp ; zp++ ) {
1936                             if ( ! trie->trans[ zp ].next ) {
1937                                 base = trie->uniquecharcount + zp - minid;
1938                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1939                                 trie->trans[ zp ].check = state;
1940                                 set = 1;
1941                                 break;
1942                             }
1943                         }
1944                         if ( !set ) {
1945                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946                             trie->trans[ tp ].check = state;
1947                             tp++;
1948                             zp = tp;
1949                         }
1950                     } else {
1951                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1952                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1953                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1954                             trie->trans[ tid ].check = state;
1955                         }
1956                         tp += ( maxid - minid + 1 );
1957                     }
1958                     Safefree(trie->states[ state ].trans.list);
1959                 }
1960                 /*
1961                 DEBUG_TRIE_COMPILE_MORE_r(
1962                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1963                 );
1964                 */
1965                 trie->states[ state ].trans.base=base;
1966             }
1967             trie->lasttrans = tp + 1;
1968         }
1969     } else {
1970         /*
1971            Second Pass -- Flat Table Representation.
1972
1973            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1974            We know that we will need Charcount+1 trans at most to store the data
1975            (one row per char at worst case) So we preallocate both structures
1976            assuming worst case.
1977
1978            We then construct the trie using only the .next slots of the entry
1979            structs.
1980
1981            We use the .check field of the first entry of the node temporarily to
1982            make compression both faster and easier by keeping track of how many non
1983            zero fields are in the node.
1984
1985            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1986            transition.
1987
1988            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1989            number representing the first entry of the node, and state as a
1990            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1991            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1992            are 2 entrys per node. eg:
1993
1994              A B       A B
1995           1. 2 4    1. 3 7
1996           2. 0 3    3. 0 5
1997           3. 0 0    5. 0 0
1998           4. 0 0    7. 0 0
1999
2000            The table is internally in the right hand, idx form. However as we also
2001            have to deal with the states array which is indexed by nodenum we have to
2002            use TRIE_NODENUM() to convert.
2003
2004         */
2005         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2006             "%*sCompiling trie using table compiler\n",
2007             (int)depth * 2 + 2, ""));
2008
2009         trie->trans = (reg_trie_trans *)
2010             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2011                                   * trie->uniquecharcount + 1,
2012                                   sizeof(reg_trie_trans) );
2013         trie->states = (reg_trie_state *)
2014             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2015                                   sizeof(reg_trie_state) );
2016         next_alloc = trie->uniquecharcount + 1;
2017
2018
2019         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2020
2021             regnode *noper   = NEXTOPER( cur );
2022             const U8 *uc     = (U8*)STRING( noper );
2023             const U8 *e      = uc + STR_LEN( noper );
2024
2025             U32 state        = 1;         /* required init */
2026
2027             U16 charid       = 0;         /* sanity init */
2028             U32 accept_state = 0;         /* sanity init */
2029             U8 *scan         = (U8*)NULL; /* sanity init */
2030
2031             STRLEN foldlen   = 0;         /* required init */
2032             U32 wordlen      = 0;         /* required init */
2033             STRLEN skiplen   = 0;
2034             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2035
2036             if (OP(noper) == NOTHING) {
2037                 regnode *noper_next= regnext(noper);
2038                 if (noper_next != tail && OP(noper_next) == flags) {
2039                     noper = noper_next;
2040                     uc= (U8*)STRING(noper);
2041                     e= uc + STR_LEN(noper);
2042                 }
2043             }
2044
2045             if ( OP(noper) != NOTHING ) {
2046                 for ( ; uc < e ; uc += len ) {
2047
2048                     TRIE_READ_CHAR;
2049
2050                     if ( uvc < 256 ) {
2051                         charid = trie->charmap[ uvc ];
2052                     } else {
2053                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2054                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2055                     }
2056                     if ( charid ) {
2057                         charid--;
2058                         if ( !trie->trans[ state + charid ].next ) {
2059                             trie->trans[ state + charid ].next = next_alloc;
2060                             trie->trans[ state ].check++;
2061                             prev_states[TRIE_NODENUM(next_alloc)]
2062                                     = TRIE_NODENUM(state);
2063                             next_alloc += trie->uniquecharcount;
2064                         }
2065                         state = trie->trans[ state + charid ].next;
2066                     } else {
2067                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2068                     }
2069                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2070                 }
2071             }
2072             accept_state = TRIE_NODENUM( state );
2073             TRIE_HANDLE_WORD(accept_state);
2074
2075         } /* end second pass */
2076
2077         /* and now dump it out before we compress it */
2078         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2079                                                           revcharmap,
2080                                                           next_alloc, depth+1));
2081
2082         {
2083         /*
2084            * Inplace compress the table.*
2085
2086            For sparse data sets the table constructed by the trie algorithm will
2087            be mostly 0/FAIL transitions or to put it another way mostly empty.
2088            (Note that leaf nodes will not contain any transitions.)
2089
2090            This algorithm compresses the tables by eliminating most such
2091            transitions, at the cost of a modest bit of extra work during lookup:
2092
2093            - Each states[] entry contains a .base field which indicates the
2094            index in the state[] array wheres its transition data is stored.
2095
2096            - If .base is 0 there are no valid transitions from that node.
2097
2098            - If .base is nonzero then charid is added to it to find an entry in
2099            the trans array.
2100
2101            -If trans[states[state].base+charid].check!=state then the
2102            transition is taken to be a 0/Fail transition. Thus if there are fail
2103            transitions at the front of the node then the .base offset will point
2104            somewhere inside the previous nodes data (or maybe even into a node
2105            even earlier), but the .check field determines if the transition is
2106            valid.
2107
2108            XXX - wrong maybe?
2109            The following process inplace converts the table to the compressed
2110            table: We first do not compress the root node 1,and mark all its
2111            .check pointers as 1 and set its .base pointer as 1 as well. This
2112            allows us to do a DFA construction from the compressed table later,
2113            and ensures that any .base pointers we calculate later are greater
2114            than 0.
2115
2116            - We set 'pos' to indicate the first entry of the second node.
2117
2118            - We then iterate over the columns of the node, finding the first and
2119            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2120            and set the .check pointers accordingly, and advance pos
2121            appropriately and repreat for the next node. Note that when we copy
2122            the next pointers we have to convert them from the original
2123            NODEIDX form to NODENUM form as the former is not valid post
2124            compression.
2125
2126            - If a node has no transitions used we mark its base as 0 and do not
2127            advance the pos pointer.
2128
2129            - If a node only has one transition we use a second pointer into the
2130            structure to fill in allocated fail transitions from other states.
2131            This pointer is independent of the main pointer and scans forward
2132            looking for null transitions that are allocated to a state. When it
2133            finds one it writes the single transition into the "hole".  If the
2134            pointer doesnt find one the single transition is appended as normal.
2135
2136            - Once compressed we can Renew/realloc the structures to release the
2137            excess space.
2138
2139            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2140            specifically Fig 3.47 and the associated pseudocode.
2141
2142            demq
2143         */
2144         const U32 laststate = TRIE_NODENUM( next_alloc );
2145         U32 state, charid;
2146         U32 pos = 0, zp=0;
2147         trie->statecount = laststate;
2148
2149         for ( state = 1 ; state < laststate ; state++ ) {
2150             U8 flag = 0;
2151             const U32 stateidx = TRIE_NODEIDX( state );
2152             const U32 o_used = trie->trans[ stateidx ].check;
2153             U32 used = trie->trans[ stateidx ].check;
2154             trie->trans[ stateidx ].check = 0;
2155
2156             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2157                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2158                     if ( trie->trans[ stateidx + charid ].next ) {
2159                         if (o_used == 1) {
2160                             for ( ; zp < pos ; zp++ ) {
2161                                 if ( ! trie->trans[ zp ].next ) {
2162                                     break;
2163                                 }
2164                             }
2165                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2166                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2167                             trie->trans[ zp ].check = state;
2168                             if ( ++zp > pos ) pos = zp;
2169                             break;
2170                         }
2171                         used--;
2172                     }
2173                     if ( !flag ) {
2174                         flag = 1;
2175                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2176                     }
2177                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2178                     trie->trans[ pos ].check = state;
2179                     pos++;
2180                 }
2181             }
2182         }
2183         trie->lasttrans = pos + 1;
2184         trie->states = (reg_trie_state *)
2185             PerlMemShared_realloc( trie->states, laststate
2186                                    * sizeof(reg_trie_state) );
2187         DEBUG_TRIE_COMPILE_MORE_r(
2188                 PerlIO_printf( Perl_debug_log,
2189                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2190                     (int)depth * 2 + 2,"",
2191                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2192                     (IV)next_alloc,
2193                     (IV)pos,
2194                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2195             );
2196
2197         } /* end table compress */
2198     }
2199     DEBUG_TRIE_COMPILE_MORE_r(
2200             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2201                 (int)depth * 2 + 2, "",
2202                 (UV)trie->statecount,
2203                 (UV)trie->lasttrans)
2204     );
2205     /* resize the trans array to remove unused space */
2206     trie->trans = (reg_trie_trans *)
2207         PerlMemShared_realloc( trie->trans, trie->lasttrans
2208                                * sizeof(reg_trie_trans) );
2209
2210     {   /* Modify the program and insert the new TRIE node */
2211         U8 nodetype =(U8)(flags & 0xFF);
2212         char *str=NULL;
2213
2214 #ifdef DEBUGGING
2215         regnode *optimize = NULL;
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2217
2218         U32 mjd_offset = 0;
2219         U32 mjd_nodelen = 0;
2220 #endif /* RE_TRACK_PATTERN_OFFSETS */
2221 #endif /* DEBUGGING */
2222         /*
2223            This means we convert either the first branch or the first Exact,
2224            depending on whether the thing following (in 'last') is a branch
2225            or not and whther first is the startbranch (ie is it a sub part of
2226            the alternation or is it the whole thing.)
2227            Assuming its a sub part we convert the EXACT otherwise we convert
2228            the whole branch sequence, including the first.
2229          */
2230         /* Find the node we are going to overwrite */
2231         if ( first != startbranch || OP( last ) == BRANCH ) {
2232             /* branch sub-chain */
2233             NEXT_OFF( first ) = (U16)(last - first);
2234 #ifdef RE_TRACK_PATTERN_OFFSETS
2235             DEBUG_r({
2236                 mjd_offset= Node_Offset((convert));
2237                 mjd_nodelen= Node_Length((convert));
2238             });
2239 #endif
2240             /* whole branch chain */
2241         }
2242 #ifdef RE_TRACK_PATTERN_OFFSETS
2243         else {
2244             DEBUG_r({
2245                 const  regnode *nop = NEXTOPER( convert );
2246                 mjd_offset= Node_Offset((nop));
2247                 mjd_nodelen= Node_Length((nop));
2248             });
2249         }
2250         DEBUG_OPTIMISE_r(
2251             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2252                 (int)depth * 2 + 2, "",
2253                 (UV)mjd_offset, (UV)mjd_nodelen)
2254         );
2255 #endif
2256         /* But first we check to see if there is a common prefix we can
2257            split out as an EXACT and put in front of the TRIE node.  */
2258         trie->startstate= 1;
2259         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2260             U32 state;
2261             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2262                 U32 ofs = 0;
2263                 I32 idx = -1;
2264                 U32 count = 0;
2265                 const U32 base = trie->states[ state ].trans.base;
2266
2267                 if ( trie->states[state].wordnum )
2268                         count = 1;
2269
2270                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2271                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2272                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2273                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2274                     {
2275                         if ( ++count > 1 ) {
2276                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2277                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2278                             if ( state == 1 ) break;
2279                             if ( count == 2 ) {
2280                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2281                                 DEBUG_OPTIMISE_r(
2282                                     PerlIO_printf(Perl_debug_log,
2283                                         "%*sNew Start State=%"UVuf" Class: [",
2284                                         (int)depth * 2 + 2, "",
2285                                         (UV)state));
2286                                 if (idx >= 0) {
2287                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2288                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2289
2290                                     TRIE_BITMAP_SET(trie,*ch);
2291                                     if ( folder )
2292                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2293                                     DEBUG_OPTIMISE_r(
2294                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2295                                     );
2296                                 }
2297                             }
2298                             TRIE_BITMAP_SET(trie,*ch);
2299                             if ( folder )
2300                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2301                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2302                         }
2303                         idx = ofs;
2304                     }
2305                 }
2306                 if ( count == 1 ) {
2307                     SV **tmp = av_fetch( revcharmap, idx, 0);
2308                     STRLEN len;
2309                     char *ch = SvPV( *tmp, len );
2310                     DEBUG_OPTIMISE_r({
2311                         SV *sv=sv_newmortal();
2312                         PerlIO_printf( Perl_debug_log,
2313                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2314                             (int)depth * 2 + 2, "",
2315                             (UV)state, (UV)idx,
2316                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2317                                 PL_colors[0], PL_colors[1],
2318                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2319                                 PERL_PV_ESCAPE_FIRSTCHAR
2320                             )
2321                         );
2322                     });
2323                     if ( state==1 ) {
2324                         OP( convert ) = nodetype;
2325                         str=STRING(convert);
2326                         STR_LEN(convert)=0;
2327                     }
2328                     STR_LEN(convert) += len;
2329                     while (len--)
2330                         *str++ = *ch++;
2331                 } else {
2332 #ifdef DEBUGGING
2333                     if (state>1)
2334                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2335 #endif
2336                     break;
2337                 }
2338             }
2339             trie->prefixlen = (state-1);
2340             if (str) {
2341                 regnode *n = convert+NODE_SZ_STR(convert);
2342                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2343                 trie->startstate = state;
2344                 trie->minlen -= (state - 1);
2345                 trie->maxlen -= (state - 1);
2346 #ifdef DEBUGGING
2347                /* At least the UNICOS C compiler choked on this
2348                 * being argument to DEBUG_r(), so let's just have
2349                 * it right here. */
2350                if (
2351 #ifdef PERL_EXT_RE_BUILD
2352                    1
2353 #else
2354                    DEBUG_r_TEST
2355 #endif
2356                    ) {
2357                    regnode *fix = convert;
2358                    U32 word = trie->wordcount;
2359                    mjd_nodelen++;
2360                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2361                    while( ++fix < n ) {
2362                        Set_Node_Offset_Length(fix, 0, 0);
2363                    }
2364                    while (word--) {
2365                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2366                        if (tmp) {
2367                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2368                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2369                            else
2370                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2371                        }
2372                    }
2373                }
2374 #endif
2375                 if (trie->maxlen) {
2376                     convert = n;
2377                 } else {
2378                     NEXT_OFF(convert) = (U16)(tail - convert);
2379                     DEBUG_r(optimize= n);
2380                 }
2381             }
2382         }
2383         if (!jumper)
2384             jumper = last;
2385         if ( trie->maxlen ) {
2386             NEXT_OFF( convert ) = (U16)(tail - convert);
2387             ARG_SET( convert, data_slot );
2388             /* Store the offset to the first unabsorbed branch in
2389                jump[0], which is otherwise unused by the jump logic.
2390                We use this when dumping a trie and during optimisation. */
2391             if (trie->jump)
2392                 trie->jump[0] = (U16)(nextbranch - convert);
2393
2394             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2395              *   and there is a bitmap
2396              *   and the first "jump target" node we found leaves enough room
2397              * then convert the TRIE node into a TRIEC node, with the bitmap
2398              * embedded inline in the opcode - this is hypothetically faster.
2399              */
2400             if ( !trie->states[trie->startstate].wordnum
2401                  && trie->bitmap
2402                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2403             {
2404                 OP( convert ) = TRIEC;
2405                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2406                 PerlMemShared_free(trie->bitmap);
2407                 trie->bitmap= NULL;
2408             } else
2409                 OP( convert ) = TRIE;
2410
2411             /* store the type in the flags */
2412             convert->flags = nodetype;
2413             DEBUG_r({
2414             optimize = convert
2415                       + NODE_STEP_REGNODE
2416                       + regarglen[ OP( convert ) ];
2417             });
2418             /* XXX We really should free up the resource in trie now,
2419                    as we won't use them - (which resources?) dmq */
2420         }
2421         /* needed for dumping*/
2422         DEBUG_r(if (optimize) {
2423             regnode *opt = convert;
2424
2425             while ( ++opt < optimize) {
2426                 Set_Node_Offset_Length(opt,0,0);
2427             }
2428             /*
2429                 Try to clean up some of the debris left after the
2430                 optimisation.
2431              */
2432             while( optimize < jumper ) {
2433                 mjd_nodelen += Node_Length((optimize));
2434                 OP( optimize ) = OPTIMIZED;
2435                 Set_Node_Offset_Length(optimize,0,0);
2436                 optimize++;
2437             }
2438             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2439         });
2440     } /* end node insert */
2441
2442     /*  Finish populating the prev field of the wordinfo array.  Walk back
2443      *  from each accept state until we find another accept state, and if
2444      *  so, point the first word's .prev field at the second word. If the
2445      *  second already has a .prev field set, stop now. This will be the
2446      *  case either if we've already processed that word's accept state,
2447      *  or that state had multiple words, and the overspill words were
2448      *  already linked up earlier.
2449      */
2450     {
2451         U16 word;
2452         U32 state;
2453         U16 prev;
2454
2455         for (word=1; word <= trie->wordcount; word++) {
2456             prev = 0;
2457             if (trie->wordinfo[word].prev)
2458                 continue;
2459             state = trie->wordinfo[word].accept;
2460             while (state) {
2461                 state = prev_states[state];
2462                 if (!state)
2463                     break;
2464                 prev = trie->states[state].wordnum;
2465                 if (prev)
2466                     break;
2467             }
2468             trie->wordinfo[word].prev = prev;
2469         }
2470         Safefree(prev_states);
2471     }
2472
2473
2474     /* and now dump out the compressed format */
2475     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2476
2477     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2478 #ifdef DEBUGGING
2479     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2480     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2481 #else
2482     SvREFCNT_dec_NN(revcharmap);
2483 #endif
2484     return trie->jump
2485            ? MADE_JUMP_TRIE
2486            : trie->startstate>1
2487              ? MADE_EXACT_TRIE
2488              : MADE_TRIE;
2489 }
2490
2491 STATIC void
2492 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2493 {
2494 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2495
2496    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2497    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2498    ISBN 0-201-10088-6
2499
2500    We find the fail state for each state in the trie, this state is the longest proper
2501    suffix of the current state's 'word' that is also a proper prefix of another word in our
2502    trie. State 1 represents the word '' and is thus the default fail state. This allows
2503    the DFA not to have to restart after its tried and failed a word at a given point, it
2504    simply continues as though it had been matching the other word in the first place.
2505    Consider
2506       'abcdgu'=~/abcdefg|cdgu/
2507    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2508    fail, which would bring us to the state representing 'd' in the second word where we would
2509    try 'g' and succeed, proceeding to match 'cdgu'.
2510  */
2511  /* add a fail transition */
2512     const U32 trie_offset = ARG(source);
2513     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2514     U32 *q;
2515     const U32 ucharcount = trie->uniquecharcount;
2516     const U32 numstates = trie->statecount;
2517     const U32 ubound = trie->lasttrans + ucharcount;
2518     U32 q_read = 0;
2519     U32 q_write = 0;
2520     U32 charid;
2521     U32 base = trie->states[ 1 ].trans.base;
2522     U32 *fail;
2523     reg_ac_data *aho;
2524     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2525     GET_RE_DEBUG_FLAGS_DECL;
2526
2527     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2528 #ifndef DEBUGGING
2529     PERL_UNUSED_ARG(depth);
2530 #endif
2531
2532
2533     ARG_SET( stclass, data_slot );
2534     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2535     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2536     aho->trie=trie_offset;
2537     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2538     Copy( trie->states, aho->states, numstates, reg_trie_state );
2539     Newxz( q, numstates, U32);
2540     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2541     aho->refcount = 1;
2542     fail = aho->fail;
2543     /* initialize fail[0..1] to be 1 so that we always have
2544        a valid final fail state */
2545     fail[ 0 ] = fail[ 1 ] = 1;
2546
2547     for ( charid = 0; charid < ucharcount ; charid++ ) {
2548         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2549         if ( newstate ) {
2550             q[ q_write ] = newstate;
2551             /* set to point at the root */
2552             fail[ q[ q_write++ ] ]=1;
2553         }
2554     }
2555     while ( q_read < q_write) {
2556         const U32 cur = q[ q_read++ % numstates ];
2557         base = trie->states[ cur ].trans.base;
2558
2559         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2560             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2561             if (ch_state) {
2562                 U32 fail_state = cur;
2563                 U32 fail_base;
2564                 do {
2565                     fail_state = fail[ fail_state ];
2566                     fail_base = aho->states[ fail_state ].trans.base;
2567                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2568
2569                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2570                 fail[ ch_state ] = fail_state;
2571                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2572                 {
2573                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2574                 }
2575                 q[ q_write++ % numstates] = ch_state;
2576             }
2577         }
2578     }
2579     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2580        when we fail in state 1, this allows us to use the
2581        charclass scan to find a valid start char. This is based on the principle
2582        that theres a good chance the string being searched contains lots of stuff
2583        that cant be a start char.
2584      */
2585     fail[ 0 ] = fail[ 1 ] = 0;
2586     DEBUG_TRIE_COMPILE_r({
2587         PerlIO_printf(Perl_debug_log,
2588                       "%*sStclass Failtable (%"UVuf" states): 0",
2589                       (int)(depth * 2), "", (UV)numstates
2590         );
2591         for( q_read=1; q_read<numstates; q_read++ ) {
2592             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2593         }
2594         PerlIO_printf(Perl_debug_log, "\n");
2595     });
2596     Safefree(q);
2597     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2598 }
2599
2600
2601 /*
2602  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2603  * These need to be revisited when a newer toolchain becomes available.
2604  */
2605 #if defined(__sparc64__) && defined(__GNUC__)
2606 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2607 #       undef  SPARC64_GCC_WORKAROUND
2608 #       define SPARC64_GCC_WORKAROUND 1
2609 #   endif
2610 #endif
2611
2612 #define DEBUG_PEEP(str,scan,depth) \
2613     DEBUG_OPTIMISE_r({if (scan){ \
2614        SV * const mysv=sv_newmortal(); \
2615        regnode *Next = regnext(scan); \
2616        regprop(RExC_rx, mysv, scan); \
2617        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2618        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2619        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2620    }});
2621
2622
2623 /* The below joins as many adjacent EXACTish nodes as possible into a single
2624  * one.  The regop may be changed if the node(s) contain certain sequences that
2625  * require special handling.  The joining is only done if:
2626  * 1) there is room in the current conglomerated node to entirely contain the
2627  *    next one.
2628  * 2) they are the exact same node type
2629  *
2630  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2631  * these get optimized out
2632  *
2633  * If a node is to match under /i (folded), the number of characters it matches
2634  * can be different than its character length if it contains a multi-character
2635  * fold.  *min_subtract is set to the total delta of the input nodes.
2636  *
2637  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2638  * and contains LATIN SMALL LETTER SHARP S
2639  *
2640  * This is as good a place as any to discuss the design of handling these
2641  * multi-character fold sequences.  It's been wrong in Perl for a very long
2642  * time.  There are three code points in Unicode whose multi-character folds
2643  * were long ago discovered to mess things up.  The previous designs for
2644  * dealing with these involved assigning a special node for them.  This
2645  * approach doesn't work, as evidenced by this example:
2646  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2647  * Both these fold to "sss", but if the pattern is parsed to create a node that
2648  * would match just the \xDF, it won't be able to handle the case where a
2649  * successful match would have to cross the node's boundary.  The new approach
2650  * that hopefully generally solves the problem generates an EXACTFU_SS node
2651  * that is "sss".
2652  *
2653  * It turns out that there are problems with all multi-character folds, and not
2654  * just these three.  Now the code is general, for all such cases, but the
2655  * three still have some special handling.  The approach taken is:
2656  * 1)   This routine examines each EXACTFish node that could contain multi-
2657  *      character fold sequences.  It returns in *min_subtract how much to
2658  *      subtract from the the actual length of the string to get a real minimum
2659  *      match length; it is 0 if there are no multi-char folds.  This delta is
2660  *      used by the caller to adjust the min length of the match, and the delta
2661  *      between min and max, so that the optimizer doesn't reject these
2662  *      possibilities based on size constraints.
2663  * 2)   Certain of these sequences require special handling by the trie code,
2664  *      so, if found, this code changes the joined node type to special ops:
2665  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2666  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2667  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2668  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2669  *      there is a possible fold length change.  That means that a regular
2670  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2671  *      with length changes, and so can be processed faster.  regexec.c takes
2672  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2673  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2674  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2675  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2676  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2677  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2678  *      possibilities for the non-UTF8 patterns are quite simple, except for
2679  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2680  *      members of a fold-pair, and arrays are set up for all of them so that
2681  *      the other member of the pair can be found quickly.  Code elsewhere in
2682  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2683  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2684  *      described in the next item.
2685  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2686  *      'ss' or not is not knowable at compile time.  It will match iff the
2687  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2688  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2689  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2690  *      described in item 3).  An assumption that the optimizer part of
2691  *      regexec.c (probably unwittingly) makes is that a character in the
2692  *      pattern corresponds to at most a single character in the target string.
2693  *      (And I do mean character, and not byte here, unlike other parts of the
2694  *      documentation that have never been updated to account for multibyte
2695  *      Unicode.)  This assumption is wrong only in this case, as all other
2696  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2697  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2698  *      reluctant to try to change this assumption, so instead the code punts.
2699  *      This routine examines EXACTF nodes for the sharp s, and returns a
2700  *      boolean indicating whether or not the node is an EXACTF node that
2701  *      contains a sharp s.  When it is true, the caller sets a flag that later
2702  *      causes the optimizer in this file to not set values for the floating
2703  *      and fixed string lengths, and thus avoids the optimizer code in
2704  *      regexec.c that makes the invalid assumption.  Thus, there is no
2705  *      optimization based on string lengths for EXACTF nodes that contain the
2706  *      sharp s.  This only happens for /id rules (which means the pattern
2707  *      isn't in UTF-8).
2708  */
2709
2710 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2711     if (PL_regkind[OP(scan)] == EXACT) \
2712         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2713
2714 STATIC U32
2715 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2716     /* Merge several consecutive EXACTish nodes into one. */
2717     regnode *n = regnext(scan);
2718     U32 stringok = 1;
2719     regnode *next = scan + NODE_SZ_STR(scan);
2720     U32 merged = 0;
2721     U32 stopnow = 0;
2722 #ifdef DEBUGGING
2723     regnode *stop = scan;
2724     GET_RE_DEBUG_FLAGS_DECL;
2725 #else
2726     PERL_UNUSED_ARG(depth);
2727 #endif
2728
2729     PERL_ARGS_ASSERT_JOIN_EXACT;
2730 #ifndef EXPERIMENTAL_INPLACESCAN
2731     PERL_UNUSED_ARG(flags);
2732     PERL_UNUSED_ARG(val);
2733 #endif
2734     DEBUG_PEEP("join",scan,depth);
2735
2736     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2737      * EXACT ones that are mergeable to the current one. */
2738     while (n
2739            && (PL_regkind[OP(n)] == NOTHING
2740                || (stringok && OP(n) == OP(scan)))
2741            && NEXT_OFF(n)
2742            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2743     {
2744
2745         if (OP(n) == TAIL || n > next)
2746             stringok = 0;
2747         if (PL_regkind[OP(n)] == NOTHING) {
2748             DEBUG_PEEP("skip:",n,depth);
2749             NEXT_OFF(scan) += NEXT_OFF(n);
2750             next = n + NODE_STEP_REGNODE;
2751 #ifdef DEBUGGING
2752             if (stringok)
2753                 stop = n;
2754 #endif
2755             n = regnext(n);
2756         }
2757         else if (stringok) {
2758             const unsigned int oldl = STR_LEN(scan);
2759             regnode * const nnext = regnext(n);
2760
2761             /* XXX I (khw) kind of doubt that this works on platforms where
2762              * U8_MAX is above 255 because of lots of other assumptions */
2763             /* Don't join if the sum can't fit into a single node */
2764             if (oldl + STR_LEN(n) > U8_MAX)
2765                 break;
2766
2767             DEBUG_PEEP("merg",n,depth);
2768             merged++;
2769
2770             NEXT_OFF(scan) += NEXT_OFF(n);
2771             STR_LEN(scan) += STR_LEN(n);
2772             next = n + NODE_SZ_STR(n);
2773             /* Now we can overwrite *n : */
2774             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2775 #ifdef DEBUGGING
2776             stop = next - 1;
2777 #endif
2778             n = nnext;
2779             if (stopnow) break;
2780         }
2781
2782 #ifdef EXPERIMENTAL_INPLACESCAN
2783         if (flags && !NEXT_OFF(n)) {
2784             DEBUG_PEEP("atch", val, depth);
2785             if (reg_off_by_arg[OP(n)]) {
2786                 ARG_SET(n, val - n);
2787             }
2788             else {
2789                 NEXT_OFF(n) = val - n;
2790             }
2791             stopnow = 1;
2792         }
2793 #endif
2794     }
2795
2796     *min_subtract = 0;
2797     *has_exactf_sharp_s = FALSE;
2798
2799     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2800      * can now analyze for sequences of problematic code points.  (Prior to
2801      * this final joining, sequences could have been split over boundaries, and
2802      * hence missed).  The sequences only happen in folding, hence for any
2803      * non-EXACT EXACTish node */
2804     if (OP(scan) != EXACT) {
2805         const U8 * const s0 = (U8*) STRING(scan);
2806         const U8 * s = s0;
2807         const U8 * const s_end = s0 + STR_LEN(scan);
2808
2809         /* One pass is made over the node's string looking for all the
2810          * possibilities.  to avoid some tests in the loop, there are two main
2811          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2812          * non-UTF-8 */
2813         if (UTF) {
2814
2815             /* Examine the string for a multi-character fold sequence.  UTF-8
2816              * patterns have all characters pre-folded by the time this code is
2817              * executed */
2818             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2819                                      length sequence we are looking for is 2 */
2820             {
2821                 int count = 0;
2822                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2823                 if (! len) {    /* Not a multi-char fold: get next char */
2824                     s += UTF8SKIP(s);
2825                     continue;
2826                 }
2827
2828                 /* Nodes with 'ss' require special handling, except for EXACTFL
2829                  * and EXACTFA for which there is no multi-char fold to this */
2830                 if (len == 2 && *s == 's' && *(s+1) == 's'
2831                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2832                 {
2833                     count = 2;
2834                     OP(scan) = EXACTFU_SS;
2835                     s += 2;
2836                 }
2837                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2838                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2839                                       COMBINING_DIAERESIS_UTF8
2840                                       COMBINING_ACUTE_ACCENT_UTF8,
2841                                    6)
2842                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2843                                          COMBINING_DIAERESIS_UTF8
2844                                          COMBINING_ACUTE_ACCENT_UTF8,
2845                                      6)))
2846                 {
2847                     count = 3;
2848
2849                     /* These two folds require special handling by trie's, so
2850                      * change the node type to indicate this.  If EXACTFA and
2851                      * EXACTFL were ever to be handled by trie's, this would
2852                      * have to be changed.  If this node has already been
2853                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2854                      * (khw) think it doesn't matter in regexec.c for UTF
2855                      * patterns, but no need to change it */
2856                     if (OP(scan) == EXACTFU) {
2857                         OP(scan) = EXACTFU_TRICKYFOLD;
2858                     }
2859                     s += 6;
2860                 }
2861                 else { /* Here is a generic multi-char fold. */
2862                     const U8* multi_end  = s + len;
2863
2864                     /* Count how many characters in it.  In the case of /l and
2865                      * /aa, no folds which contain ASCII code points are
2866                      * allowed, so check for those, and skip if found.  (In
2867                      * EXACTFL, no folds are allowed to any Latin1 code point,
2868                      * not just ASCII.  But there aren't any of these
2869                      * currently, nor ever likely, so don't take the time to
2870                      * test for them.  The code that generates the
2871                      * is_MULTI_foo() macros croaks should one actually get put
2872                      * into Unicode .) */
2873                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2874                         count = utf8_length(s, multi_end);
2875                         s = multi_end;
2876                     }
2877                     else {
2878                         while (s < multi_end) {
2879                             if (isASCII(*s)) {
2880                                 s++;
2881                                 goto next_iteration;
2882                             }
2883                             else {
2884                                 s += UTF8SKIP(s);
2885                             }
2886                             count++;
2887                         }
2888                     }
2889                 }
2890
2891                 /* The delta is how long the sequence is minus 1 (1 is how long
2892                  * the character that folds to the sequence is) */
2893                 *min_subtract += count - 1;
2894             next_iteration: ;
2895             }
2896         }
2897         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2898
2899             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2900              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2901              * nodes can't have multi-char folds to this range (and there are
2902              * no existing ones in the upper latin1 range).  In the EXACTF
2903              * case we look also for the sharp s, which can be in the final
2904              * position.  Otherwise we can stop looking 1 byte earlier because
2905              * have to find at least two characters for a multi-fold */
2906             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2907
2908             /* The below is perhaps overboard, but this allows us to save a
2909              * test each time through the loop at the expense of a mask.  This
2910              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2911              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2912              * are 64.  This uses an exclusive 'or' to find that bit and then
2913              * inverts it to form a mask, with just a single 0, in the bit
2914              * position where 'S' and 's' differ. */
2915             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2916             const U8 s_masked = 's' & S_or_s_mask;
2917
2918             while (s < upper) {
2919                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2920                 if (! len) {    /* Not a multi-char fold. */
2921                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2922                     {
2923                         *has_exactf_sharp_s = TRUE;
2924                     }
2925                     s++;
2926                     continue;
2927                 }
2928
2929                 if (len == 2
2930                     && ((*s & S_or_s_mask) == s_masked)
2931                     && ((*(s+1) & S_or_s_mask) == s_masked))
2932                 {
2933
2934                     /* EXACTF nodes need to know that the minimum length
2935                      * changed so that a sharp s in the string can match this
2936                      * ss in the pattern, but they remain EXACTF nodes, as they
2937                      * won't match this unless the target string is is UTF-8,
2938                      * which we don't know until runtime */
2939                     if (OP(scan) != EXACTF) {
2940                         OP(scan) = EXACTFU_SS;
2941                     }
2942                 }
2943
2944                 *min_subtract += len - 1;
2945                 s += len;
2946             }
2947         }
2948     }
2949
2950 #ifdef DEBUGGING
2951     /* Allow dumping but overwriting the collection of skipped
2952      * ops and/or strings with fake optimized ops */
2953     n = scan + NODE_SZ_STR(scan);
2954     while (n <= stop) {
2955         OP(n) = OPTIMIZED;
2956         FLAGS(n) = 0;
2957         NEXT_OFF(n) = 0;
2958         n++;
2959     }
2960 #endif
2961     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2962     return stopnow;
2963 }
2964
2965 /* REx optimizer.  Converts nodes into quicker variants "in place".
2966    Finds fixed substrings.  */
2967
2968 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2969    to the position after last scanned or to NULL. */
2970
2971 #define INIT_AND_WITHP \
2972     assert(!and_withp); \
2973     Newx(and_withp,1,struct regnode_charclass_class); \
2974     SAVEFREEPV(and_withp)
2975
2976 /* this is a chain of data about sub patterns we are processing that
2977    need to be handled separately/specially in study_chunk. Its so
2978    we can simulate recursion without losing state.  */
2979 struct scan_frame;
2980 typedef struct scan_frame {
2981     regnode *last;  /* last node to process in this frame */
2982     regnode *next;  /* next node to process when last is reached */
2983     struct scan_frame *prev; /*previous frame*/
2984     I32 stop; /* what stopparen do we use */
2985 } scan_frame;
2986
2987
2988 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2989
2990 STATIC I32
2991 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2992                         I32 *minlenp, I32 *deltap,
2993                         regnode *last,
2994                         scan_data_t *data,
2995                         I32 stopparen,
2996                         U8* recursed,
2997                         struct regnode_charclass_class *and_withp,
2998                         U32 flags, U32 depth)
2999                         /* scanp: Start here (read-write). */
3000                         /* deltap: Write maxlen-minlen here. */
3001                         /* last: Stop before this one. */
3002                         /* data: string data about the pattern */
3003                         /* stopparen: treat close N as END */
3004                         /* recursed: which subroutines have we recursed into */
3005                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3006 {
3007     dVAR;
3008     I32 min = 0;    /* There must be at least this number of characters to match */
3009     I32 pars = 0, code;
3010     regnode *scan = *scanp, *next;
3011     I32 delta = 0;
3012     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3013     int is_inf_internal = 0;            /* The studied chunk is infinite */
3014     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3015     scan_data_t data_fake;
3016     SV *re_trie_maxbuff = NULL;
3017     regnode *first_non_open = scan;
3018     I32 stopmin = I32_MAX;
3019     scan_frame *frame = NULL;
3020     GET_RE_DEBUG_FLAGS_DECL;
3021
3022     PERL_ARGS_ASSERT_STUDY_CHUNK;
3023
3024 #ifdef DEBUGGING
3025     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3026 #endif
3027
3028     if ( depth == 0 ) {
3029         while (first_non_open && OP(first_non_open) == OPEN)
3030             first_non_open=regnext(first_non_open);
3031     }
3032
3033
3034   fake_study_recurse:
3035     while ( scan && OP(scan) != END && scan < last ){
3036         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3037                                    node length to get a real minimum (because
3038                                    the folded version may be shorter) */
3039         bool has_exactf_sharp_s = FALSE;
3040         /* Peephole optimizer: */
3041         DEBUG_STUDYDATA("Peep:", data,depth);
3042         DEBUG_PEEP("Peep",scan,depth);
3043
3044         /* Its not clear to khw or hv why this is done here, and not in the
3045          * clauses that deal with EXACT nodes.  khw's guess is that it's
3046          * because of a previous design */
3047         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3048
3049         /* Follow the next-chain of the current node and optimize
3050            away all the NOTHINGs from it.  */
3051         if (OP(scan) != CURLYX) {
3052             const int max = (reg_off_by_arg[OP(scan)]
3053                        ? I32_MAX
3054                        /* I32 may be smaller than U16 on CRAYs! */
3055                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3056             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3057             int noff;
3058             regnode *n = scan;
3059
3060             /* Skip NOTHING and LONGJMP. */
3061             while ((n = regnext(n))
3062                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3063                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3064                    && off + noff < max)
3065                 off += noff;
3066             if (reg_off_by_arg[OP(scan)])
3067                 ARG(scan) = off;
3068             else
3069                 NEXT_OFF(scan) = off;
3070         }
3071
3072
3073
3074         /* The principal pseudo-switch.  Cannot be a switch, since we
3075            look into several different things.  */
3076         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3077                    || OP(scan) == IFTHEN) {
3078             next = regnext(scan);
3079             code = OP(scan);
3080             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3081
3082             if (OP(next) == code || code == IFTHEN) {
3083                 /* NOTE - There is similar code to this block below for handling
3084                    TRIE nodes on a re-study.  If you change stuff here check there
3085                    too. */
3086                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3087                 struct regnode_charclass_class accum;
3088                 regnode * const startbranch=scan;
3089
3090                 if (flags & SCF_DO_SUBSTR)
3091                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3092                 if (flags & SCF_DO_STCLASS)
3093                     cl_init_zero(pRExC_state, &accum);
3094
3095                 while (OP(scan) == code) {
3096                     I32 deltanext, minnext, f = 0, fake;
3097                     struct regnode_charclass_class this_class;
3098
3099                     num++;
3100                     data_fake.flags = 0;
3101                     if (data) {
3102                         data_fake.whilem_c = data->whilem_c;
3103                         data_fake.last_closep = data->last_closep;
3104                     }
3105                     else
3106                         data_fake.last_closep = &fake;
3107
3108                     data_fake.pos_delta = delta;
3109                     next = regnext(scan);
3110                     scan = NEXTOPER(scan);
3111                     if (code != BRANCH)
3112                         scan = NEXTOPER(scan);
3113                     if (flags & SCF_DO_STCLASS) {
3114                         cl_init(pRExC_state, &this_class);
3115                         data_fake.start_class = &this_class;
3116                         f = SCF_DO_STCLASS_AND;
3117                     }
3118                     if (flags & SCF_WHILEM_VISITED_POS)
3119                         f |= SCF_WHILEM_VISITED_POS;
3120
3121                     /* we suppose the run is continuous, last=next...*/
3122                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3123                                           next, &data_fake,
3124                                           stopparen, recursed, NULL, f,depth+1);
3125                     if (min1 > minnext)
3126                         min1 = minnext;
3127                     if (max1 < minnext + deltanext)
3128                         max1 = minnext + deltanext;
3129                     if (deltanext == I32_MAX)
3130                         is_inf = is_inf_internal = 1;
3131                     scan = next;
3132                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3133                         pars++;
3134                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3135                         if ( stopmin > minnext)
3136                             stopmin = min + min1;
3137                         flags &= ~SCF_DO_SUBSTR;
3138                         if (data)
3139                             data->flags |= SCF_SEEN_ACCEPT;
3140                     }
3141                     if (data) {
3142                         if (data_fake.flags & SF_HAS_EVAL)
3143                             data->flags |= SF_HAS_EVAL;
3144                         data->whilem_c = data_fake.whilem_c;
3145                     }
3146                     if (flags & SCF_DO_STCLASS)
3147                         cl_or(pRExC_state, &accum, &this_class);
3148                 }
3149                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3150                     min1 = 0;
3151                 if (flags & SCF_DO_SUBSTR) {
3152                     data->pos_min += min1;
3153                     data->pos_delta += max1 - min1;
3154                     if (max1 != min1 || is_inf)
3155                         data->longest = &(data->longest_float);
3156                 }
3157                 min += min1;
3158                 delta += max1 - min1;
3159                 if (flags & SCF_DO_STCLASS_OR) {
3160                     cl_or(pRExC_state, data->start_class, &accum);
3161                     if (min1) {
3162                         cl_and(data->start_class, and_withp);
3163                         flags &= ~SCF_DO_STCLASS;
3164                     }
3165                 }
3166                 else if (flags & SCF_DO_STCLASS_AND) {
3167                     if (min1) {
3168                         cl_and(data->start_class, &accum);
3169                         flags &= ~SCF_DO_STCLASS;
3170                     }
3171                     else {
3172                         /* Switch to OR mode: cache the old value of
3173                          * data->start_class */
3174                         INIT_AND_WITHP;
3175                         StructCopy(data->start_class, and_withp,
3176                                    struct regnode_charclass_class);
3177                         flags &= ~SCF_DO_STCLASS_AND;
3178                         StructCopy(&accum, data->start_class,
3179                                    struct regnode_charclass_class);
3180                         flags |= SCF_DO_STCLASS_OR;
3181                         SET_SSC_EOS(data->start_class);
3182                     }
3183                 }
3184
3185                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3186                 /* demq.
3187
3188                    Assuming this was/is a branch we are dealing with: 'scan' now
3189                    points at the item that follows the branch sequence, whatever
3190                    it is. We now start at the beginning of the sequence and look
3191                    for subsequences of
3192
3193                    BRANCH->EXACT=>x1
3194                    BRANCH->EXACT=>x2
3195                    tail
3196
3197                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3198
3199                    If we can find such a subsequence we need to turn the first
3200                    element into a trie and then add the subsequent branch exact
3201                    strings to the trie.
3202
3203                    We have two cases
3204
3205                      1. patterns where the whole set of branches can be converted.
3206
3207                      2. patterns where only a subset can be converted.
3208
3209                    In case 1 we can replace the whole set with a single regop
3210                    for the trie. In case 2 we need to keep the start and end
3211                    branches so
3212
3213                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3214                      becomes BRANCH TRIE; BRANCH X;
3215
3216                   There is an additional case, that being where there is a
3217                   common prefix, which gets split out into an EXACT like node
3218                   preceding the TRIE node.
3219
3220                   If x(1..n)==tail then we can do a simple trie, if not we make
3221                   a "jump" trie, such that when we match the appropriate word
3222                   we "jump" to the appropriate tail node. Essentially we turn
3223                   a nested if into a case structure of sorts.
3224
3225                 */
3226
3227                     int made=0;
3228                     if (!re_trie_maxbuff) {
3229                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3230                         if (!SvIOK(re_trie_maxbuff))
3231                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3232                     }
3233                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3234                         regnode *cur;
3235                         regnode *first = (regnode *)NULL;
3236                         regnode *last = (regnode *)NULL;
3237                         regnode *tail = scan;
3238                         U8 trietype = 0;
3239                         U32 count=0;
3240
3241 #ifdef DEBUGGING
3242                         SV * const mysv = sv_newmortal();       /* for dumping */
3243 #endif
3244                         /* var tail is used because there may be a TAIL
3245                            regop in the way. Ie, the exacts will point to the
3246                            thing following the TAIL, but the last branch will
3247                            point at the TAIL. So we advance tail. If we
3248                            have nested (?:) we may have to move through several
3249                            tails.
3250                          */
3251
3252                         while ( OP( tail ) == TAIL ) {
3253                             /* this is the TAIL generated by (?:) */
3254                             tail = regnext( tail );
3255                         }
3256
3257
3258                         DEBUG_TRIE_COMPILE_r({
3259                             regprop(RExC_rx, mysv, tail );
3260                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3261                                 (int)depth * 2 + 2, "",
3262                                 "Looking for TRIE'able sequences. Tail node is: ",
3263                                 SvPV_nolen_const( mysv )
3264                             );
3265                         });
3266
3267                         /*
3268
3269                             Step through the branches
3270                                 cur represents each branch,
3271                                 noper is the first thing to be matched as part of that branch
3272                                 noper_next is the regnext() of that node.
3273
3274                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3275                             via a "jump trie" but we also support building with NOJUMPTRIE,
3276                             which restricts the trie logic to structures like /FOO|BAR/.
3277
3278                             If noper is a trieable nodetype then the branch is a possible optimization
3279                             target. If we are building under NOJUMPTRIE then we require that noper_next
3280                             is the same as scan (our current position in the regex program).
3281
3282                             Once we have two or more consecutive such branches we can create a
3283                             trie of the EXACT's contents and stitch it in place into the program.
3284
3285                             If the sequence represents all of the branches in the alternation we
3286                             replace the entire thing with a single TRIE node.
3287
3288                             Otherwise when it is a subsequence we need to stitch it in place and
3289                             replace only the relevant branches. This means the first branch has
3290                             to remain as it is used by the alternation logic, and its next pointer,
3291                             and needs to be repointed at the item on the branch chain following
3292                             the last branch we have optimized away.
3293
3294                             This could be either a BRANCH, in which case the subsequence is internal,
3295                             or it could be the item following the branch sequence in which case the
3296                             subsequence is at the end (which does not necessarily mean the first node
3297                             is the start of the alternation).
3298
3299                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3300
3301                                 optype          |  trietype
3302                                 ----------------+-----------
3303                                 NOTHING         | NOTHING
3304                                 EXACT           | EXACT
3305                                 EXACTFU         | EXACTFU
3306                                 EXACTFU_SS      | EXACTFU
3307                                 EXACTFU_TRICKYFOLD | EXACTFU
3308                                 EXACTFA         | 0
3309
3310
3311                         */
3312 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3313                        ( EXACT == (X) )   ? EXACT :        \
3314                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3315                        0 )
3316
3317                         /* dont use tail as the end marker for this traverse */
3318                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3319                             regnode * const noper = NEXTOPER( cur );
3320                             U8 noper_type = OP( noper );
3321                             U8 noper_trietype = TRIE_TYPE( noper_type );
3322 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3323                             regnode * const noper_next = regnext( noper );
3324                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3325                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3326 #endif
3327
3328                             DEBUG_TRIE_COMPILE_r({
3329                                 regprop(RExC_rx, mysv, cur);
3330                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3331                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3332
3333                                 regprop(RExC_rx, mysv, noper);
3334                                 PerlIO_printf( Perl_debug_log, " -> %s",
3335                                     SvPV_nolen_const(mysv));
3336
3337                                 if ( noper_next ) {
3338                                   regprop(RExC_rx, mysv, noper_next );
3339                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3340                                     SvPV_nolen_const(mysv));
3341                                 }
3342                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3343                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3344                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3345                                 );
3346                             });
3347
3348                             /* Is noper a trieable nodetype that can be merged with the
3349                              * current trie (if there is one)? */
3350                             if ( noper_trietype
3351                                   &&
3352                                   (
3353                                         ( noper_trietype == NOTHING)
3354                                         || ( trietype == NOTHING )
3355                                         || ( trietype == noper_trietype )
3356                                   )
3357 #ifdef NOJUMPTRIE
3358                                   && noper_next == tail
3359 #endif
3360                                   && count < U16_MAX)
3361                             {
3362                                 /* Handle mergable triable node
3363                                  * Either we are the first node in a new trieable sequence,
3364                                  * in which case we do some bookkeeping, otherwise we update
3365                                  * the end pointer. */
3366                                 if ( !first ) {
3367                                     first = cur;
3368                                     if ( noper_trietype == NOTHING ) {
3369 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3370                                         regnode * const noper_next = regnext( noper );
3371                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3372                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3373 #endif
3374
3375                                         if ( noper_next_trietype ) {
3376                                             trietype = noper_next_trietype;
3377                                         } else if (noper_next_type)  {
3378                                             /* a NOTHING regop is 1 regop wide. We need at least two
3379                                              * for a trie so we can't merge this in */
3380                                             first = NULL;
3381                                         }
3382                                     } else {
3383                                         trietype = noper_trietype;
3384                                     }
3385                                 } else {
3386                                     if ( trietype == NOTHING )
3387                                         trietype = noper_trietype;
3388                                     last = cur;
3389                                 }
3390                                 if (first)
3391                                     count++;
3392                             } /* end handle mergable triable node */
3393                             else {
3394                                 /* handle unmergable node -
3395                                  * noper may either be a triable node which can not be tried
3396                                  * together with the current trie, or a non triable node */
3397                                 if ( last ) {
3398                                     /* If last is set and trietype is not NOTHING then we have found
3399                                      * at least two triable branch sequences in a row of a similar
3400                                      * trietype so we can turn them into a trie. If/when we
3401                                      * allow NOTHING to start a trie sequence this condition will be
3402                                      * required, and it isn't expensive so we leave it in for now. */
3403                                     if ( trietype && trietype != NOTHING )
3404                                         make_trie( pRExC_state,
3405                                                 startbranch, first, cur, tail, count,
3406                                                 trietype, depth+1 );
3407                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3408                                 }
3409                                 if ( noper_trietype
3410 #ifdef NOJUMPTRIE
3411                                      && noper_next == tail
3412 #endif
3413                                 ){
3414                                     /* noper is triable, so we can start a new trie sequence */
3415                                     count = 1;
3416                                     first = cur;
3417                                     trietype = noper_trietype;
3418                                 } else if (first) {
3419                                     /* if we already saw a first but the current node is not triable then we have
3420                                      * to reset the first information. */
3421                                     count = 0;
3422                                     first = NULL;
3423                                     trietype = 0;
3424                                 }
3425                             } /* end handle unmergable node */
3426                         } /* loop over branches */
3427                         DEBUG_TRIE_COMPILE_r({
3428                             regprop(RExC_rx, mysv, cur);
3429                             PerlIO_printf( Perl_debug_log,
3430                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3431                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3432
3433                         });
3434                         if ( last && trietype ) {
3435                             if ( trietype != NOTHING ) {
3436                                 /* the last branch of the sequence was part of a trie,
3437                                  * so we have to construct it here outside of the loop
3438                                  */
3439                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3440 #ifdef TRIE_STUDY_OPT
3441                                 if ( ((made == MADE_EXACT_TRIE &&
3442                                      startbranch == first)
3443                                      || ( first_non_open == first )) &&
3444                                      depth==0 ) {
3445                                     flags |= SCF_TRIE_RESTUDY;
3446                                     if ( startbranch == first
3447                                          && scan == tail )
3448                                     {
3449                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3450                                     }
3451                                 }
3452 #endif
3453                             } else {
3454                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3455                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3456                                  */
3457                                 if ( startbranch == first ) {
3458                                     regnode *opt;
3459                                     /* the entire thing is a NOTHING sequence, something like this:
3460                                      * (?:|) So we can turn it into a plain NOTHING op. */
3461                                     DEBUG_TRIE_COMPILE_r({
3462                                         regprop(RExC_rx, mysv, cur);
3463                                         PerlIO_printf( Perl_debug_log,
3464                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3465                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3466
3467                                     });
3468                                     OP(startbranch)= NOTHING;
3469                                     NEXT_OFF(startbranch)= tail - startbranch;
3470                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3471                                         OP(opt)= OPTIMIZED;
3472                                 }
3473                             }
3474                         } /* end if ( last) */
3475                     } /* TRIE_MAXBUF is non zero */
3476
3477                 } /* do trie */
3478
3479             }
3480             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3481                 scan = NEXTOPER(NEXTOPER(scan));
3482             } else                      /* single branch is optimized. */
3483                 scan = NEXTOPER(scan);
3484             continue;
3485         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3486             scan_frame *newframe = NULL;
3487             I32 paren;
3488             regnode *start;
3489             regnode *end;
3490
3491             if (OP(scan) != SUSPEND) {
3492             /* set the pointer */
3493                 if (OP(scan) == GOSUB) {
3494                     paren = ARG(scan);
3495                     RExC_recurse[ARG2L(scan)] = scan;
3496                     start = RExC_open_parens[paren-1];
3497                     end   = RExC_close_parens[paren-1];
3498                 } else {
3499                     paren = 0;
3500                     start = RExC_rxi->program + 1;
3501                     end   = RExC_opend;
3502                 }
3503                 if (!recursed) {
3504                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3505                     SAVEFREEPV(recursed);
3506                 }
3507                 if (!PAREN_TEST(recursed,paren+1)) {
3508                     PAREN_SET(recursed,paren+1);
3509                     Newx(newframe,1,scan_frame);
3510                 } else {
3511                     if (flags & SCF_DO_SUBSTR) {
3512                         SCAN_COMMIT(pRExC_state,data,minlenp);
3513                         data->longest = &(data->longest_float);
3514                     }
3515                     is_inf = is_inf_internal = 1;
3516                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3517                         cl_anything(pRExC_state, data->start_class);
3518                     flags &= ~SCF_DO_STCLASS;
3519                 }
3520             } else {
3521                 Newx(newframe,1,scan_frame);
3522                 paren = stopparen;
3523                 start = scan+2;
3524                 end = regnext(scan);
3525             }
3526             if (newframe) {
3527                 assert(start);
3528                 assert(end);
3529                 SAVEFREEPV(newframe);
3530                 newframe->next = regnext(scan);
3531                 newframe->last = last;
3532                 newframe->stop = stopparen;
3533                 newframe->prev = frame;
3534
3535                 frame = newframe;
3536                 scan =  start;
3537                 stopparen = paren;
3538                 last = end;
3539
3540                 continue;
3541             }
3542         }
3543         else if (OP(scan) == EXACT) {
3544             I32 l = STR_LEN(scan);
3545             UV uc;
3546             if (UTF) {
3547                 const U8 * const s = (U8*)STRING(scan);
3548                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3549                 l = utf8_length(s, s + l);
3550             } else {
3551                 uc = *((U8*)STRING(scan));
3552             }
3553             min += l;
3554             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3555                 /* The code below prefers earlier match for fixed
3556                    offset, later match for variable offset.  */
3557                 if (data->last_end == -1) { /* Update the start info. */
3558                     data->last_start_min = data->pos_min;
3559                     data->last_start_max = is_inf
3560                         ? I32_MAX : data->pos_min + data->pos_delta;
3561                 }
3562                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3563                 if (UTF)
3564                     SvUTF8_on(data->last_found);
3565                 {
3566                     SV * const sv = data->last_found;
3567                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3568                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3569                     if (mg && mg->mg_len >= 0)
3570                         mg->mg_len += utf8_length((U8*)STRING(scan),
3571                                                   (U8*)STRING(scan)+STR_LEN(scan));
3572                 }
3573                 data->last_end = data->pos_min + l;
3574                 data->pos_min += l; /* As in the first entry. */
3575                 data->flags &= ~SF_BEFORE_EOL;
3576             }
3577             if (flags & SCF_DO_STCLASS_AND) {
3578                 /* Check whether it is compatible with what we know already! */
3579                 int compat = 1;
3580
3581
3582                 /* If compatible, we or it in below.  It is compatible if is
3583                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3584                  * it's for a locale.  Even if there isn't unicode semantics
3585                  * here, at runtime there may be because of matching against a
3586                  * utf8 string, so accept a possible false positive for
3587                  * latin1-range folds */
3588                 if (uc >= 0x100 ||
3589                     (!(data->start_class->flags & ANYOF_LOCALE)
3590                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3591                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3592                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3593                     )
3594                 {
3595                     compat = 0;
3596                 }
3597                 ANYOF_CLASS_ZERO(data->start_class);
3598                 ANYOF_BITMAP_ZERO(data->start_class);
3599                 if (compat)
3600                     ANYOF_BITMAP_SET(data->start_class, uc);
3601                 else if (uc >= 0x100) {
3602                     int i;
3603
3604                     /* Some Unicode code points fold to the Latin1 range; as
3605                      * XXX temporary code, instead of figuring out if this is
3606                      * one, just assume it is and set all the start class bits
3607                      * that could be some such above 255 code point's fold
3608                      * which will generate fals positives.  As the code
3609                      * elsewhere that does compute the fold settles down, it
3610                      * can be extracted out and re-used here */
3611                     for (i = 0; i < 256; i++){
3612                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3613                             ANYOF_BITMAP_SET(data->start_class, i);
3614                         }
3615                     }
3616                 }
3617                 CLEAR_SSC_EOS(data->start_class);
3618                 if (uc < 0x100)
3619                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3620             }
3621             else if (flags & SCF_DO_STCLASS_OR) {
3622                 /* false positive possible if the class is case-folded */
3623                 if (uc < 0x100)
3624                     ANYOF_BITMAP_SET(data->start_class, uc);
3625                 else
3626                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3627                 CLEAR_SSC_EOS(data->start_class);
3628                 cl_and(data->start_class, and_withp);
3629             }
3630             flags &= ~SCF_DO_STCLASS;
3631         }
3632         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3633             I32 l = STR_LEN(scan);
3634             UV uc = *((U8*)STRING(scan));
3635
3636             /* Search for fixed substrings supports EXACT only. */
3637             if (flags & SCF_DO_SUBSTR) {
3638                 assert(data);
3639                 SCAN_COMMIT(pRExC_state, data, minlenp);
3640             }
3641             if (UTF) {
3642                 const U8 * const s = (U8 *)STRING(scan);
3643                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3644                 l = utf8_length(s, s + l);
3645             }
3646             if (has_exactf_sharp_s) {
3647                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3648             }
3649             min += l - min_subtract;
3650             assert (min >= 0);
3651             delta += min_subtract;
3652             if (flags & SCF_DO_SUBSTR) {
3653                 data->pos_min += l - min_subtract;
3654                 if (data->pos_min < 0) {
3655                     data->pos_min = 0;
3656                 }
3657                 data->pos_delta += min_subtract;
3658                 if (min_subtract) {
3659                     data->longest = &(data->longest_float);
3660                 }
3661             }
3662             if (flags & SCF_DO_STCLASS_AND) {
3663                 /* Check whether it is compatible with what we know already! */
3664                 int compat = 1;
3665                 if (uc >= 0x100 ||
3666                  (!(data->start_class->flags & ANYOF_LOCALE)
3667                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3668                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3669                 {
3670                     compat = 0;
3671                 }
3672                 ANYOF_CLASS_ZERO(data->start_class);
3673                 ANYOF_BITMAP_ZERO(data->start_class);
3674                 if (compat) {
3675                     ANYOF_BITMAP_SET(data->start_class, uc);
3676                     CLEAR_SSC_EOS(data->start_class);
3677                     if (OP(scan) == EXACTFL) {
3678                         /* XXX This set is probably no longer necessary, and
3679                          * probably wrong as LOCALE now is on in the initial
3680                          * state */
3681                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3682                     }
3683                     else {
3684
3685                         /* Also set the other member of the fold pair.  In case
3686                          * that unicode semantics is called for at runtime, use
3687                          * the full latin1 fold.  (Can't do this for locale,
3688                          * because not known until runtime) */
3689                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3690
3691                         /* All other (EXACTFL handled above) folds except under
3692                          * /iaa that include s, S, and sharp_s also may include
3693                          * the others */
3694                         if (OP(scan) != EXACTFA) {
3695                             if (uc == 's' || uc == 'S') {
3696                                 ANYOF_BITMAP_SET(data->start_class,
3697                                                  LATIN_SMALL_LETTER_SHARP_S);
3698                             }
3699                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3700                                 ANYOF_BITMAP_SET(data->start_class, 's');
3701                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3702                             }
3703                         }
3704                     }
3705                 }
3706                 else if (uc >= 0x100) {
3707                     int i;
3708                     for (i = 0; i < 256; i++){
3709                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3710                             ANYOF_BITMAP_SET(data->start_class, i);
3711                         }
3712                     }
3713                 }
3714             }
3715             else if (flags & SCF_DO_STCLASS_OR) {
3716                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3717                     /* false positive possible if the class is case-folded.
3718                        Assume that the locale settings are the same... */
3719                     if (uc < 0x100) {
3720                         ANYOF_BITMAP_SET(data->start_class, uc);
3721                         if (OP(scan) != EXACTFL) {
3722
3723                             /* And set the other member of the fold pair, but
3724                              * can't do that in locale because not known until
3725                              * run-time */
3726                             ANYOF_BITMAP_SET(data->start_class,
3727                                              PL_fold_latin1[uc]);
3728
3729                             /* All folds except under /iaa that include s, S,
3730                              * and sharp_s also may include the others */
3731                             if (OP(scan) != EXACTFA) {
3732                                 if (uc == 's' || uc == 'S') {
3733                                     ANYOF_BITMAP_SET(data->start_class,
3734                                                    LATIN_SMALL_LETTER_SHARP_S);
3735                                 }
3736                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3737                                     ANYOF_BITMAP_SET(data->start_class, 's');
3738                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3739                                 }
3740                             }
3741                         }
3742                     }
3743                     CLEAR_SSC_EOS(data->start_class);
3744                 }
3745                 cl_and(data->start_class, and_withp);
3746             }
3747             flags &= ~SCF_DO_STCLASS;
3748         }
3749         else if (REGNODE_VARIES(OP(scan))) {
3750             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3751             I32 f = flags, pos_before = 0;
3752             regnode * const oscan = scan;
3753             struct regnode_charclass_class this_class;
3754             struct regnode_charclass_class *oclass = NULL;
3755             I32 next_is_eval = 0;
3756
3757             switch (PL_regkind[OP(scan)]) {
3758             case WHILEM:                /* End of (?:...)* . */
3759                 scan = NEXTOPER(scan);
3760                 goto finish;
3761             case PLUS:
3762                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3763                     next = NEXTOPER(scan);
3764                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3765                         mincount = 1;
3766                         maxcount = REG_INFTY;
3767                         next = regnext(scan);
3768                         scan = NEXTOPER(scan);
3769                         goto do_curly;
3770                     }
3771                 }
3772                 if (flags & SCF_DO_SUBSTR)
3773                     data->pos_min++;
3774                 min++;
3775                 /* Fall through. */
3776             case STAR:
3777                 if (flags & SCF_DO_STCLASS) {
3778                     mincount = 0;
3779                     maxcount = REG_INFTY;
3780                     next = regnext(scan);
3781                     scan = NEXTOPER(scan);
3782                     goto do_curly;
3783                 }
3784                 is_inf = is_inf_internal = 1;
3785                 scan = regnext(scan);
3786                 if (flags & SCF_DO_SUBSTR) {
3787                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3788                     data->longest = &(data->longest_float);
3789                 }
3790                 goto optimize_curly_tail;
3791             case CURLY:
3792                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3793                     && (scan->flags == stopparen))
3794                 {
3795                     mincount = 1;
3796                     maxcount = 1;
3797                 } else {
3798                     mincount = ARG1(scan);
3799                     maxcount = ARG2(scan);
3800                 }
3801                 next = regnext(scan);
3802                 if (OP(scan) == CURLYX) {
3803                     I32 lp = (data ? *(data->last_closep) : 0);
3804                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3805                 }
3806                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3807                 next_is_eval = (OP(scan) == EVAL);
3808               do_curly:
3809                 if (flags & SCF_DO_SUBSTR) {
3810                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3811                     pos_before = data->pos_min;
3812                 }
3813                 if (data) {
3814                     fl = data->flags;
3815                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3816                     if (is_inf)
3817                         data->flags |= SF_IS_INF;
3818                 }
3819                 if (flags & SCF_DO_STCLASS) {
3820                     cl_init(pRExC_state, &this_class);
3821                     oclass = data->start_class;
3822                     data->start_class = &this_class;
3823                     f |= SCF_DO_STCLASS_AND;
3824                     f &= ~SCF_DO_STCLASS_OR;
3825                 }
3826                 /* Exclude from super-linear cache processing any {n,m}
3827                    regops for which the combination of input pos and regex
3828                    pos is not enough information to determine if a match
3829                    will be possible.
3830
3831                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3832                    regex pos at the \s*, the prospects for a match depend not
3833                    only on the input position but also on how many (bar\s*)
3834                    repeats into the {4,8} we are. */
3835                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3836                     f &= ~SCF_WHILEM_VISITED_POS;
3837
3838                 /* This will finish on WHILEM, setting scan, or on NULL: */
3839                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3840                                       last, data, stopparen, recursed, NULL,
3841                                       (mincount == 0
3842                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3843
3844                 if (flags & SCF_DO_STCLASS)
3845                     data->start_class = oclass;
3846                 if (mincount == 0 || minnext == 0) {
3847                     if (flags & SCF_DO_STCLASS_OR) {
3848                         cl_or(pRExC_state, data->start_class, &this_class);
3849                     }
3850                     else if (flags & SCF_DO_STCLASS_AND) {
3851                         /* Switch to OR mode: cache the old value of
3852                          * data->start_class */
3853                         INIT_AND_WITHP;
3854                         StructCopy(data->start_class, and_withp,
3855                                    struct regnode_charclass_class);
3856                         flags &= ~SCF_DO_STCLASS_AND;
3857                         StructCopy(&this_class, data->start_class,
3858                                    struct regnode_charclass_class);
3859                         flags |= SCF_DO_STCLASS_OR;
3860                         SET_SSC_EOS(data->start_class);
3861                     }
3862                 } else {                /* Non-zero len */
3863                     if (flags & SCF_DO_STCLASS_OR) {
3864                         cl_or(pRExC_state, data->start_class, &this_class);
3865                         cl_and(data->start_class, and_withp);
3866                     }
3867                     else if (flags & SCF_DO_STCLASS_AND)
3868                         cl_and(data->start_class, &this_class);
3869                     flags &= ~SCF_DO_STCLASS;
3870                 }
3871                 if (!scan)              /* It was not CURLYX, but CURLY. */
3872                     scan = next;
3873                 if ( /* ? quantifier ok, except for (?{ ... }) */
3874                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3875                     && (minnext == 0) && (deltanext == 0)
3876                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3877                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3878                 {
3879                     /* Fatal warnings may leak the regexp without this: */
3880                     SAVEFREESV(RExC_rx_sv);
3881                     ckWARNreg(RExC_parse,
3882                               "Quantifier unexpected on zero-length expression");
3883                     (void)ReREFCNT_inc(RExC_rx_sv);
3884                 }
3885
3886                 min += minnext * mincount;
3887                 is_inf_internal |= ((maxcount == REG_INFTY
3888                                      && (minnext + deltanext) > 0)
3889                                     || deltanext == I32_MAX);
3890                 is_inf |= is_inf_internal;
3891                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3892
3893                 /* Try powerful optimization CURLYX => CURLYN. */
3894                 if (  OP(oscan) == CURLYX && data
3895                       && data->flags & SF_IN_PAR
3896                       && !(data->flags & SF_HAS_EVAL)
3897                       && !deltanext && minnext == 1 ) {
3898                     /* Try to optimize to CURLYN.  */
3899                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3900                     regnode * const nxt1 = nxt;
3901 #ifdef DEBUGGING
3902                     regnode *nxt2;
3903 #endif
3904
3905                     /* Skip open. */
3906                     nxt = regnext(nxt);
3907                     if (!REGNODE_SIMPLE(OP(nxt))
3908                         && !(PL_regkind[OP(nxt)] == EXACT
3909                              && STR_LEN(nxt) == 1))
3910                         goto nogo;
3911 #ifdef DEBUGGING
3912                     nxt2 = nxt;
3913 #endif
3914                     nxt = regnext(nxt);
3915                     if (OP(nxt) != CLOSE)
3916                         goto nogo;
3917                     if (RExC_open_parens) {
3918                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3919                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3920                     }
3921                     /* Now we know that nxt2 is the only contents: */
3922                     oscan->flags = (U8)ARG(nxt);
3923                     OP(oscan) = CURLYN;
3924                     OP(nxt1) = NOTHING; /* was OPEN. */
3925
3926 #ifdef DEBUGGING
3927                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3928                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3929                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3930                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3931                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3932                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3933 #endif
3934                 }
3935               nogo:
3936
3937                 /* Try optimization CURLYX => CURLYM. */
3938                 if (  OP(oscan) == CURLYX && data
3939                       && !(data->flags & SF_HAS_PAR)
3940                       && !(data->flags & SF_HAS_EVAL)
3941                       && !deltanext     /* atom is fixed width */
3942                       && minnext != 0   /* CURLYM can't handle zero width */
3943                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3944                 ) {
3945                     /* XXXX How to optimize if data == 0? */
3946                     /* Optimize to a simpler form.  */
3947                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3948                     regnode *nxt2;
3949
3950                     OP(oscan) = CURLYM;
3951                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3952                             && (OP(nxt2) != WHILEM))
3953                         nxt = nxt2;
3954                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3955                     /* Need to optimize away parenths. */
3956                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3957                         /* Set the parenth number.  */
3958                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3959
3960                         oscan->flags = (U8)ARG(nxt);
3961                         if (RExC_open_parens) {
3962                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3963                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3964                         }
3965                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3966                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3967
3968 #ifdef DEBUGGING
3969                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3970                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3971                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3972                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3973 #endif
3974 #if 0
3975                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3976                             regnode *nnxt = regnext(nxt1);
3977                             if (nnxt == nxt) {
3978                                 if (reg_off_by_arg[OP(nxt1)])
3979                                     ARG_SET(nxt1, nxt2 - nxt1);
3980                                 else if (nxt2 - nxt1 < U16_MAX)
3981                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3982                                 else
3983                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3984                             }
3985                             nxt1 = nnxt;
3986                         }
3987 #endif
3988                         /* Optimize again: */
3989                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3990                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3991                     }
3992                     else
3993                         oscan->flags = 0;
3994                 }
3995                 else if ((OP(oscan) == CURLYX)
3996                          && (flags & SCF_WHILEM_VISITED_POS)
3997                          /* See the comment on a similar expression above.
3998                             However, this time it's not a subexpression
3999                             we care about, but the expression itself. */
4000                          && (maxcount == REG_INFTY)
4001                          && data && ++data->whilem_c < 16) {
4002                     /* This stays as CURLYX, we can put the count/of pair. */
4003                     /* Find WHILEM (as in regexec.c) */
4004                     regnode *nxt = oscan + NEXT_OFF(oscan);
4005
4006                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4007                         nxt += ARG(nxt);
4008                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4009                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4010                 }
4011                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4012                     pars++;
4013                 if (flags & SCF_DO_SUBSTR) {
4014                     SV *last_str = NULL;
4015                     int counted = mincount != 0;
4016
4017                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4018 #if defined(SPARC64_GCC_WORKAROUND)
4019                         I32 b = 0;
4020                         STRLEN l = 0;
4021                         const char *s = NULL;
4022                         I32 old = 0;
4023
4024                         if (pos_before >= data->last_start_min)
4025                             b = pos_before;
4026                         else
4027                             b = data->last_start_min;
4028
4029                         l = 0;
4030                         s = SvPV_const(data->last_found, l);
4031                         old = b - data->last_start_min;
4032
4033 #else
4034                         I32 b = pos_before >= data->last_start_min
4035                             ? pos_before : data->last_start_min;
4036                         STRLEN l;
4037                         const char * const s = SvPV_const(data->last_found, l);
4038                         I32 old = b - data->last_start_min;
4039 #endif
4040
4041                         if (UTF)
4042                             old = utf8_hop((U8*)s, old) - (U8*)s;
4043                         l -= old;
4044                         /* Get the added string: */
4045                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4046                         if (deltanext == 0 && pos_before == b) {
4047                             /* What was added is a constant string */
4048                             if (mincount > 1) {
4049                                 SvGROW(last_str, (mincount * l) + 1);
4050                                 repeatcpy(SvPVX(last_str) + l,
4051                                           SvPVX_const(last_str), l, mincount - 1);
4052                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4053                                 /* Add additional parts. */
4054                                 SvCUR_set(data->last_found,
4055                                           SvCUR(data->last_found) - l);
4056                                 sv_catsv(data->last_found, last_str);
4057                                 {
4058                                     SV * sv = data->last_found;
4059                                     MAGIC *mg =
4060                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4061                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4062                                     if (mg && mg->mg_len >= 0)
4063                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4064                                 }
4065                                 data->last_end += l * (mincount - 1);
4066                             }
4067                         } else {
4068                             /* start offset must point into the last copy */
4069                             data->last_start_min += minnext * (mincount - 1);
4070                             data->last_start_max += is_inf ? I32_MAX
4071                                 : (maxcount - 1) * (minnext + data->pos_delta);
4072                         }
4073                     }
4074                     /* It is counted once already... */
4075                     data->pos_min += minnext * (mincount - counted);
4076                     data->pos_delta += - counted * deltanext +
4077                         (minnext + deltanext) * maxcount - minnext * mincount;
4078                     if (mincount != maxcount) {
4079                          /* Cannot extend fixed substrings found inside
4080                             the group.  */
4081                         SCAN_COMMIT(pRExC_state,data,minlenp);
4082                         if (mincount && last_str) {
4083                             SV * const sv = data->last_found;
4084                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4085                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4086
4087                             if (mg)
4088                                 mg->mg_len = -1;
4089                             sv_setsv(sv, last_str);
4090                             data->last_end = data->pos_min;
4091                             data->last_start_min =
4092                                 data->pos_min - CHR_SVLEN(last_str);
4093                             data->last_start_max = is_inf
4094                                 ? I32_MAX
4095                                 : data->pos_min + data->pos_delta
4096                                 - CHR_SVLEN(last_str);
4097                         }
4098                         data->longest = &(data->longest_float);
4099                     }
4100                     SvREFCNT_dec(last_str);
4101                 }
4102                 if (data && (fl & SF_HAS_EVAL))
4103                     data->flags |= SF_HAS_EVAL;
4104               optimize_curly_tail:
4105                 if (OP(oscan) != CURLYX) {
4106                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4107                            && NEXT_OFF(next))
4108                         NEXT_OFF(oscan) += NEXT_OFF(next);
4109                 }
4110                 continue;
4111             default:                    /* REF, ANYOFV, and CLUMP only? */
4112                 if (flags & SCF_DO_SUBSTR) {
4113                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4114                     data->longest = &(data->longest_float);
4115                 }
4116                 is_inf = is_inf_internal = 1;
4117                 if (flags & SCF_DO_STCLASS_OR)
4118                     cl_anything(pRExC_state, data->start_class);
4119                 flags &= ~SCF_DO_STCLASS;
4120                 break;
4121             }
4122         }
4123         else if (OP(scan) == LNBREAK) {
4124             if (flags & SCF_DO_STCLASS) {
4125                 int value = 0;
4126                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4127                 if (flags & SCF_DO_STCLASS_AND) {
4128                     for (value = 0; value < 256; value++)
4129                         if (!is_VERTWS_cp(value))
4130                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4131                 }
4132                 else {
4133                     for (value = 0; value < 256; value++)
4134                         if (is_VERTWS_cp(value))
4135                             ANYOF_BITMAP_SET(data->start_class, value);
4136                 }
4137                 if (flags & SCF_DO_STCLASS_OR)
4138                     cl_and(data->start_class, and_withp);
4139                 flags &= ~SCF_DO_STCLASS;
4140             }
4141             min++;
4142             delta++;    /* Because of the 2 char string cr-lf */
4143             if (flags & SCF_DO_SUBSTR) {
4144                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4145                 data->pos_min += 1;
4146                 data->pos_delta += 1;
4147                 data->longest = &(data->longest_float);
4148             }
4149         }
4150         else if (REGNODE_SIMPLE(OP(scan))) {
4151             int value = 0;
4152
4153             if (flags & SCF_DO_SUBSTR) {
4154                 SCAN_COMMIT(pRExC_state,data,minlenp);
4155                 data->pos_min++;
4156             }
4157             min++;
4158             if (flags & SCF_DO_STCLASS) {
4159                 int loop_max = 256;
4160                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4161
4162                 /* Some of the logic below assumes that switching
4163                    locale on will only add false positives. */
4164                 switch (PL_regkind[OP(scan)]) {
4165                     U8 classnum;
4166
4167                 case SANY:
4168                 default:
4169 #ifdef DEBUGGING
4170                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4171 #endif
4172                  do_default:
4173                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4174                         cl_anything(pRExC_state, data->start_class);
4175                     break;
4176                 case REG_ANY:
4177                     if (OP(scan) == SANY)
4178                         goto do_default;
4179                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4180                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4181                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4182                         cl_anything(pRExC_state, data->start_class);
4183                     }
4184                     if (flags & SCF_DO_STCLASS_AND || !value)
4185                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4186                     break;
4187                 case ANYOF:
4188                     if (flags & SCF_DO_STCLASS_AND)
4189                         cl_and(data->start_class,
4190                                (struct regnode_charclass_class*)scan);
4191                     else
4192                         cl_or(pRExC_state, data->start_class,
4193                               (struct regnode_charclass_class*)scan);
4194                     break;
4195                 case POSIXA:
4196                     loop_max = 128;
4197                     /* FALL THROUGH */
4198                 case POSIXL:
4199                 case POSIXD:
4200                 case POSIXU:
4201                     classnum = FLAGS(scan);
4202                     if (flags & SCF_DO_STCLASS_AND) {
4203                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4204                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4205                             for (value = 0; value < loop_max; value++) {
4206                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4207                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4208                                 }
4209                             }
4210                         }
4211                     }
4212                     else {
4213                         if (data->start_class->flags & ANYOF_LOCALE) {
4214                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4215                         }
4216                         else {
4217
4218                         /* Even if under locale, set the bits for non-locale
4219                          * in case it isn't a true locale-node.  This will
4220                          * create false positives if it truly is locale */
4221                         for (value = 0; value < loop_max; value++) {
4222                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4223                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4224                             }
4225                         }
4226                         }
4227                     }
4228                     break;
4229                 case NPOSIXA:
4230                     loop_max = 128;
4231                     /* FALL THROUGH */
4232                 case NPOSIXL:
4233                 case NPOSIXU:
4234                 case NPOSIXD:
4235                     classnum = FLAGS(scan);
4236                     if (flags & SCF_DO_STCLASS_AND) {
4237                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4238                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4239                             for (value = 0; value < loop_max; value++) {
4240                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4241                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4242                                 }
4243                             }
4244                         }
4245                     }
4246                     else {
4247                         if (data->start_class->flags & ANYOF_LOCALE) {
4248                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4249                         }
4250                         else {
4251
4252                         /* Even if under locale, set the bits for non-locale in
4253                          * case it isn't a true locale-node.  This will create
4254                          * false positives if it truly is locale */
4255                         for (value = 0; value < loop_max; value++) {
4256                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4257                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4258                             }
4259                         }
4260                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4261                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4262                         }
4263                         }
4264                     }
4265                     break;
4266                 }
4267                 if (flags & SCF_DO_STCLASS_OR)
4268                     cl_and(data->start_class, and_withp);
4269                 flags &= ~SCF_DO_STCLASS;
4270             }
4271         }
4272         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4273             data->flags |= (OP(scan) == MEOL
4274                             ? SF_BEFORE_MEOL
4275                             : SF_BEFORE_SEOL);
4276             SCAN_COMMIT(pRExC_state, data, minlenp);
4277
4278         }
4279         else if (  PL_regkind[OP(scan)] == BRANCHJ
4280                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4281                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4282                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4283             if ( OP(scan) == UNLESSM &&
4284                  scan->flags == 0 &&
4285                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4286                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4287             ) {
4288                 regnode *opt;
4289                 regnode *upto= regnext(scan);
4290                 DEBUG_PARSE_r({
4291                     SV * const mysv_val=sv_newmortal();
4292                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4293
4294                     /*DEBUG_PARSE_MSG("opfail");*/
4295                     regprop(RExC_rx, mysv_val, upto);
4296                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4297                                   SvPV_nolen_const(mysv_val),
4298                                   (IV)REG_NODE_NUM(upto),
4299                                   (IV)(upto - scan)
4300                     );
4301                 });
4302                 OP(scan) = OPFAIL;
4303                 NEXT_OFF(scan) = upto - scan;
4304                 for (opt= scan + 1; opt < upto ; opt++)
4305                     OP(opt) = OPTIMIZED;
4306                 scan= upto;
4307                 continue;
4308             }
4309             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4310                 || OP(scan) == UNLESSM )
4311             {
4312                 /* Negative Lookahead/lookbehind
4313                    In this case we can't do fixed string optimisation.
4314                 */
4315
4316                 I32 deltanext, minnext, fake = 0;
4317                 regnode *nscan;
4318                 struct regnode_charclass_class intrnl;
4319                 int f = 0;
4320
4321                 data_fake.flags = 0;
4322                 if (data) {
4323                     data_fake.whilem_c = data->whilem_c;
4324                     data_fake.last_closep = data->last_closep;
4325                 }
4326                 else
4327                     data_fake.last_closep = &fake;
4328                 data_fake.pos_delta = delta;
4329                 if ( flags & SCF_DO_STCLASS && !scan->flags
4330                      && OP(scan) == IFMATCH ) { /* Lookahead */
4331                     cl_init(pRExC_state, &intrnl);
4332                     data_fake.start_class = &intrnl;
4333                     f |= SCF_DO_STCLASS_AND;
4334                 }
4335                 if (flags & SCF_WHILEM_VISITED_POS)
4336                     f |= SCF_WHILEM_VISITED_POS;
4337                 next = regnext(scan);
4338                 nscan = NEXTOPER(NEXTOPER(scan));
4339                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4340                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4341                 if (scan->flags) {
4342                     if (deltanext) {
4343                         FAIL("Variable length lookbehind not implemented");
4344                     }
4345                     else if (minnext > (I32)U8_MAX) {
4346                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4347                     }
4348                     scan->flags = (U8)minnext;
4349                 }
4350                 if (data) {
4351                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4352                         pars++;
4353                     if (data_fake.flags & SF_HAS_EVAL)
4354                         data->flags |= SF_HAS_EVAL;
4355                     data->whilem_c = data_fake.whilem_c;
4356                 }
4357                 if (f & SCF_DO_STCLASS_AND) {
4358                     if (flags & SCF_DO_STCLASS_OR) {
4359                         /* OR before, AND after: ideally we would recurse with
4360                          * data_fake to get the AND applied by study of the
4361                          * remainder of the pattern, and then derecurse;
4362                          * *** HACK *** for now just treat as "no information".
4363                          * See [perl #56690].
4364                          */
4365                         cl_init(pRExC_state, data->start_class);
4366                     }  else {
4367                         /* AND before and after: combine and continue */
4368                         const int was = TEST_SSC_EOS(data->start_class);
4369
4370                         cl_and(data->start_class, &intrnl);
4371                         if (was)
4372                             SET_SSC_EOS(data->start_class);
4373                     }
4374                 }
4375             }
4376 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4377             else {
4378                 /* Positive Lookahead/lookbehind
4379                    In this case we can do fixed string optimisation,
4380                    but we must be careful about it. Note in the case of
4381                    lookbehind the positions will be offset by the minimum
4382                    length of the pattern, something we won't know about
4383                    until after the recurse.
4384                 */
4385                 I32 deltanext, fake = 0;
4386                 regnode *nscan;
4387                 struct regnode_charclass_class intrnl;
4388                 int f = 0;
4389                 /* We use SAVEFREEPV so that when the full compile
4390                     is finished perl will clean up the allocated
4391                     minlens when it's all done. This way we don't
4392                     have to worry about freeing them when we know
4393                     they wont be used, which would be a pain.
4394                  */
4395                 I32 *minnextp;
4396                 Newx( minnextp, 1, I32 );
4397                 SAVEFREEPV(minnextp);
4398
4399                 if (data) {
4400                     StructCopy(data, &data_fake, scan_data_t);
4401                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4402                         f |= SCF_DO_SUBSTR;
4403                         if (scan->flags)
4404                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4405                         data_fake.last_found=newSVsv(data->last_found);
4406                     }
4407                 }
4408                 else
4409                     data_fake.last_closep = &fake;
4410                 data_fake.flags = 0;
4411                 data_fake.pos_delta = delta;
4412                 if (is_inf)
4413                     data_fake.flags |= SF_IS_INF;
4414                 if ( flags & SCF_DO_STCLASS && !scan->flags
4415                      && OP(scan) == IFMATCH ) { /* Lookahead */
4416                     cl_init(pRExC_state, &intrnl);
4417                     data_fake.start_class = &intrnl;
4418                     f |= SCF_DO_STCLASS_AND;
4419                 }
4420                 if (flags & SCF_WHILEM_VISITED_POS)
4421                     f |= SCF_WHILEM_VISITED_POS;
4422                 next = regnext(scan);
4423                 nscan = NEXTOPER(NEXTOPER(scan));
4424
4425                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4426                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4427                 if (scan->flags) {
4428                     if (deltanext) {
4429                         FAIL("Variable length lookbehind not implemented");
4430                     }
4431                     else if (*minnextp > (I32)U8_MAX) {
4432                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433                     }
4434                     scan->flags = (U8)*minnextp;
4435                 }
4436
4437                 *minnextp += min;
4438
4439                 if (f & SCF_DO_STCLASS_AND) {
4440                     const int was = TEST_SSC_EOS(data.start_class);
4441
4442                     cl_and(data->start_class, &intrnl);
4443                     if (was)
4444                         SET_SSC_EOS(data->start_class);
4445                 }
4446                 if (data) {
4447                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4448                         pars++;
4449                     if (data_fake.flags & SF_HAS_EVAL)
4450                         data->flags |= SF_HAS_EVAL;
4451                     data->whilem_c = data_fake.whilem_c;
4452                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4453                         if (RExC_rx->minlen<*minnextp)
4454                             RExC_rx->minlen=*minnextp;
4455                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4456                         SvREFCNT_dec_NN(data_fake.last_found);
4457
4458                         if ( data_fake.minlen_fixed != minlenp )
4459                         {
4460                             data->offset_fixed= data_fake.offset_fixed;
4461                             data->minlen_fixed= data_fake.minlen_fixed;
4462                             data->lookbehind_fixed+= scan->flags;
4463                         }
4464                         if ( data_fake.minlen_float != minlenp )
4465                         {
4466                             data->minlen_float= data_fake.minlen_float;
4467                             data->offset_float_min=data_fake.offset_float_min;
4468                             data->offset_float_max=data_fake.offset_float_max;
4469                             data->lookbehind_float+= scan->flags;
4470                         }
4471                     }
4472                 }
4473             }
4474 #endif
4475         }
4476         else if (OP(scan) == OPEN) {
4477             if (stopparen != (I32)ARG(scan))
4478                 pars++;
4479         }
4480         else if (OP(scan) == CLOSE) {
4481             if (stopparen == (I32)ARG(scan)) {
4482                 break;
4483             }
4484             if ((I32)ARG(scan) == is_par) {
4485                 next = regnext(scan);
4486
4487                 if ( next && (OP(next) != WHILEM) && next < last)
4488                     is_par = 0;         /* Disable optimization */
4489             }
4490             if (data)
4491                 *(data->last_closep) = ARG(scan);
4492         }
4493         else if (OP(scan) == EVAL) {
4494                 if (data)
4495                     data->flags |= SF_HAS_EVAL;
4496         }
4497         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4498             if (flags & SCF_DO_SUBSTR) {
4499                 SCAN_COMMIT(pRExC_state,data,minlenp);
4500                 flags &= ~SCF_DO_SUBSTR;
4501             }
4502             if (data && OP(scan)==ACCEPT) {
4503                 data->flags |= SCF_SEEN_ACCEPT;
4504                 if (stopmin > min)
4505                     stopmin = min;
4506             }
4507         }
4508         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4509         {
4510                 if (flags & SCF_DO_SUBSTR) {
4511                     SCAN_COMMIT(pRExC_state,data,minlenp);
4512                     data->longest = &(data->longest_float);
4513                 }
4514                 is_inf = is_inf_internal = 1;
4515                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4516                     cl_anything(pRExC_state, data->start_class);
4517                 flags &= ~SCF_DO_STCLASS;
4518         }
4519         else if (OP(scan) == GPOS) {
4520             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4521                 !(delta || is_inf || (data && data->pos_delta)))
4522             {
4523                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4524                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4525                 if (RExC_rx->gofs < (U32)min)
4526                     RExC_rx->gofs = min;
4527             } else {
4528                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4529                 RExC_rx->gofs = 0;
4530             }
4531         }
4532 #ifdef TRIE_STUDY_OPT
4533 #ifdef FULL_TRIE_STUDY
4534         else if (PL_regkind[OP(scan)] == TRIE) {
4535             /* NOTE - There is similar code to this block above for handling
4536                BRANCH nodes on the initial study.  If you change stuff here
4537                check there too. */
4538             regnode *trie_node= scan;
4539             regnode *tail= regnext(scan);
4540             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4541             I32 max1 = 0, min1 = I32_MAX;
4542             struct regnode_charclass_class accum;
4543
4544             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4545                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4546             if (flags & SCF_DO_STCLASS)
4547                 cl_init_zero(pRExC_state, &accum);
4548
4549             if (!trie->jump) {
4550                 min1= trie->minlen;
4551                 max1= trie->maxlen;
4552             } else {
4553                 const regnode *nextbranch= NULL;
4554                 U32 word;
4555
4556                 for ( word=1 ; word <= trie->wordcount ; word++)
4557                 {
4558                     I32 deltanext=0, minnext=0, f = 0, fake;
4559                     struct regnode_charclass_class this_class;
4560
4561                     data_fake.flags = 0;
4562                     if (data) {
4563                         data_fake.whilem_c = data->whilem_c;
4564                         data_fake.last_closep = data->last_closep;
4565                     }
4566                     else
4567                         data_fake.last_closep = &fake;
4568                     data_fake.pos_delta = delta;
4569                     if (flags & SCF_DO_STCLASS) {
4570                         cl_init(pRExC_state, &this_class);
4571                         data_fake.start_class = &this_class;
4572                         f = SCF_DO_STCLASS_AND;
4573                     }
4574                     if (flags & SCF_WHILEM_VISITED_POS)
4575                         f |= SCF_WHILEM_VISITED_POS;
4576
4577                     if (trie->jump[word]) {
4578                         if (!nextbranch)
4579                             nextbranch = trie_node + trie->jump[0];
4580                         scan= trie_node + trie->jump[word];
4581                         /* We go from the jump point to the branch that follows
4582                            it. Note this means we need the vestigal unused branches
4583                            even though they arent otherwise used.
4584                          */
4585                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4586                             &deltanext, (regnode *)nextbranch, &data_fake,
4587                             stopparen, recursed, NULL, f,depth+1);
4588                     }
4589                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4590                         nextbranch= regnext((regnode*)nextbranch);
4591
4592                     if (min1 > (I32)(minnext + trie->minlen))
4593                         min1 = minnext + trie->minlen;
4594                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4595                         max1 = minnext + deltanext + trie->maxlen;
4596                     if (deltanext == I32_MAX)
4597                         is_inf = is_inf_internal = 1;
4598
4599                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4600                         pars++;
4601                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4602                         if ( stopmin > min + min1)
4603                             stopmin = min + min1;
4604                         flags &= ~SCF_DO_SUBSTR;
4605                         if (data)
4606                             data->flags |= SCF_SEEN_ACCEPT;
4607                     }
4608                     if (data) {
4609                         if (data_fake.flags & SF_HAS_EVAL)
4610                             data->flags |= SF_HAS_EVAL;
4611                         data->whilem_c = data_fake.whilem_c;
4612                     }
4613                     if (flags & SCF_DO_STCLASS)
4614                         cl_or(pRExC_state, &accum, &this_class);
4615                 }
4616             }
4617             if (flags & SCF_DO_SUBSTR) {
4618                 data->pos_min += min1;
4619                 data->pos_delta += max1 - min1;
4620                 if (max1 != min1 || is_inf)
4621                     data->longest = &(data->longest_float);
4622             }
4623             min += min1;
4624             delta += max1 - min1;
4625             if (flags & SCF_DO_STCLASS_OR) {
4626                 cl_or(pRExC_state, data->start_class, &accum);
4627                 if (min1) {
4628                     cl_and(data->start_class, and_withp);
4629                     flags &= ~SCF_DO_STCLASS;
4630                 }
4631             }
4632             else if (flags & SCF_DO_STCLASS_AND) {
4633                 if (min1) {
4634                     cl_and(data->start_class, &accum);
4635                     flags &= ~SCF_DO_STCLASS;
4636                 }
4637                 else {
4638                     /* Switch to OR mode: cache the old value of
4639                      * data->start_class */
4640                     INIT_AND_WITHP;
4641                     StructCopy(data->start_class, and_withp,
4642                                struct regnode_charclass_class);
4643                     flags &= ~SCF_DO_STCLASS_AND;
4644                     StructCopy(&accum, data->start_class,
4645                                struct regnode_charclass_class);
4646                     flags |= SCF_DO_STCLASS_OR;
4647                     SET_SSC_EOS(data->start_class);
4648                 }
4649             }
4650             scan= tail;
4651             continue;
4652         }
4653 #else
4654         else if (PL_regkind[OP(scan)] == TRIE) {
4655             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4656             U8*bang=NULL;
4657
4658             min += trie->minlen;
4659             delta += (trie->maxlen - trie->minlen);
4660             flags &= ~SCF_DO_STCLASS; /* xxx */
4661             if (flags & SCF_DO_SUBSTR) {
4662                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4663                 data->pos_min += trie->minlen;
4664                 data->pos_delta += (trie->maxlen - trie->minlen);
4665                 if (trie->maxlen != trie->minlen)
4666                     data->longest = &(data->longest_float);
4667             }
4668             if (trie->jump) /* no more substrings -- for now /grr*/
4669                 flags &= ~SCF_DO_SUBSTR;
4670         }
4671 #endif /* old or new */
4672 #endif /* TRIE_STUDY_OPT */
4673
4674         /* Else: zero-length, ignore. */
4675         scan = regnext(scan);
4676     }
4677     if (frame) {
4678         last = frame->last;
4679         scan = frame->next;
4680         stopparen = frame->stop;
4681         frame = frame->prev;
4682         goto fake_study_recurse;
4683     }
4684
4685   finish:
4686     assert(!frame);
4687     DEBUG_STUDYDATA("pre-fin:",data,depth);
4688
4689     *scanp = scan;
4690     *deltap = is_inf_internal ? I32_MAX : delta;
4691     if (flags & SCF_DO_SUBSTR && is_inf)
4692         data->pos_delta = I32_MAX - data->pos_min;
4693     if (is_par > (I32)U8_MAX)
4694         is_par = 0;
4695     if (is_par && pars==1 && data) {
4696         data->flags |= SF_IN_PAR;
4697         data->flags &= ~SF_HAS_PAR;
4698     }
4699     else if (pars && data) {
4700         data->flags |= SF_HAS_PAR;
4701         data->flags &= ~SF_IN_PAR;
4702     }
4703     if (flags & SCF_DO_STCLASS_OR)
4704         cl_and(data->start_class, and_withp);
4705     if (flags & SCF_TRIE_RESTUDY)
4706         data->flags |=  SCF_TRIE_RESTUDY;
4707
4708     DEBUG_STUDYDATA("post-fin:",data,depth);
4709
4710     return min < stopmin ? min : stopmin;
4711 }
4712
4713 STATIC U32
4714 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4715 {
4716     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4717
4718     PERL_ARGS_ASSERT_ADD_DATA;
4719
4720     Renewc(RExC_rxi->data,
4721            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4722            char, struct reg_data);
4723     if(count)
4724         Renew(RExC_rxi->data->what, count + n, U8);
4725     else
4726         Newx(RExC_rxi->data->what, n, U8);
4727     RExC_rxi->data->count = count + n;
4728     Copy(s, RExC_rxi->data->what + count, n, U8);
4729     return count;
4730 }
4731
4732 /*XXX: todo make this not included in a non debugging perl */
4733 #ifndef PERL_IN_XSUB_RE
4734 void
4735 Perl_reginitcolors(pTHX)
4736 {
4737     dVAR;
4738     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4739     if (s) {
4740         char *t = savepv(s);
4741         int i = 0;
4742         PL_colors[0] = t;
4743         while (++i < 6) {
4744             t = strchr(t, '\t');
4745             if (t) {
4746                 *t = '\0';
4747                 PL_colors[i] = ++t;
4748             }
4749             else
4750                 PL_colors[i] = t = (char *)"";
4751         }
4752     } else {
4753         int i = 0;
4754         while (i < 6)
4755             PL_colors[i++] = (char *)"";
4756     }
4757     PL_colorset = 1;
4758 }
4759 #endif
4760
4761
4762 #ifdef TRIE_STUDY_OPT
4763 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4764     STMT_START {                                            \
4765         if (                                                \
4766               (data.flags & SCF_TRIE_RESTUDY)               \
4767               && ! restudied++                              \
4768         ) {                                                 \
4769             dOsomething;                                    \
4770             goto reStudy;                                   \
4771         }                                                   \
4772     } STMT_END
4773 #else
4774 #define CHECK_RESTUDY_GOTO_butfirst
4775 #endif
4776
4777 /*
4778  * pregcomp - compile a regular expression into internal code
4779  *
4780  * Decides which engine's compiler to call based on the hint currently in
4781  * scope
4782  */
4783
4784 #ifndef PERL_IN_XSUB_RE
4785
4786 /* return the currently in-scope regex engine (or the default if none)  */
4787
4788 regexp_engine const *
4789 Perl_current_re_engine(pTHX)
4790 {
4791     dVAR;
4792
4793     if (IN_PERL_COMPILETIME) {
4794         HV * const table = GvHV(PL_hintgv);
4795         SV **ptr;
4796
4797         if (!table)
4798             return &PL_core_reg_engine;
4799         ptr = hv_fetchs(table, "regcomp", FALSE);
4800         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4801             return &PL_core_reg_engine;
4802         return INT2PTR(regexp_engine*,SvIV(*ptr));
4803     }
4804     else {
4805         SV *ptr;
4806         if (!PL_curcop->cop_hints_hash)
4807             return &PL_core_reg_engine;
4808         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4809         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4810             return &PL_core_reg_engine;
4811         return INT2PTR(regexp_engine*,SvIV(ptr));
4812     }
4813 }
4814
4815
4816 REGEXP *
4817 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4818 {
4819     dVAR;
4820     regexp_engine const *eng = current_re_engine();
4821     GET_RE_DEBUG_FLAGS_DECL;
4822
4823     PERL_ARGS_ASSERT_PREGCOMP;
4824
4825     /* Dispatch a request to compile a regexp to correct regexp engine. */
4826     DEBUG_COMPILE_r({
4827         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4828                         PTR2UV(eng));
4829     });
4830     return CALLREGCOMP_ENG(eng, pattern, flags);
4831 }
4832 #endif
4833
4834 /* public(ish) entry point for the perl core's own regex compiling code.
4835  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4836  * pattern rather than a list of OPs, and uses the internal engine rather
4837  * than the current one */
4838
4839 REGEXP *
4840 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4841 {
4842     SV *pat = pattern; /* defeat constness! */
4843     PERL_ARGS_ASSERT_RE_COMPILE;
4844     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4845 #ifdef PERL_IN_XSUB_RE
4846                                 &my_reg_engine,
4847 #else
4848                                 &PL_core_reg_engine,
4849 #endif
4850                                 NULL, NULL, rx_flags, 0);
4851 }
4852
4853 /* see if there are any run-time code blocks in the pattern.
4854  * False positives are allowed */
4855
4856 static bool
4857 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4858                     U32 pm_flags, char *pat, STRLEN plen)
4859 {
4860     int n = 0;
4861     STRLEN s;
4862
4863     /* avoid infinitely recursing when we recompile the pattern parcelled up
4864      * as qr'...'. A single constant qr// string can't have have any
4865      * run-time component in it, and thus, no runtime code. (A non-qr
4866      * string, however, can, e.g. $x =~ '(?{})') */
4867     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4868         return 0;
4869
4870     for (s = 0; s < plen; s++) {
4871         if (n < pRExC_state->num_code_blocks
4872             && s == pRExC_state->code_blocks[n].start)
4873         {
4874             s = pRExC_state->code_blocks[n].end;
4875             n++;
4876             continue;
4877         }
4878         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4879          * positives here */
4880         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4881             (pat[s+2] == '{'
4882                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4883         )
4884             return 1;
4885     }
4886     return 0;
4887 }
4888
4889 /* Handle run-time code blocks. We will already have compiled any direct
4890  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4891  * copy of it, but with any literal code blocks blanked out and
4892  * appropriate chars escaped; then feed it into
4893  *
4894  *    eval "qr'modified_pattern'"
4895  *
4896  * For example,
4897  *
4898  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4899  *
4900  * becomes
4901  *
4902  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4903  *
4904  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4905  * and merge them with any code blocks of the original regexp.
4906  *
4907  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4908  * instead, just save the qr and return FALSE; this tells our caller that
4909  * the original pattern needs upgrading to utf8.
4910  */
4911
4912 static bool
4913 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4914     char *pat, STRLEN plen)
4915 {
4916     SV *qr;
4917
4918     GET_RE_DEBUG_FLAGS_DECL;
4919
4920     if (pRExC_state->runtime_code_qr) {
4921         /* this is the second time we've been called; this should
4922          * only happen if the main pattern got upgraded to utf8
4923          * during compilation; re-use the qr we compiled first time
4924          * round (which should be utf8 too)
4925          */
4926         qr = pRExC_state->runtime_code_qr;
4927         pRExC_state->runtime_code_qr = NULL;
4928         assert(RExC_utf8 && SvUTF8(qr));
4929     }
4930     else {
4931         int n = 0;
4932         STRLEN s;
4933         char *p, *newpat;
4934         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4935         SV *sv, *qr_ref;
4936         dSP;
4937
4938         /* determine how many extra chars we need for ' and \ escaping */
4939         for (s = 0; s < plen; s++) {
4940             if (pat[s] == '\'' || pat[s] == '\\')
4941                 newlen++;
4942         }
4943
4944         Newx(newpat, newlen, char);
4945         p = newpat;
4946         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4947
4948         for (s = 0; s < plen; s++) {
4949             if (n < pRExC_state->num_code_blocks
4950                 && s == pRExC_state->code_blocks[n].start)
4951             {
4952                 /* blank out literal code block */
4953                 assert(pat[s] == '(');
4954                 while (s <= pRExC_state->code_blocks[n].end) {
4955                     *p++ = '_';
4956                     s++;
4957                 }
4958                 s--;
4959                 n++;
4960                 continue;
4961             }
4962             if (pat[s] == '\'' || pat[s] == '\\')
4963                 *p++ = '\\';
4964             *p++ = pat[s];
4965         }
4966         *p++ = '\'';
4967         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4968             *p++ = 'x';
4969         *p++ = '\0';
4970         DEBUG_COMPILE_r({
4971             PerlIO_printf(Perl_debug_log,
4972                 "%sre-parsing pattern for runtime code:%s %s\n",
4973                 PL_colors[4],PL_colors[5],newpat);
4974         });
4975
4976         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4977         Safefree(newpat);
4978
4979         ENTER;
4980         SAVETMPS;
4981         save_re_context();
4982         PUSHSTACKi(PERLSI_REQUIRE);
4983         /* this causes the toker to collapse \\ into \ when parsing
4984          * qr''; normally only q'' does this. It also alters hints
4985          * handling */
4986         PL_reg_state.re_reparsing = TRUE;
4987         eval_sv(sv, G_SCALAR);
4988         SvREFCNT_dec_NN(sv);
4989         SPAGAIN;
4990         qr_ref = POPs;
4991         PUTBACK;
4992         {
4993             SV * const errsv = ERRSV;
4994             if (SvTRUE_NN(errsv))
4995             {
4996                 Safefree(pRExC_state->code_blocks);
4997                 /* use croak_sv ? */
4998                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4999             }
5000         }
5001         assert(SvROK(qr_ref));
5002         qr = SvRV(qr_ref);
5003         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5004         /* the leaving below frees the tmp qr_ref.
5005          * Give qr a life of its own */
5006         SvREFCNT_inc(qr);
5007         POPSTACK;
5008         FREETMPS;
5009         LEAVE;
5010
5011     }
5012
5013     if (!RExC_utf8 && SvUTF8(qr)) {
5014         /* first time through; the pattern got upgraded; save the
5015          * qr for the next time through */
5016         assert(!pRExC_state->runtime_code_qr);
5017         pRExC_state->runtime_code_qr = qr;
5018         return 0;
5019     }
5020
5021
5022     /* extract any code blocks within the returned qr//  */
5023
5024
5025     /* merge the main (r1) and run-time (r2) code blocks into one */
5026     {
5027         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5028         struct reg_code_block *new_block, *dst;
5029         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5030         int i1 = 0, i2 = 0;
5031
5032         if (!r2->num_code_blocks) /* we guessed wrong */
5033         {
5034             SvREFCNT_dec_NN(qr);
5035             return 1;
5036         }
5037
5038         Newx(new_block,
5039             r1->num_code_blocks + r2->num_code_blocks,
5040             struct reg_code_block);
5041         dst = new_block;
5042
5043         while (    i1 < r1->num_code_blocks
5044                 || i2 < r2->num_code_blocks)
5045         {
5046             struct reg_code_block *src;
5047             bool is_qr = 0;
5048
5049             if (i1 == r1->num_code_blocks) {
5050                 src = &r2->code_blocks[i2++];
5051                 is_qr = 1;
5052             }
5053             else if (i2 == r2->num_code_blocks)
5054                 src = &r1->code_blocks[i1++];
5055             else if (  r1->code_blocks[i1].start
5056                      < r2->code_blocks[i2].start)
5057             {
5058                 src = &r1->code_blocks[i1++];
5059                 assert(src->end < r2->code_blocks[i2].start);
5060             }
5061             else {
5062                 assert(  r1->code_blocks[i1].start
5063                        > r2->code_blocks[i2].start);
5064                 src = &r2->code_blocks[i2++];
5065                 is_qr = 1;
5066                 assert(src->end < r1->code_blocks[i1].start);
5067             }
5068
5069             assert(pat[src->start] == '(');
5070             assert(pat[src->end]   == ')');
5071             dst->start      = src->start;
5072             dst->end        = src->end;
5073             dst->block      = src->block;
5074             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5075                                     : src->src_regex;
5076             dst++;
5077         }
5078         r1->num_code_blocks += r2->num_code_blocks;
5079         Safefree(r1->code_blocks);
5080         r1->code_blocks = new_block;
5081     }
5082
5083     SvREFCNT_dec_NN(qr);
5084     return 1;
5085 }
5086
5087
5088 STATIC bool
5089 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5090 {
5091     /* This is the common code for setting up the floating and fixed length
5092      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5093      * as to whether succeeded or not */
5094
5095     I32 t,ml;
5096
5097     if (! (longest_length
5098            || (eol /* Can't have SEOL and MULTI */
5099                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5100           )
5101             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5102         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5103     {
5104         return FALSE;
5105     }
5106
5107     /* copy the information about the longest from the reg_scan_data
5108         over to the program. */
5109     if (SvUTF8(sv_longest)) {
5110         *rx_utf8 = sv_longest;
5111         *rx_substr = NULL;
5112     } else {
5113         *rx_substr = sv_longest;
5114         *rx_utf8 = NULL;
5115     }
5116     /* end_shift is how many chars that must be matched that
5117         follow this item. We calculate it ahead of time as once the
5118         lookbehind offset is added in we lose the ability to correctly
5119         calculate it.*/
5120     ml = minlen ? *(minlen) : (I32)longest_length;
5121     *rx_end_shift = ml - offset
5122         - longest_length + (SvTAIL(sv_longest) != 0)
5123         + lookbehind;
5124
5125     t = (eol/* Can't have SEOL and MULTI */
5126          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5127     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5128
5129     return TRUE;
5130 }
5131
5132 /*
5133  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5134  * regular expression into internal code.
5135  * The pattern may be passed either as:
5136  *    a list of SVs (patternp plus pat_count)
5137  *    a list of OPs (expr)
5138  * If both are passed, the SV list is used, but the OP list indicates
5139  * which SVs are actually pre-compiled code blocks
5140  *
5141  * The SVs in the list have magic and qr overloading applied to them (and
5142  * the list may be modified in-place with replacement SVs in the latter
5143  * case).
5144  *
5145  * If the pattern hasn't changed from old_re, then old_re will be
5146  * returned.
5147  *
5148  * eng is the current engine. If that engine has an op_comp method, then
5149  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5150  * do the initial concatenation of arguments and pass on to the external
5151  * engine.
5152  *
5153  * If is_bare_re is not null, set it to a boolean indicating whether the
5154  * arg list reduced (after overloading) to a single bare regex which has
5155  * been returned (i.e. /$qr/).
5156  *
5157  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5158  *
5159  * pm_flags contains the PMf_* flags, typically based on those from the
5160  * pm_flags field of the related PMOP. Currently we're only interested in
5161  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5162  *
5163  * We can't allocate space until we know how big the compiled form will be,
5164  * but we can't compile it (and thus know how big it is) until we've got a
5165  * place to put the code.  So we cheat:  we compile it twice, once with code
5166  * generation turned off and size counting turned on, and once "for real".
5167  * This also means that we don't allocate space until we are sure that the
5168  * thing really will compile successfully, and we never have to move the
5169  * code and thus invalidate pointers into it.  (Note that it has to be in
5170  * one piece because free() must be able to free it all.) [NB: not true in perl]
5171  *
5172  * Beware that the optimization-preparation code in here knows about some
5173  * of the structure of the compiled regexp.  [I'll say.]
5174  */
5175
5176 REGEXP *
5177 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5178                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5179                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5180 {
5181     dVAR;
5182     REGEXP *rx;
5183     struct regexp *r;
5184     regexp_internal *ri;
5185     STRLEN plen;
5186     char  * VOL exp;
5187     char* xend;
5188     regnode *scan;
5189     I32 flags;
5190     I32 minlen = 0;
5191     U32 rx_flags;
5192     SV * VOL pat;
5193     SV * VOL code_blocksv = NULL;
5194
5195     /* these are all flags - maybe they should be turned
5196      * into a single int with different bit masks */
5197     I32 sawlookahead = 0;
5198     I32 sawplus = 0;
5199     I32 sawopen = 0;
5200     bool used_setjump = FALSE;
5201     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5202     bool code_is_utf8 = 0;
5203     bool VOL recompile = 0;
5204     bool runtime_code = 0;
5205     U8 jump_ret = 0;
5206     dJMPENV;
5207     scan_data_t data;
5208     RExC_state_t RExC_state;
5209     RExC_state_t * const pRExC_state = &RExC_state;
5210 #ifdef TRIE_STUDY_OPT
5211     int restudied;
5212     RExC_state_t copyRExC_state;
5213 #endif
5214     GET_RE_DEBUG_FLAGS_DECL;
5215
5216     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5217
5218     DEBUG_r(if (!PL_colorset) reginitcolors());
5219
5220 #ifndef PERL_IN_XSUB_RE
5221     /* Initialize these here instead of as-needed, as is quick and avoids
5222      * having to test them each time otherwise */
5223     if (! PL_AboveLatin1) {
5224         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5225         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5226         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5227
5228         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5229                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5230         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5231                                 = _new_invlist_C_array(PosixAlnum_invlist);
5232
5233         PL_L1Posix_ptrs[_CC_ALPHA]
5234                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5235         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5236
5237         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5238         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5239
5240         /* Cased is the same as Alpha in the ASCII range */
5241         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5242         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5243
5244         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5245         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5246
5247         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5248         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5249
5250         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5251         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5252
5253         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5254         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5255
5256         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5257         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5258
5259         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5260         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5261
5262         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5263         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5264         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5265         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5266
5267         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5268         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5269
5270         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5271
5272         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5273         PL_L1Posix_ptrs[_CC_WORDCHAR]
5274                                 = _new_invlist_C_array(L1PosixWord_invlist);
5275
5276         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5277         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5278
5279         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5280     }
5281 #endif
5282
5283     pRExC_state->code_blocks = NULL;
5284     pRExC_state->num_code_blocks = 0;
5285
5286     if (is_bare_re)
5287         *is_bare_re = FALSE;
5288
5289     if (expr && (expr->op_type == OP_LIST ||
5290                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5291
5292         /* is the source UTF8, and how many code blocks are there? */
5293         OP *o;
5294         int ncode = 0;
5295
5296         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5298                 code_is_utf8 = 1;
5299             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5300                 /* count of DO blocks */
5301                 ncode++;
5302         }
5303         if (ncode) {
5304             pRExC_state->num_code_blocks = ncode;
5305             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5306         }
5307     }
5308
5309     if (pat_count) {
5310         /* handle a list of SVs */
5311
5312         SV **svp;
5313
5314         /* apply magic and RE overloading to each arg */
5315         for (svp = patternp; svp < patternp + pat_count; svp++) {
5316             SV *rx = *svp;
5317             SvGETMAGIC(rx);
5318             if (SvROK(rx) && SvAMAGIC(rx)) {
5319                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5320                 if (sv) {
5321                     if (SvROK(sv))
5322                         sv = SvRV(sv);
5323                     if (SvTYPE(sv) != SVt_REGEXP)
5324                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5325                     *svp = sv;
5326                 }
5327             }
5328         }
5329
5330         if (pat_count > 1) {
5331             /* concat multiple args and find any code block indexes */
5332
5333             OP *o = NULL;
5334             int n = 0;
5335             bool utf8 = 0;
5336             STRLEN orig_patlen = 0;
5337
5338             if (pRExC_state->num_code_blocks) {
5339                 o = cLISTOPx(expr)->op_first;
5340                 assert(   o->op_type == OP_PUSHMARK
5341                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5342                        || o->op_type == OP_PADRANGE);
5343                 o = o->op_sibling;
5344             }
5345
5346             pat = newSVpvn("", 0);
5347             SAVEFREESV(pat);
5348
5349             /* determine if the pattern is going to be utf8 (needed
5350              * in advance to align code block indices correctly).
5351              * XXX This could fail to be detected for an arg with
5352              * overloading but not concat overloading; but the main effect
5353              * in this obscure case is to need a 'use re eval' for a
5354              * literal code block */
5355             for (svp = patternp; svp < patternp + pat_count; svp++) {
5356                 if (SvUTF8(*svp))
5357                     utf8 = 1;
5358             }
5359             if (utf8)
5360                 SvUTF8_on(pat);
5361
5362             for (svp = patternp; svp < patternp + pat_count; svp++) {
5363                 SV *sv, *msv = *svp;
5364                 SV *rx;
5365                 bool code = 0;
5366                 /* we make the assumption here that each op in the list of
5367                  * op_siblings maps to one SV pushed onto the stack,
5368                  * except for code blocks, with have both an OP_NULL and
5369                  * and OP_CONST.
5370                  * This allows us to match up the list of SVs against the
5371                  * list of OPs to find the next code block.
5372                  *
5373                  * Note that       PUSHMARK PADSV PADSV ..
5374                  * is optimised to
5375                  *                 PADRANGE NULL  NULL  ..
5376                  * so the alignment still works. */
5377                 if (o) {
5378                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5379                         assert(n < pRExC_state->num_code_blocks);
5380                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5381                         pRExC_state->code_blocks[n].block = o;
5382                         pRExC_state->code_blocks[n].src_regex = NULL;
5383                         n++;
5384                         code = 1;
5385                         o = o->op_sibling; /* skip CONST */
5386                         assert(o);
5387                     }
5388                     o = o->op_sibling;;
5389                 }
5390
5391                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5392                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5393                 {
5394                     sv_setsv(pat, sv);
5395                     /* overloading involved: all bets are off over literal
5396                      * code. Pretend we haven't seen it */
5397                     pRExC_state->num_code_blocks -= n;
5398                     n = 0;
5399                     rx = NULL;
5400
5401                 }
5402                 else  {
5403                     while (SvAMAGIC(msv)
5404                             && (sv = AMG_CALLunary(msv, string_amg))
5405                             && sv != msv
5406                             &&  !(   SvROK(msv)
5407                                   && SvROK(sv)
5408                                   && SvRV(msv) == SvRV(sv))
5409                     ) {
5410                         msv = sv;
5411                         SvGETMAGIC(msv);
5412                     }
5413                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5414                         msv = SvRV(msv);
5415                     orig_patlen = SvCUR(pat);
5416                     sv_catsv_nomg(pat, msv);
5417                     rx = msv;
5418                     if (code)
5419                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5420                 }
5421
5422                 /* extract any code blocks within any embedded qr//'s */
5423                 if (rx && SvTYPE(rx) == SVt_REGEXP
5424                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5425                 {
5426
5427                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5428                     if (ri->num_code_blocks) {
5429                         int i;
5430                         /* the presence of an embedded qr// with code means
5431                          * we should always recompile: the text of the
5432                          * qr// may not have changed, but it may be a
5433                          * different closure than last time */
5434                         recompile = 1;
5435                         Renew(pRExC_state->code_blocks,
5436                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5437                             struct reg_code_block);
5438                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5439                         for (i=0; i < ri->num_code_blocks; i++) {
5440                             struct reg_code_block *src, *dst;
5441                             STRLEN offset =  orig_patlen
5442                                 + ReANY((REGEXP *)rx)->pre_prefix;
5443                             assert(n < pRExC_state->num_code_blocks);
5444                             src = &ri->code_blocks[i];
5445                             dst = &pRExC_state->code_blocks[n];
5446                             dst->start      = src->start + offset;
5447                             dst->end        = src->end   + offset;
5448                             dst->block      = src->block;
5449                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5450                                                     src->src_regex
5451                                                         ? src->src_regex
5452                                                         : (REGEXP*)rx);
5453                             n++;
5454                         }
5455                     }
5456                 }
5457             }
5458             SvSETMAGIC(pat);
5459         }
5460         else {
5461             SV *sv;
5462             pat = *patternp;
5463             while (SvAMAGIC(pat)
5464                     && (sv = AMG_CALLunary(pat, string_amg))
5465                     && sv != pat)
5466             {
5467                 pat = sv;
5468                 SvGETMAGIC(pat);
5469             }
5470         }
5471
5472         /* handle bare regex: foo =~ $re */
5473         {
5474             SV *re = pat;
5475             if (SvROK(re))
5476                 re = SvRV(re);
5477             if (SvTYPE(re) == SVt_REGEXP) {
5478                 if (is_bare_re)
5479                     *is_bare_re = TRUE;
5480                 SvREFCNT_inc(re);
5481                 Safefree(pRExC_state->code_blocks);
5482                 return (REGEXP*)re;
5483             }
5484         }
5485     }
5486     else {
5487         /* not a list of SVs, so must be a list of OPs */
5488         assert(expr);
5489         if (expr->op_type == OP_LIST) {
5490             int i = -1;
5491             bool is_code = 0;
5492             OP *o;
5493
5494             pat = newSVpvn("", 0);
5495             SAVEFREESV(pat);
5496             if (code_is_utf8)
5497                 SvUTF8_on(pat);
5498
5499             /* given a list of CONSTs and DO blocks in expr, append all
5500              * the CONSTs to pat, and record the start and end of each
5501              * code block in code_blocks[] (each DO{} op is followed by an
5502              * OP_CONST containing the corresponding literal '(?{...})
5503              * text)
5504              */
5505             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5506                 if (o->op_type == OP_CONST) {
5507                     sv_catsv(pat, cSVOPo_sv);
5508                     if (is_code) {
5509                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5510                         is_code = 0;
5511                     }
5512                 }
5513                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5514                     assert(i+1 < pRExC_state->num_code_blocks);
5515                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5516                     pRExC_state->code_blocks[i].block = o;
5517                     pRExC_state->code_blocks[i].src_regex = NULL;
5518                     is_code = 1;
5519                 }
5520             }
5521         }
5522         else {
5523             assert(expr->op_type == OP_CONST);
5524             pat = cSVOPx_sv(expr);
5525         }
5526     }
5527
5528     exp = SvPV_nomg(pat, plen);
5529
5530     if (!eng->op_comp) {
5531         if ((SvUTF8(pat) && IN_BYTES)
5532                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5533         {
5534             /* make a temporary copy; either to convert to bytes,
5535              * or to avoid repeating get-magic / overloaded stringify */
5536             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5537                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5538         }
5539         Safefree(pRExC_state->code_blocks);
5540         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5541     }
5542
5543     /* ignore the utf8ness if the pattern is 0 length */
5544     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5545     RExC_uni_semantics = 0;
5546     RExC_contains_locale = 0;
5547     pRExC_state->runtime_code_qr = NULL;
5548
5549     /****************** LONG JUMP TARGET HERE***********************/
5550     /* Longjmp back to here if have to switch in midstream to utf8 */
5551     if (! RExC_orig_utf8) {
5552         JMPENV_PUSH(jump_ret);
5553         used_setjump = TRUE;
5554     }
5555
5556     if (jump_ret == 0) {    /* First time through */
5557         xend = exp + plen;
5558
5559         DEBUG_COMPILE_r({
5560             SV *dsv= sv_newmortal();
5561             RE_PV_QUOTED_DECL(s, RExC_utf8,
5562                 dsv, exp, plen, 60);
5563             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5564                            PL_colors[4],PL_colors[5],s);
5565         });
5566     }
5567     else {  /* longjumped back */
5568         U8 *src, *dst;
5569         int n=0;
5570         STRLEN s = 0, d = 0;
5571         bool do_end = 0;
5572
5573         /* If the cause for the longjmp was other than changing to utf8, pop
5574          * our own setjmp, and longjmp to the correct handler */
5575         if (jump_ret != UTF8_LONGJMP) {
5576             JMPENV_POP;
5577             JMPENV_JUMP(jump_ret);
5578         }
5579
5580         GET_RE_DEBUG_FLAGS;
5581
5582         /* It's possible to write a regexp in ascii that represents Unicode
5583         codepoints outside of the byte range, such as via \x{100}. If we
5584         detect such a sequence we have to convert the entire pattern to utf8
5585         and then recompile, as our sizing calculation will have been based
5586         on 1 byte == 1 character, but we will need to use utf8 to encode
5587         at least some part of the pattern, and therefore must convert the whole
5588         thing.
5589         -- dmq */
5590         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5591             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5592
5593         /* upgrade pattern to UTF8, and if there are code blocks,
5594          * recalculate the indices.
5595          * This is essentially an unrolled Perl_bytes_to_utf8() */
5596
5597         src = (U8*)SvPV_nomg(pat, plen);
5598         Newx(dst, plen * 2 + 1, U8);
5599
5600         while (s < plen) {
5601             const UV uv = NATIVE_TO_ASCII(src[s]);
5602             if (UNI_IS_INVARIANT(uv))
5603                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5604             else {
5605                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5606                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5607             }
5608             if (n < pRExC_state->num_code_blocks) {
5609                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5610                     pRExC_state->code_blocks[n].start = d;
5611                     assert(dst[d] == '(');
5612                     do_end = 1;
5613                 }
5614                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5615                     pRExC_state->code_blocks[n].end = d;
5616                     assert(dst[d] == ')');
5617                     do_end = 0;
5618                     n++;
5619                 }
5620             }
5621             s++;
5622             d++;
5623         }
5624         dst[d] = '\0';
5625         plen = d;
5626         exp = (char*) dst;
5627         xend = exp + plen;
5628         SAVEFREEPV(exp);
5629         RExC_orig_utf8 = RExC_utf8 = 1;
5630     }
5631
5632     /* return old regex if pattern hasn't changed */
5633
5634     if (   old_re
5635         && !recompile
5636         && !!RX_UTF8(old_re) == !!RExC_utf8
5637         && RX_PRECOMP(old_re)
5638         && RX_PRELEN(old_re) == plen
5639         && memEQ(RX_PRECOMP(old_re), exp, plen))
5640     {
5641         /* with runtime code, always recompile */
5642         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5643                                             exp, plen);
5644         if (!runtime_code) {
5645             if (used_setjump) {
5646                 JMPENV_POP;
5647             }
5648             Safefree(pRExC_state->code_blocks);
5649             return old_re;
5650         }
5651     }
5652     else if ((pm_flags & PMf_USE_RE_EVAL)
5653                 /* this second condition covers the non-regex literal case,
5654                  * i.e.  $foo =~ '(?{})'. */
5655                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5656                     && (PL_hints & HINT_RE_EVAL))
5657     )
5658         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5659                             exp, plen);
5660
5661 #ifdef TRIE_STUDY_OPT
5662     restudied = 0;
5663 #endif
5664
5665     rx_flags = orig_rx_flags;
5666
5667     if (initial_charset == REGEX_LOCALE_CHARSET) {
5668         RExC_contains_locale = 1;
5669     }
5670     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5671
5672         /* Set to use unicode semantics if the pattern is in utf8 and has the
5673          * 'depends' charset specified, as it means unicode when utf8  */
5674         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5675     }
5676
5677     RExC_precomp = exp;
5678     RExC_flags = rx_flags;
5679     RExC_pm_flags = pm_flags;
5680
5681     if (runtime_code) {
5682         if (TAINTING_get && TAINT_get)
5683             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5684
5685         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5686             /* whoops, we have a non-utf8 pattern, whilst run-time code
5687              * got compiled as utf8. Try again with a utf8 pattern */
5688              JMPENV_JUMP(UTF8_LONGJMP);
5689         }
5690     }
5691     assert(!pRExC_state->runtime_code_qr);
5692
5693     RExC_sawback = 0;
5694
5695     RExC_seen = 0;
5696     RExC_in_lookbehind = 0;
5697     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5698     RExC_extralen = 0;
5699     RExC_override_recoding = 0;
5700     RExC_in_multi_char_class = 0;
5701
5702     /* First pass: determine size, legality. */
5703     RExC_parse = exp;
5704     RExC_start = exp;
5705     RExC_end = xend;
5706     RExC_naughty = 0;
5707     RExC_npar = 1;
5708     RExC_nestroot = 0;
5709     RExC_size = 0L;
5710     RExC_emit = &PL_regdummy;
5711     RExC_whilem_seen = 0;
5712     RExC_open_parens = NULL;
5713     RExC_close_parens = NULL;
5714     RExC_opend = NULL;
5715     RExC_paren_names = NULL;
5716 #ifdef DEBUGGING
5717     RExC_paren_name_list = NULL;
5718 #endif
5719     RExC_recurse = NULL;
5720     RExC_recurse_count = 0;
5721     pRExC_state->code_index = 0;
5722
5723 #if 0 /* REGC() is (currently) a NOP at the first pass.
5724        * Clever compilers notice this and complain. --jhi */
5725     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5726 #endif
5727     DEBUG_PARSE_r(
5728         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5729         RExC_lastnum=0;
5730         RExC_lastparse=NULL;
5731     );
5732     /* reg may croak on us, not giving us a chance to free
5733        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5734        need it to survive as long as the regexp (qr/(?{})/).
5735        We must check that code_blocksv is not already set, because we may
5736        have longjmped back. */
5737     if (pRExC_state->code_blocks && !code_blocksv) {
5738         code_blocksv = newSV_type(SVt_PV);
5739         SAVEFREESV(code_blocksv);
5740         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5741         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5742     }
5743     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5744         RExC_precomp = NULL;
5745         return(NULL);
5746     }
5747     if (code_blocksv)
5748         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5749
5750     /* Here, finished first pass.  Get rid of any added setjmp */
5751     if (used_setjump) {
5752         JMPENV_POP;
5753     }
5754
5755     DEBUG_PARSE_r({
5756         PerlIO_printf(Perl_debug_log,
5757             "Required size %"IVdf" nodes\n"
5758             "Starting second pass (creation)\n",
5759             (IV)RExC_size);
5760         RExC_lastnum=0;
5761         RExC_lastparse=NULL;
5762     });
5763
5764     /* The first pass could have found things that force Unicode semantics */
5765     if ((RExC_utf8 || RExC_uni_semantics)
5766          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5767     {
5768         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5769     }
5770
5771     /* Small enough for pointer-storage convention?
5772        If extralen==0, this means that we will not need long jumps. */
5773     if (RExC_size >= 0x10000L && RExC_extralen)
5774         RExC_size += RExC_extralen;
5775     else
5776         RExC_extralen = 0;
5777     if (RExC_whilem_seen > 15)
5778         RExC_whilem_seen = 15;
5779
5780     /* Allocate space and zero-initialize. Note, the two step process
5781        of zeroing when in debug mode, thus anything assigned has to
5782        happen after that */
5783     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5784     r = ReANY(rx);
5785     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5786          char, regexp_internal);
5787     if ( r == NULL || ri == NULL )
5788         FAIL("Regexp out of space");
5789 #ifdef DEBUGGING
5790     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5791     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5792 #else
5793     /* bulk initialize base fields with 0. */
5794     Zero(ri, sizeof(regexp_internal), char);
5795 #endif
5796
5797     /* non-zero initialization begins here */
5798     RXi_SET( r, ri );
5799     r->engine= eng;
5800     r->extflags = rx_flags;
5801     if (pm_flags & PMf_IS_QR) {
5802         ri->code_blocks = pRExC_state->code_blocks;
5803         ri->num_code_blocks = pRExC_state->num_code_blocks;
5804     }
5805     else
5806     {
5807         int n;
5808         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5809             if (pRExC_state->code_blocks[n].src_regex)
5810                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5811         SAVEFREEPV(pRExC_state->code_blocks);
5812     }
5813
5814     {
5815         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5816         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5817
5818         /* The caret is output if there are any defaults: if not all the STD
5819          * flags are set, or if no character set specifier is needed */
5820         bool has_default =
5821                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5822                     || ! has_charset);
5823         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5824         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5825                             >> RXf_PMf_STD_PMMOD_SHIFT);
5826         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5827         char *p;
5828         /* Allocate for the worst case, which is all the std flags are turned
5829          * on.  If more precision is desired, we could do a population count of
5830          * the flags set.  This could be done with a small lookup table, or by
5831          * shifting, masking and adding, or even, when available, assembly
5832          * language for a machine-language population count.
5833          * We never output a minus, as all those are defaults, so are
5834          * covered by the caret */
5835         const STRLEN wraplen = plen + has_p + has_runon
5836             + has_default       /* If needs a caret */
5837
5838                 /* If needs a character set specifier */
5839             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5840             + (sizeof(STD_PAT_MODS) - 1)
5841             + (sizeof("(?:)") - 1);
5842
5843         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5844         r->xpv_len_u.xpvlenu_pv = p;
5845         if (RExC_utf8)
5846             SvFLAGS(rx) |= SVf_UTF8;
5847         *p++='('; *p++='?';
5848
5849         /* If a default, cover it using the caret */
5850         if (has_default) {
5851             *p++= DEFAULT_PAT_MOD;
5852         }
5853         if (has_charset) {
5854             STRLEN len;
5855             const char* const name = get_regex_charset_name(r->extflags, &len);
5856             Copy(name, p, len, char);
5857             p += len;
5858         }
5859         if (has_p)
5860             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5861         {
5862             char ch;
5863             while((ch = *fptr++)) {
5864                 if(reganch & 1)
5865                     *p++ = ch;
5866                 reganch >>= 1;
5867             }
5868         }
5869
5870         *p++ = ':';
5871         Copy(RExC_precomp, p, plen, char);
5872         assert ((RX_WRAPPED(rx) - p) < 16);
5873         r->pre_prefix = p - RX_WRAPPED(rx);
5874         p += plen;
5875         if (has_runon)
5876             *p++ = '\n';
5877         *p++ = ')';
5878         *p = 0;
5879         SvCUR_set(rx, p - RX_WRAPPED(rx));
5880     }
5881
5882     r->intflags = 0;
5883     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5884
5885     if (RExC_seen & REG_SEEN_RECURSE) {
5886         Newxz(RExC_open_parens, RExC_npar,regnode *);
5887         SAVEFREEPV(RExC_open_parens);
5888         Newxz(RExC_close_parens,RExC_npar,regnode *);
5889         SAVEFREEPV(RExC_close_parens);
5890     }
5891
5892     /* Useful during FAIL. */
5893 #ifdef RE_TRACK_PATTERN_OFFSETS
5894     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5895     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5896                           "%s %"UVuf" bytes for offset annotations.\n",
5897                           ri->u.offsets ? "Got" : "Couldn't get",
5898                           (UV)((2*RExC_size+1) * sizeof(U32))));
5899 #endif
5900     SetProgLen(ri,RExC_size);
5901     RExC_rx_sv = rx;
5902     RExC_rx = r;
5903     RExC_rxi = ri;
5904
5905     /* Second pass: emit code. */
5906     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5907     RExC_pm_flags = pm_flags;
5908     RExC_parse = exp;
5909     RExC_end = xend;
5910     RExC_naughty = 0;
5911     RExC_npar = 1;
5912     RExC_emit_start = ri->program;
5913     RExC_emit = ri->program;
5914     RExC_emit_bound = ri->program + RExC_size + 1;
5915     pRExC_state->code_index = 0;
5916
5917     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5918     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5919         ReREFCNT_dec(rx);
5920         return(NULL);
5921     }
5922     /* XXXX To minimize changes to RE engine we always allocate
5923        3-units-long substrs field. */
5924     Newx(r->substrs, 1, struct reg_substr_data);
5925     if (RExC_recurse_count) {
5926         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5927         SAVEFREEPV(RExC_recurse);
5928     }
5929
5930 reStudy:
5931     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5932     Zero(r->substrs, 1, struct reg_substr_data);
5933
5934 #ifdef TRIE_STUDY_OPT
5935     if (!restudied) {
5936         StructCopy(&zero_scan_data, &data, scan_data_t);
5937         copyRExC_state = RExC_state;
5938     } else {
5939         U32 seen=RExC_seen;
5940         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5941
5942         RExC_state = copyRExC_state;
5943         if (seen & REG_TOP_LEVEL_BRANCHES)
5944             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5945         else
5946             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5947         StructCopy(&zero_scan_data, &data, scan_data_t);
5948     }
5949 #else
5950     StructCopy(&zero_scan_data, &data, scan_data_t);
5951 #endif
5952
5953     /* Dig out information for optimizations. */
5954     r->extflags = RExC_flags; /* was pm_op */
5955     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5956
5957     if (UTF)
5958         SvUTF8_on(rx);  /* Unicode in it? */
5959     ri->regstclass = NULL;
5960     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5961         r->intflags |= PREGf_NAUGHTY;
5962     scan = ri->program + 1;             /* First BRANCH. */
5963
5964     /* testing for BRANCH here tells us whether there is "must appear"
5965        data in the pattern. If there is then we can use it for optimisations */
5966     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5967         I32 fake;
5968         STRLEN longest_float_length, longest_fixed_length;
5969         struct regnode_charclass_class ch_class; /* pointed to by data */
5970         int stclass_flag;
5971         I32 last_close = 0; /* pointed to by data */
5972         regnode *first= scan;
5973         regnode *first_next= regnext(first);
5974         /*
5975          * Skip introductions and multiplicators >= 1
5976          * so that we can extract the 'meat' of the pattern that must
5977          * match in the large if() sequence following.
5978          * NOTE that EXACT is NOT covered here, as it is normally
5979          * picked up by the optimiser separately.
5980          *
5981          * This is unfortunate as the optimiser isnt handling lookahead
5982          * properly currently.
5983          *
5984          */
5985         while ((OP(first) == OPEN && (sawopen = 1)) ||
5986                /* An OR of *one* alternative - should not happen now. */
5987             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5988             /* for now we can't handle lookbehind IFMATCH*/
5989             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5990             (OP(first) == PLUS) ||
5991             (OP(first) == MINMOD) ||
5992                /* An {n,m} with n>0 */
5993             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5994             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5995         {
5996                 /*
5997                  * the only op that could be a regnode is PLUS, all the rest
5998                  * will be regnode_1 or regnode_2.
5999                  *
6000                  */
6001                 if (OP(first) == PLUS)
6002                     sawplus = 1;
6003                 else
6004                     first += regarglen[OP(first)];
6005
6006                 first = NEXTOPER(first);
6007                 first_next= regnext(first);
6008         }
6009
6010         /* Starting-point info. */
6011       again:
6012         DEBUG_PEEP("first:",first,0);
6013         /* Ignore EXACT as we deal with it later. */
6014         if (PL_regkind[OP(first)] == EXACT) {
6015             if (OP(first) == EXACT)
6016                 NOOP;   /* Empty, get anchored substr later. */
6017             else
6018                 ri->regstclass = first;
6019         }
6020 #ifdef TRIE_STCLASS
6021         else if (PL_regkind[OP(first)] == TRIE &&
6022                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6023         {
6024             regnode *trie_op;
6025             /* this can happen only on restudy */
6026             if ( OP(first) == TRIE ) {
6027                 struct regnode_1 *trieop = (struct regnode_1 *)
6028                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6029                 StructCopy(first,trieop,struct regnode_1);
6030                 trie_op=(regnode *)trieop;
6031             } else {
6032                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6033                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6034                 StructCopy(first,trieop,struct regnode_charclass);
6035                 trie_op=(regnode *)trieop;
6036             }
6037             OP(trie_op)+=2;
6038             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6039             ri->regstclass = trie_op;
6040         }
6041 #endif
6042         else if (REGNODE_SIMPLE(OP(first)))
6043             ri->regstclass = first;
6044         else if (PL_regkind[OP(first)] == BOUND ||
6045                  PL_regkind[OP(first)] == NBOUND)
6046             ri->regstclass = first;
6047         else if (PL_regkind[OP(first)] == BOL) {
6048             r->extflags |= (OP(first) == MBOL
6049                            ? RXf_ANCH_MBOL
6050                            : (OP(first) == SBOL
6051                               ? RXf_ANCH_SBOL
6052                               : RXf_ANCH_BOL));
6053             first = NEXTOPER(first);
6054             goto again;
6055         }
6056         else if (OP(first) == GPOS) {
6057             r->extflags |= RXf_ANCH_GPOS;
6058             first = NEXTOPER(first);
6059             goto again;
6060         }
6061         else if ((!sawopen || !RExC_sawback) &&
6062             (OP(first) == STAR &&
6063             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6064             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6065         {
6066             /* turn .* into ^.* with an implied $*=1 */
6067             const int type =
6068                 (OP(NEXTOPER(first)) == REG_ANY)
6069                     ? RXf_ANCH_MBOL
6070                     : RXf_ANCH_SBOL;
6071             r->extflags |= type;
6072             r->intflags |= PREGf_IMPLICIT;
6073             first = NEXTOPER(first);
6074             goto again;
6075         }
6076         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6077             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6078             /* x+ must match at the 1st pos of run of x's */
6079             r->intflags |= PREGf_SKIP;
6080
6081         /* Scan is after the zeroth branch, first is atomic matcher. */
6082 #ifdef TRIE_STUDY_OPT
6083         DEBUG_PARSE_r(
6084             if (!restudied)
6085                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6086                               (IV)(first - scan + 1))
6087         );
6088 #else
6089         DEBUG_PARSE_r(
6090             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6091                 (IV)(first - scan + 1))
6092         );
6093 #endif
6094
6095
6096         /*
6097         * If there's something expensive in the r.e., find the
6098         * longest literal string that must appear and make it the
6099         * regmust.  Resolve ties in favor of later strings, since
6100         * the regstart check works with the beginning of the r.e.
6101         * and avoiding duplication strengthens checking.  Not a
6102         * strong reason, but sufficient in the absence of others.
6103         * [Now we resolve ties in favor of the earlier string if
6104         * it happens that c_offset_min has been invalidated, since the
6105         * earlier string may buy us something the later one won't.]
6106         */
6107
6108         data.longest_fixed = newSVpvs("");
6109         data.longest_float = newSVpvs("");
6110         data.last_found = newSVpvs("");
6111         data.longest = &(data.longest_fixed);
6112         ENTER_with_name("study_chunk");
6113         SAVEFREESV(data.longest_fixed);
6114         SAVEFREESV(data.longest_float);
6115         SAVEFREESV(data.last_found);
6116         first = scan;
6117         if (!ri->regstclass) {
6118             cl_init(pRExC_state, &ch_class);
6119             data.start_class = &ch_class;
6120             stclass_flag = SCF_DO_STCLASS_AND;
6121         } else                          /* XXXX Check for BOUND? */
6122             stclass_flag = 0;
6123         data.last_closep = &last_close;
6124
6125         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6126             &data, -1, NULL, NULL,
6127             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6128
6129
6130         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6131
6132
6133         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6134              && data.last_start_min == 0 && data.last_end > 0
6135              && !RExC_seen_zerolen
6136              && !(RExC_seen & REG_SEEN_VERBARG)
6137              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6138             r->extflags |= RXf_CHECK_ALL;
6139         scan_commit(pRExC_state, &data,&minlen,0);
6140
6141         longest_float_length = CHR_SVLEN(data.longest_float);
6142
6143         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6144                    && data.offset_fixed == data.offset_float_min
6145                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6146             && S_setup_longest (aTHX_ pRExC_state,
6147                                     data.longest_float,
6148                                     &(r->float_utf8),
6149                                     &(r->float_substr),
6150                                     &(r->float_end_shift),
6151                                     data.lookbehind_float,
6152                                     data.offset_float_min,
6153                                     data.minlen_float,
6154                                     longest_float_length,
6155                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6156                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6157         {
6158             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6159             r->float_max_offset = data.offset_float_max;
6160             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6161                 r->float_max_offset -= data.lookbehind_float;
6162             SvREFCNT_inc_simple_void_NN(data.longest_float);
6163         }
6164         else {
6165             r->float_substr = r->float_utf8 = NULL;
6166             longest_float_length = 0;
6167         }
6168
6169         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6170
6171         if (S_setup_longest (aTHX_ pRExC_state,
6172                                 data.longest_fixed,
6173                                 &(r->anchored_utf8),
6174                                 &(r->anchored_substr),
6175                                 &(r->anchored_end_shift),
6176                                 data.lookbehind_fixed,
6177                                 data.offset_fixed,
6178                                 data.minlen_fixed,
6179                                 longest_fixed_length,
6180                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6181                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6182         {
6183             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6184             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6185         }
6186         else {
6187             r->anchored_substr = r->anchored_utf8 = NULL;
6188             longest_fixed_length = 0;
6189         }
6190         LEAVE_with_name("study_chunk");
6191
6192         if (ri->regstclass
6193             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6194             ri->regstclass = NULL;
6195
6196         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6197             && stclass_flag
6198             && ! TEST_SSC_EOS(data.start_class)
6199             && !cl_is_anything(data.start_class))
6200         {
6201             const U32 n = add_data(pRExC_state, 1, "f");
6202             OP(data.start_class) = ANYOF_SYNTHETIC;
6203
6204             Newx(RExC_rxi->data->data[n], 1,
6205                 struct regnode_charclass_class);
6206             StructCopy(data.start_class,
6207                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6208                        struct regnode_charclass_class);
6209             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6210             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6211             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6212                       regprop(r, sv, (regnode*)data.start_class);
6213                       PerlIO_printf(Perl_debug_log,
6214                                     "synthetic stclass \"%s\".\n",
6215                                     SvPVX_const(sv));});
6216         }
6217
6218         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6219         if (longest_fixed_length > longest_float_length) {
6220             r->check_end_shift = r->anchored_end_shift;
6221             r->check_substr = r->anchored_substr;
6222             r->check_utf8 = r->anchored_utf8;
6223             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6224             if (r->extflags & RXf_ANCH_SINGLE)
6225                 r->extflags |= RXf_NOSCAN;
6226         }
6227         else {
6228             r->check_end_shift = r->float_end_shift;
6229             r->check_substr = r->float_substr;
6230             r->check_utf8 = r->float_utf8;
6231             r->check_offset_min = r->float_min_offset;
6232             r->check_offset_max = r->float_max_offset;
6233         }
6234         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6235            This should be changed ASAP!  */
6236         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6237             r->extflags |= RXf_USE_INTUIT;
6238             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6239                 r->extflags |= RXf_INTUIT_TAIL;
6240         }
6241         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6242         if ( (STRLEN)minlen < longest_float_length )
6243             minlen= longest_float_length;
6244         if ( (STRLEN)minlen < longest_fixed_length )
6245             minlen= longest_fixed_length;
6246         */
6247     }
6248     else {
6249         /* Several toplevels. Best we can is to set minlen. */
6250         I32 fake;
6251         struct regnode_charclass_class ch_class;
6252         I32 last_close = 0;
6253
6254         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6255
6256         scan = ri->program + 1;
6257         cl_init(pRExC_state, &ch_class);
6258         data.start_class = &ch_class;
6259         data.last_closep = &last_close;
6260
6261
6262         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6263             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6264
6265         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6266
6267         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6268                 = r->float_substr = r->float_utf8 = NULL;
6269
6270         if (! TEST_SSC_EOS(data.start_class)
6271             && !cl_is_anything(data.start_class))
6272         {
6273             const U32 n = add_data(pRExC_state, 1, "f");
6274             OP(data.start_class) = ANYOF_SYNTHETIC;
6275
6276             Newx(RExC_rxi->data->data[n], 1,
6277                 struct regnode_charclass_class);
6278             StructCopy(data.start_class,
6279                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280                        struct regnode_charclass_class);
6281             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6284                       regprop(r, sv, (regnode*)data.start_class);
6285                       PerlIO_printf(Perl_debug_log,
6286                                     "synthetic stclass \"%s\".\n",
6287                                     SvPVX_const(sv));});
6288         }
6289     }
6290
6291     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6292        the "real" pattern. */
6293     DEBUG_OPTIMISE_r({
6294         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6295                       (IV)minlen, (IV)r->minlen);
6296     });
6297     r->minlenret = minlen;
6298     if (r->minlen < minlen)
6299         r->minlen = minlen;
6300
6301     if (RExC_seen & REG_SEEN_GPOS)
6302         r->extflags |= RXf_GPOS_SEEN;
6303     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6304         r->extflags |= RXf_LOOKBEHIND_SEEN;
6305     if (pRExC_state->num_code_blocks)
6306         r->extflags |= RXf_EVAL_SEEN;
6307     if (RExC_seen & REG_SEEN_CANY)
6308         r->extflags |= RXf_CANY_SEEN;
6309     if (RExC_seen & REG_SEEN_VERBARG)
6310     {
6311         r->intflags |= PREGf_VERBARG_SEEN;
6312         r->extflags |= RXf_MODIFIES_VARS;
6313     }
6314     if (RExC_seen & REG_SEEN_CUTGROUP)
6315         r->intflags |= PREGf_CUTGROUP_SEEN;
6316     if (pm_flags & PMf_USE_RE_EVAL)
6317         r->intflags |= PREGf_USE_RE_EVAL;
6318     if (RExC_paren_names)
6319         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6320     else
6321         RXp_PAREN_NAMES(r) = NULL;
6322
6323 #ifdef STUPID_PATTERN_CHECKS
6324     if (RX_PRELEN(rx) == 0)
6325         r->extflags |= RXf_NULL;
6326     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6327         r->extflags |= RXf_WHITE;
6328     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6329         r->extflags |= RXf_START_ONLY;
6330 #else
6331     {
6332         regnode *first = ri->program + 1;
6333         U8 fop = OP(first);
6334
6335         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6336             r->extflags |= RXf_NULL;
6337         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6338             r->extflags |= RXf_START_ONLY;
6339         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6340                              && OP(regnext(first)) == END)
6341             r->extflags |= RXf_WHITE;
6342     }
6343 #endif
6344 #ifdef DEBUGGING
6345     if (RExC_paren_names) {
6346         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6347         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6348     } else
6349 #endif
6350         ri->name_list_idx = 0;
6351
6352     if (RExC_recurse_count) {
6353         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6354             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6355             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6356         }
6357     }
6358     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6359     /* assume we don't need to swap parens around before we match */
6360
6361     DEBUG_DUMP_r({
6362         PerlIO_printf(Perl_debug_log,"Final program:\n");
6363         regdump(r);
6364     });
6365 #ifdef RE_TRACK_PATTERN_OFFSETS
6366     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6367         const U32 len = ri->u.offsets[0];
6368         U32 i;
6369         GET_RE_DEBUG_FLAGS_DECL;
6370         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6371         for (i = 1; i <= len; i++) {
6372             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6373                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6374                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6375             }
6376         PerlIO_printf(Perl_debug_log, "\n");
6377     });
6378 #endif
6379
6380 #ifdef USE_ITHREADS
6381     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6382      * by setting the regexp SV to readonly-only instead. If the
6383      * pattern's been recompiled, the USEDness should remain. */
6384     if (old_re && SvREADONLY(old_re))
6385         SvREADONLY_on(rx);
6386 #endif
6387     return rx;
6388 }
6389
6390
6391 SV*
6392 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6393                     const U32 flags)
6394 {
6395     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6396
6397     PERL_UNUSED_ARG(value);
6398
6399     if (flags & RXapif_FETCH) {
6400         return reg_named_buff_fetch(rx, key, flags);
6401     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6402         Perl_croak_no_modify();
6403         return NULL;
6404     } else if (flags & RXapif_EXISTS) {
6405         return reg_named_buff_exists(rx, key, flags)
6406             ? &PL_sv_yes
6407             : &PL_sv_no;
6408     } else if (flags & RXapif_REGNAMES) {
6409         return reg_named_buff_all(rx, flags);
6410     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6411         return reg_named_buff_scalar(rx, flags);
6412     } else {
6413         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6414         return NULL;
6415     }
6416 }
6417
6418 SV*
6419 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6420                          const U32 flags)
6421 {
6422     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6423     PERL_UNUSED_ARG(lastkey);
6424
6425     if (flags & RXapif_FIRSTKEY)
6426         return reg_named_buff_firstkey(rx, flags);
6427     else if (flags & RXapif_NEXTKEY)
6428         return reg_named_buff_nextkey(rx, flags);
6429     else {
6430         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6431         return NULL;
6432     }
6433 }
6434
6435 SV*
6436 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6437                           const U32 flags)
6438 {
6439     AV *retarray = NULL;
6440     SV *ret;
6441     struct regexp *const rx = ReANY(r);
6442
6443     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6444
6445     if (flags & RXapif_ALL)
6446         retarray=newAV();
6447
6448     if (rx && RXp_PAREN_NAMES(rx)) {
6449         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6450         if (he_str) {
6451             IV i;
6452             SV* sv_dat=HeVAL(he_str);
6453             I32 *nums=(I32*)SvPVX(sv_dat);
6454             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6455                 if ((I32)(rx->nparens) >= nums[i]
6456                     && rx->offs[nums[i]].start != -1
6457                     && rx->offs[nums[i]].end != -1)
6458                 {
6459                     ret = newSVpvs("");
6460                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6461                     if (!retarray)
6462                         return ret;
6463                 } else {
6464                     if (retarray)
6465                         ret = newSVsv(&PL_sv_undef);
6466                 }
6467                 if (retarray)
6468                     av_push(retarray, ret);
6469             }
6470             if (retarray)
6471                 return newRV_noinc(MUTABLE_SV(retarray));
6472         }
6473     }
6474     return NULL;
6475 }
6476
6477 bool
6478 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6479                            const U32 flags)
6480 {
6481     struct regexp *const rx = ReANY(r);
6482
6483     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6484
6485     if (rx && RXp_PAREN_NAMES(rx)) {
6486         if (flags & RXapif_ALL) {
6487             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6488         } else {
6489             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6490             if (sv) {
6491                 SvREFCNT_dec_NN(sv);
6492                 return TRUE;
6493             } else {
6494                 return FALSE;
6495             }
6496         }
6497     } else {
6498         return FALSE;
6499     }
6500 }
6501
6502 SV*
6503 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6504 {
6505     struct regexp *const rx = ReANY(r);
6506
6507     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6508
6509     if ( rx && RXp_PAREN_NAMES(rx) ) {
6510         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6511
6512         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6513     } else {
6514         return FALSE;
6515     }
6516 }
6517
6518 SV*
6519 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6520 {
6521     struct regexp *const rx = ReANY(r);
6522     GET_RE_DEBUG_FLAGS_DECL;
6523
6524     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6525
6526     if (rx && RXp_PAREN_NAMES(rx)) {
6527         HV *hv = RXp_PAREN_NAMES(rx);
6528         HE *temphe;
6529         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6530             IV i;
6531             IV parno = 0;
6532             SV* sv_dat = HeVAL(temphe);
6533             I32 *nums = (I32*)SvPVX(sv_dat);
6534             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6535                 if ((I32)(rx->lastparen) >= nums[i] &&
6536                     rx->offs[nums[i]].start != -1 &&
6537                     rx->offs[nums[i]].end != -1)
6538                 {
6539                     parno = nums[i];
6540                     break;
6541                 }
6542             }
6543             if (parno || flags & RXapif_ALL) {
6544                 return newSVhek(HeKEY_hek(temphe));
6545             }
6546         }
6547     }
6548     return NULL;
6549 }
6550
6551 SV*
6552 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6553 {
6554     SV *ret;
6555     AV *av;
6556     I32 length;
6557     struct regexp *const rx = ReANY(r);
6558
6559     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6560
6561     if (rx && RXp_PAREN_NAMES(rx)) {
6562         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6563             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6564         } else if (flags & RXapif_ONE) {
6565             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6566             av = MUTABLE_AV(SvRV(ret));
6567             length = av_len(av);
6568             SvREFCNT_dec_NN(ret);
6569             return newSViv(length + 1);
6570         } else {
6571             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6572             return NULL;
6573         }
6574     }
6575     return &PL_sv_undef;
6576 }
6577
6578 SV*
6579 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6580 {
6581     struct regexp *const rx = ReANY(r);
6582     AV *av = newAV();
6583
6584     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6585
6586     if (rx && RXp_PAREN_NAMES(rx)) {
6587         HV *hv= RXp_PAREN_NAMES(rx);
6588         HE *temphe;
6589         (void)hv_iterinit(hv);
6590         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6591             IV i;
6592             IV parno = 0;
6593             SV* sv_dat = HeVAL(temphe);
6594             I32 *nums = (I32*)SvPVX(sv_dat);
6595             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6596                 if ((I32)(rx->lastparen) >= nums[i] &&
6597                     rx->offs[nums[i]].start != -1 &&
6598                     rx->offs[nums[i]].end != -1)
6599                 {
6600                     parno = nums[i];
6601                     break;
6602                 }
6603             }
6604             if (parno || flags & RXapif_ALL) {
6605                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6606             }
6607         }
6608     }
6609
6610     return newRV_noinc(MUTABLE_SV(av));
6611 }
6612
6613 void
6614 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6615                              SV * const sv)
6616 {
6617     struct regexp *const rx = ReANY(r);
6618     char *s = NULL;
6619     I32 i = 0;
6620     I32 s1, t1;
6621     I32 n = paren;
6622
6623     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6624
6625     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6626            || n == RX_BUFF_IDX_CARET_FULLMATCH
6627            || n == RX_BUFF_IDX_CARET_POSTMATCH
6628          )
6629          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6630     )
6631         goto ret_undef;
6632
6633     if (!rx->subbeg)
6634         goto ret_undef;
6635
6636     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6637         /* no need to distinguish between them any more */
6638         n = RX_BUFF_IDX_FULLMATCH;
6639
6640     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6641         && rx->offs[0].start != -1)
6642     {
6643         /* $`, ${^PREMATCH} */
6644         i = rx->offs[0].start;
6645         s = rx->subbeg;
6646     }
6647     else
6648     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6649         && rx->offs[0].end != -1)
6650     {
6651         /* $', ${^POSTMATCH} */
6652         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6653         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6654     }
6655     else
6656     if ( 0 <= n && n <= (I32)rx->nparens &&
6657         (s1 = rx->offs[n].start) != -1 &&
6658         (t1 = rx->offs[n].end) != -1)
6659     {
6660         /* $&, ${^MATCH},  $1 ... */
6661         i = t1 - s1;
6662         s = rx->subbeg + s1 - rx->suboffset;
6663     } else {
6664         goto ret_undef;
6665     }
6666
6667     assert(s >= rx->subbeg);
6668     assert(rx->sublen >= (s - rx->subbeg) + i );
6669     if (i >= 0) {
6670 #if NO_TAINT_SUPPORT
6671         sv_setpvn(sv, s, i);
6672 #else
6673         const int oldtainted = TAINT_get;
6674         TAINT_NOT;
6675         sv_setpvn(sv, s, i);
6676         TAINT_set(oldtainted);
6677 #endif
6678         if ( (rx->extflags & RXf_CANY_SEEN)
6679             ? (RXp_MATCH_UTF8(rx)
6680                         && (!i || is_utf8_string((U8*)s, i)))
6681             : (RXp_MATCH_UTF8(rx)) )
6682         {
6683             SvUTF8_on(sv);
6684         }
6685         else
6686             SvUTF8_off(sv);
6687         if (TAINTING_get) {
6688             if (RXp_MATCH_TAINTED(rx)) {
6689                 if (SvTYPE(sv) >= SVt_PVMG) {
6690                     MAGIC* const mg = SvMAGIC(sv);
6691                     MAGIC* mgt;
6692                     TAINT;
6693                     SvMAGIC_set(sv, mg->mg_moremagic);
6694                     SvTAINT(sv);
6695                     if ((mgt = SvMAGIC(sv))) {
6696                         mg->mg_moremagic = mgt;
6697                         SvMAGIC_set(sv, mg);
6698                     }
6699                 } else {
6700                     TAINT;
6701                     SvTAINT(sv);
6702                 }
6703             } else
6704                 SvTAINTED_off(sv);
6705         }
6706     } else {
6707       ret_undef:
6708         sv_setsv(sv,&PL_sv_undef);
6709         return;
6710     }
6711 }
6712
6713 void
6714 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6715                                                          SV const * const value)
6716 {
6717     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6718
6719     PERL_UNUSED_ARG(rx);
6720     PERL_UNUSED_ARG(paren);
6721     PERL_UNUSED_ARG(value);
6722
6723     if (!PL_localizing)
6724         Perl_croak_no_modify();
6725 }
6726
6727 I32
6728 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6729                               const I32 paren)
6730 {
6731     struct regexp *const rx = ReANY(r);
6732     I32 i;
6733     I32 s1, t1;
6734
6735     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6736
6737     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6738     switch (paren) {
6739       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6740          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6741             goto warn_undef;
6742         /*FALLTHROUGH*/
6743
6744       case RX_BUFF_IDX_PREMATCH:       /* $` */
6745         if (rx->offs[0].start != -1) {
6746                         i = rx->offs[0].start;
6747                         if (i > 0) {
6748                                 s1 = 0;
6749                                 t1 = i;
6750                                 goto getlen;
6751                         }
6752             }
6753         return 0;
6754
6755       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6756          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6757             goto warn_undef;
6758       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6759             if (rx->offs[0].end != -1) {
6760                         i = rx->sublen - rx->offs[0].end;
6761                         if (i > 0) {
6762                                 s1 = rx->offs[0].end;
6763                                 t1 = rx->sublen;
6764                                 goto getlen;
6765                         }
6766             }
6767         return 0;
6768
6769       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6770          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6771             goto warn_undef;
6772         /*FALLTHROUGH*/
6773
6774       /* $& / ${^MATCH}, $1, $2, ... */
6775       default:
6776             if (paren <= (I32)rx->nparens &&
6777             (s1 = rx->offs[paren].start) != -1 &&
6778             (t1 = rx->offs[paren].end) != -1)
6779             {
6780             i = t1 - s1;
6781             goto getlen;
6782         } else {
6783           warn_undef:
6784             if (ckWARN(WARN_UNINITIALIZED))
6785                 report_uninit((const SV *)sv);
6786             return 0;
6787         }
6788     }
6789   getlen:
6790     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6791         const char * const s = rx->subbeg - rx->suboffset + s1;
6792         const U8 *ep;
6793         STRLEN el;
6794
6795         i = t1 - s1;
6796         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6797                         i = el;
6798     }
6799     return i;
6800 }
6801
6802 SV*
6803 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6804 {
6805     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6806         PERL_UNUSED_ARG(rx);
6807         if (0)
6808             return NULL;
6809         else
6810             return newSVpvs("Regexp");
6811 }
6812
6813 /* Scans the name of a named buffer from the pattern.
6814  * If flags is REG_RSN_RETURN_NULL returns null.
6815  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6816  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6817  * to the parsed name as looked up in the RExC_paren_names hash.
6818  * If there is an error throws a vFAIL().. type exception.
6819  */
6820
6821 #define REG_RSN_RETURN_NULL    0
6822 #define REG_RSN_RETURN_NAME    1
6823 #define REG_RSN_RETURN_DATA    2
6824
6825 STATIC SV*
6826 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6827 {
6828     char *name_start = RExC_parse;
6829
6830     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6831
6832     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6833          /* skip IDFIRST by using do...while */
6834         if (UTF)
6835             do {
6836                 RExC_parse += UTF8SKIP(RExC_parse);
6837             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6838         else
6839             do {
6840                 RExC_parse++;
6841             } while (isWORDCHAR(*RExC_parse));
6842     } else {
6843         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6844         vFAIL("Group name must start with a non-digit word character");
6845     }
6846     if ( flags ) {
6847         SV* sv_name
6848             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6849                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6850         if ( flags == REG_RSN_RETURN_NAME)
6851             return sv_name;
6852         else if (flags==REG_RSN_RETURN_DATA) {
6853             HE *he_str = NULL;
6854             SV *sv_dat = NULL;
6855             if ( ! sv_name )      /* should not happen*/
6856                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6857             if (RExC_paren_names)
6858                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6859             if ( he_str )
6860                 sv_dat = HeVAL(he_str);
6861             if ( ! sv_dat )
6862                 vFAIL("Reference to nonexistent named group");
6863             return sv_dat;
6864         }
6865         else {
6866             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6867                        (unsigned long) flags);
6868         }
6869         assert(0); /* NOT REACHED */
6870     }
6871     return NULL;
6872 }
6873
6874 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6875     int rem=(int)(RExC_end - RExC_parse);                       \
6876     int cut;                                                    \
6877     int num;                                                    \
6878     int iscut=0;                                                \
6879     if (rem>10) {                                               \
6880         rem=10;                                                 \
6881         iscut=1;                                                \
6882     }                                                           \
6883     cut=10-rem;                                                 \
6884     if (RExC_lastparse!=RExC_parse)                             \
6885         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6886             rem, RExC_parse,                                    \
6887             cut + 4,                                            \
6888             iscut ? "..." : "<"                                 \
6889         );                                                      \
6890     else                                                        \
6891         PerlIO_printf(Perl_debug_log,"%16s","");                \
6892                                                                 \
6893     if (SIZE_ONLY)                                              \
6894        num = RExC_size + 1;                                     \
6895     else                                                        \
6896        num=REG_NODE_NUM(RExC_emit);                             \
6897     if (RExC_lastnum!=num)                                      \
6898        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6899     else                                                        \
6900        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6901     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6902         (int)((depth*2)), "",                                   \
6903         (funcname)                                              \
6904     );                                                          \
6905     RExC_lastnum=num;                                           \
6906     RExC_lastparse=RExC_parse;                                  \
6907 })
6908
6909
6910
6911 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6912     DEBUG_PARSE_MSG((funcname));                            \
6913     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6914 })
6915 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6916     DEBUG_PARSE_MSG((funcname));                            \
6917     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6918 })
6919
6920 /* This section of code defines the inversion list object and its methods.  The
6921  * interfaces are highly subject to change, so as much as possible is static to
6922  * this file.  An inversion list is here implemented as a malloc'd C UV array
6923  * with some added info that is placed as UVs at the beginning in a header
6924  * portion.  An inversion list for Unicode is an array of code points, sorted
6925  * by ordinal number.  The zeroth element is the first code point in the list.
6926  * The 1th element is the first element beyond that not in the list.  In other
6927  * words, the first range is
6928  *  invlist[0]..(invlist[1]-1)
6929  * The other ranges follow.  Thus every element whose index is divisible by two
6930  * marks the beginning of a range that is in the list, and every element not
6931  * divisible by two marks the beginning of a range not in the list.  A single
6932  * element inversion list that contains the single code point N generally
6933  * consists of two elements
6934  *  invlist[0] == N
6935  *  invlist[1] == N+1
6936  * (The exception is when N is the highest representable value on the
6937  * machine, in which case the list containing just it would be a single
6938  * element, itself.  By extension, if the last range in the list extends to
6939  * infinity, then the first element of that range will be in the inversion list
6940  * at a position that is divisible by two, and is the final element in the
6941  * list.)
6942  * Taking the complement (inverting) an inversion list is quite simple, if the
6943  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6944  * This implementation reserves an element at the beginning of each inversion
6945  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6946  * actual beginning of the list is either that element if 0, or the next one if
6947  * 1.
6948  *
6949  * More about inversion lists can be found in "Unicode Demystified"
6950  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6951  * More will be coming when functionality is added later.
6952  *
6953  * The inversion list data structure is currently implemented as an SV pointing
6954  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6955  * array of UV whose memory management is automatically handled by the existing
6956  * facilities for SV's.
6957  *
6958  * Some of the methods should always be private to the implementation, and some
6959  * should eventually be made public */
6960
6961 /* The header definitions are in F<inline_invlist.c> */
6962 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6963 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6964
6965 #define INVLIST_INITIAL_LEN 10
6966
6967 PERL_STATIC_INLINE UV*
6968 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6969 {
6970     /* Returns a pointer to the first element in the inversion list's array.
6971      * This is called upon initialization of an inversion list.  Where the
6972      * array begins depends on whether the list has the code point U+0000
6973      * in it or not.  The other parameter tells it whether the code that
6974      * follows this call is about to put a 0 in the inversion list or not.
6975      * The first element is either the element with 0, if 0, or the next one,
6976      * if 1 */
6977
6978     UV* zero = get_invlist_zero_addr(invlist);
6979
6980     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6981
6982     /* Must be empty */
6983     assert(! *_get_invlist_len_addr(invlist));
6984
6985     /* 1^1 = 0; 1^0 = 1 */
6986     *zero = 1 ^ will_have_0;
6987     return zero + *zero;
6988 }
6989
6990 PERL_STATIC_INLINE UV*
6991 S_invlist_array(pTHX_ SV* const invlist)
6992 {
6993     /* Returns the pointer to the inversion list's array.  Every time the
6994      * length changes, this needs to be called in case malloc or realloc moved
6995      * it */
6996
6997     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6998
6999     /* Must not be empty.  If these fail, you probably didn't check for <len>
7000      * being non-zero before trying to get the array */
7001     assert(*_get_invlist_len_addr(invlist));
7002     assert(*get_invlist_zero_addr(invlist) == 0
7003            || *get_invlist_zero_addr(invlist) == 1);
7004
7005     /* The array begins either at the element reserved for zero if the
7006      * list contains 0 (that element will be set to 0), or otherwise the next
7007      * element (in which case the reserved element will be set to 1). */
7008     return (UV *) (get_invlist_zero_addr(invlist)
7009                    + *get_invlist_zero_addr(invlist));
7010 }
7011
7012 PERL_STATIC_INLINE void
7013 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7014 {
7015     /* Sets the current number of elements stored in the inversion list */
7016
7017     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7018
7019     *_get_invlist_len_addr(invlist) = len;
7020
7021     assert(len <= SvLEN(invlist));
7022
7023     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7024     /* If the list contains U+0000, that element is part of the header,
7025      * and should not be counted as part of the array.  It will contain
7026      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7027      * subtract:
7028      *  SvCUR_set(invlist,
7029      *            TO_INTERNAL_SIZE(len
7030      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7031      * But, this is only valid if len is not 0.  The consequences of not doing
7032      * this is that the memory allocation code may think that 1 more UV is
7033      * being used than actually is, and so might do an unnecessary grow.  That
7034      * seems worth not bothering to make this the precise amount.
7035      *
7036      * Note that when inverting, SvCUR shouldn't change */
7037 }
7038
7039 PERL_STATIC_INLINE IV*
7040 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7041 {
7042     /* Return the address of the UV that is reserved to hold the cached index
7043      * */
7044
7045     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7046
7047     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7048 }
7049
7050 PERL_STATIC_INLINE IV
7051 S_invlist_previous_index(pTHX_ SV* const invlist)
7052 {
7053     /* Returns cached index of previous search */
7054
7055     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7056
7057     return *get_invlist_previous_index_addr(invlist);
7058 }
7059
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7062 {
7063     /* Caches <index> for later retrieval */
7064
7065     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7066
7067     assert(index == 0 || index < (int) _invlist_len(invlist));
7068
7069     *get_invlist_previous_index_addr(invlist) = index;
7070 }
7071
7072 PERL_STATIC_INLINE UV
7073 S_invlist_max(pTHX_ SV* const invlist)
7074 {
7075     /* Returns the maximum number of elements storable in the inversion list's
7076      * array, without having to realloc() */
7077
7078     PERL_ARGS_ASSERT_INVLIST_MAX;
7079
7080     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7081            ? _invlist_len(invlist)
7082            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7083 }
7084
7085 PERL_STATIC_INLINE UV*
7086 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7087 {
7088     /* Return the address of the UV that is reserved to hold 0 if the inversion
7089      * list contains 0.  This has to be the last element of the heading, as the
7090      * list proper starts with either it if 0, or the next element if not.
7091      * (But we force it to contain either 0 or 1) */
7092
7093     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7094
7095     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7096 }
7097
7098 #ifndef PERL_IN_XSUB_RE
7099 SV*
7100 Perl__new_invlist(pTHX_ IV initial_size)
7101 {
7102
7103     /* Return a pointer to a newly constructed inversion list, with enough
7104      * space to store 'initial_size' elements.  If that number is negative, a
7105      * system default is used instead */
7106
7107     SV* new_list;
7108
7109     if (initial_size < 0) {
7110         initial_size = INVLIST_INITIAL_LEN;
7111     }
7112
7113     /* Allocate the initial space */
7114     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7115     invlist_set_len(new_list, 0);
7116
7117     /* Force iterinit() to be used to get iteration to work */
7118     *get_invlist_iter_addr(new_list) = UV_MAX;
7119
7120     /* This should force a segfault if a method doesn't initialize this
7121      * properly */
7122     *get_invlist_zero_addr(new_list) = UV_MAX;
7123
7124     *get_invlist_previous_index_addr(new_list) = 0;
7125     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7126 #if HEADER_LENGTH != 5
7127 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7128 #endif
7129
7130     return new_list;
7131 }
7132 #endif
7133
7134 STATIC SV*
7135 S__new_invlist_C_array(pTHX_ UV* list)
7136 {
7137     /* Return a pointer to a newly constructed inversion list, initialized to
7138      * point to <list>, which has to be in the exact correct inversion list
7139      * form, including internal fields.  Thus this is a dangerous routine that
7140      * should not be used in the wrong hands */
7141
7142     SV* invlist = newSV_type(SVt_PV);
7143
7144     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7145
7146     SvPV_set(invlist, (char *) list);
7147     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7148                                shouldn't touch it */
7149     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7150
7151     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7152         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7153     }
7154
7155     /* Initialize the iteration pointer.
7156      * XXX This could be done at compile time in charclass_invlists.h, but I
7157      * (khw) am not confident that the suffixes for specifying the C constant
7158      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7159      * to use 64 bits; might need a Configure probe */
7160     invlist_iterfinish(invlist);
7161
7162     return invlist;
7163 }
7164
7165 STATIC void
7166 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7167 {
7168     /* Grow the maximum size of an inversion list */
7169
7170     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7171
7172     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7173 }
7174
7175 PERL_STATIC_INLINE void
7176 S_invlist_trim(pTHX_ SV* const invlist)
7177 {
7178     PERL_ARGS_ASSERT_INVLIST_TRIM;
7179
7180     /* Change the length of the inversion list to how many entries it currently
7181      * has */
7182
7183     SvPV_shrink_to_cur((SV *) invlist);
7184 }
7185
7186 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7187
7188 STATIC void
7189 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7190 {
7191    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7192     * the end of the inversion list.  The range must be above any existing
7193     * ones. */
7194
7195     UV* array;
7196     UV max = invlist_max(invlist);
7197     UV len = _invlist_len(invlist);
7198
7199     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7200
7201     if (len == 0) { /* Empty lists must be initialized */
7202         array = _invlist_array_init(invlist, start == 0);
7203     }
7204     else {
7205         /* Here, the existing list is non-empty. The current max entry in the
7206          * list is generally the first value not in the set, except when the
7207          * set extends to the end of permissible values, in which case it is
7208          * the first entry in that final set, and so this call is an attempt to
7209          * append out-of-order */
7210
7211         UV final_element = len - 1;
7212         array = invlist_array(invlist);
7213         if (array[final_element] > start
7214             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7215         {
7216             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7217                        array[final_element], start,
7218                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7219         }
7220
7221         /* Here, it is a legal append.  If the new range begins with the first
7222          * value not in the set, it is extending the set, so the new first
7223          * value not in the set is one greater than the newly extended range.
7224          * */
7225         if (array[final_element] == start) {
7226             if (end != UV_MAX) {
7227                 array[final_element] = end + 1;
7228             }
7229             else {
7230                 /* But if the end is the maximum representable on the machine,
7231                  * just let the range that this would extend to have no end */
7232                 invlist_set_len(invlist, len - 1);
7233             }
7234             return;
7235         }
7236     }
7237
7238     /* Here the new range doesn't extend any existing set.  Add it */
7239
7240     len += 2;   /* Includes an element each for the start and end of range */
7241
7242     /* If overflows the existing space, extend, which may cause the array to be
7243      * moved */
7244     if (max < len) {
7245         invlist_extend(invlist, len);
7246         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7247                                            failure in invlist_array() */
7248         array = invlist_array(invlist);
7249     }
7250     else {
7251         invlist_set_len(invlist, len);
7252     }
7253
7254     /* The next item on the list starts the range, the one after that is
7255      * one past the new range.  */
7256     array[len - 2] = start;
7257     if (end != UV_MAX) {
7258         array[len - 1] = end + 1;
7259     }
7260     else {
7261         /* But if the end is the maximum representable on the machine, just let
7262          * the range have no end */
7263         invlist_set_len(invlist, len - 1);
7264     }
7265 }
7266
7267 #ifndef PERL_IN_XSUB_RE
7268
7269 IV
7270 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7271 {
7272     /* Searches the inversion list for the entry that contains the input code
7273      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7274      * return value is the index into the list's array of the range that
7275      * contains <cp> */
7276
7277     IV low = 0;
7278     IV mid;
7279     IV high = _invlist_len(invlist);
7280     const IV highest_element = high - 1;
7281     const UV* array;
7282
7283     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7284
7285     /* If list is empty, return failure. */
7286     if (high == 0) {
7287         return -1;
7288     }
7289
7290     /* (We can't get the array unless we know the list is non-empty) */
7291     array = invlist_array(invlist);
7292
7293     mid = invlist_previous_index(invlist);
7294     assert(mid >=0 && mid <= highest_element);
7295
7296     /* <mid> contains the cache of the result of the previous call to this
7297      * function (0 the first time).  See if this call is for the same result,
7298      * or if it is for mid-1.  This is under the theory that calls to this
7299      * function will often be for related code points that are near each other.
7300      * And benchmarks show that caching gives better results.  We also test
7301      * here if the code point is within the bounds of the list.  These tests
7302      * replace others that would have had to be made anyway to make sure that
7303      * the array bounds were not exceeded, and these give us extra information
7304      * at the same time */
7305     if (cp >= array[mid]) {
7306         if (cp >= array[highest_element]) {
7307             return highest_element;
7308         }
7309
7310         /* Here, array[mid] <= cp < array[highest_element].  This means that
7311          * the final element is not the answer, so can exclude it; it also
7312          * means that <mid> is not the final element, so can refer to 'mid + 1'
7313          * safely */
7314         if (cp < array[mid + 1]) {
7315             return mid;
7316         }
7317         high--;
7318         low = mid + 1;
7319     }
7320     else { /* cp < aray[mid] */
7321         if (cp < array[0]) { /* Fail if outside the array */
7322             return -1;
7323         }
7324         high = mid;
7325         if (cp >= array[mid - 1]) {
7326             goto found_entry;
7327         }
7328     }
7329
7330     /* Binary search.  What we are looking for is <i> such that
7331      *  array[i] <= cp < array[i+1]
7332      * The loop below converges on the i+1.  Note that there may not be an
7333      * (i+1)th element in the array, and things work nonetheless */
7334     while (low < high) {
7335         mid = (low + high) / 2;
7336         assert(mid <= highest_element);
7337         if (array[mid] <= cp) { /* cp >= array[mid] */
7338             low = mid + 1;
7339
7340             /* We could do this extra test to exit the loop early.
7341             if (cp < array[low]) {
7342                 return mid;
7343             }
7344             */
7345         }
7346         else { /* cp < array[mid] */
7347             high = mid;
7348         }
7349     }
7350
7351   found_entry:
7352     high--;
7353     invlist_set_previous_index(invlist, high);
7354     return high;
7355 }
7356
7357 void
7358 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7359 {
7360     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7361      * but is used when the swash has an inversion list.  This makes this much
7362      * faster, as it uses a binary search instead of a linear one.  This is
7363      * intimately tied to that function, and perhaps should be in utf8.c,
7364      * except it is intimately tied to inversion lists as well.  It assumes
7365      * that <swatch> is all 0's on input */
7366
7367     UV current = start;
7368     const IV len = _invlist_len(invlist);
7369     IV i;
7370     const UV * array;
7371
7372     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7373
7374     if (len == 0) { /* Empty inversion list */
7375         return;
7376     }
7377
7378     array = invlist_array(invlist);
7379
7380     /* Find which element it is */
7381     i = _invlist_search(invlist, start);
7382
7383     /* We populate from <start> to <end> */
7384     while (current < end) {
7385         UV upper;
7386
7387         /* The inversion list gives the results for every possible code point
7388          * after the first one in the list.  Only those ranges whose index is
7389          * even are ones that the inversion list matches.  For the odd ones,
7390          * and if the initial code point is not in the list, we have to skip
7391          * forward to the next element */
7392         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7393             i++;
7394             if (i >= len) { /* Finished if beyond the end of the array */
7395                 return;
7396             }
7397             current = array[i];
7398             if (current >= end) {   /* Finished if beyond the end of what we
7399                                        are populating */
7400                 if (LIKELY(end < UV_MAX)) {
7401                     return;
7402                 }
7403
7404                 /* We get here when the upper bound is the maximum
7405                  * representable on the machine, and we are looking for just
7406                  * that code point.  Have to special case it */
7407                 i = len;
7408                 goto join_end_of_list;
7409             }
7410         }
7411         assert(current >= start);
7412
7413         /* The current range ends one below the next one, except don't go past
7414          * <end> */
7415         i++;
7416         upper = (i < len && array[i] < end) ? array[i] : end;
7417
7418         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7419          * for each code point in it */
7420         for (; current < upper; current++) {
7421             const STRLEN offset = (STRLEN)(current - start);
7422             swatch[offset >> 3] |= 1 << (offset & 7);
7423         }
7424
7425     join_end_of_list:
7426
7427         /* Quit if at the end of the list */
7428         if (i >= len) {
7429
7430             /* But first, have to deal with the highest possible code point on
7431              * the platform.  The previous code assumes that <end> is one
7432              * beyond where we want to populate, but that is impossible at the
7433              * platform's infinity, so have to handle it specially */
7434             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7435             {
7436                 const STRLEN offset = (STRLEN)(end - start);
7437                 swatch[offset >> 3] |= 1 << (offset & 7);
7438             }
7439             return;
7440         }
7441
7442         /* Advance to the next range, which will be for code points not in the
7443          * inversion list */
7444         current = array[i];
7445     }
7446
7447     return;
7448 }
7449
7450 void
7451 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7452 {
7453     /* Take the union of two inversion lists and point <output> to it.  *output
7454      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7455      * the reference count to that list will be decremented.  The first list,
7456      * <a>, may be NULL, in which case a copy of the second list is returned.
7457      * If <complement_b> is TRUE, the union is taken of the complement
7458      * (inversion) of <b> instead of b itself.
7459      *
7460      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7461      * Richard Gillam, published by Addison-Wesley, and explained at some
7462      * length there.  The preface says to incorporate its examples into your
7463      * code at your own risk.
7464      *
7465      * The algorithm is like a merge sort.
7466      *
7467      * XXX A potential performance improvement is to keep track as we go along
7468      * if only one of the inputs contributes to the result, meaning the other
7469      * is a subset of that one.  In that case, we can skip the final copy and
7470      * return the larger of the input lists, but then outside code might need
7471      * to keep track of whether to free the input list or not */
7472
7473     UV* array_a;    /* a's array */
7474     UV* array_b;
7475     UV len_a;       /* length of a's array */
7476     UV len_b;
7477
7478     SV* u;                      /* the resulting union */
7479     UV* array_u;
7480     UV len_u;
7481
7482     UV i_a = 0;             /* current index into a's array */
7483     UV i_b = 0;
7484     UV i_u = 0;
7485
7486     /* running count, as explained in the algorithm source book; items are
7487      * stopped accumulating and are output when the count changes to/from 0.
7488      * The count is incremented when we start a range that's in the set, and
7489      * decremented when we start a range that's not in the set.  So its range
7490      * is 0 to 2.  Only when the count is zero is something not in the set.
7491      */
7492     UV count = 0;
7493
7494     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7495     assert(a != b);
7496
7497     /* If either one is empty, the union is the other one */
7498     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7499         if (*output == a) {
7500             if (a != NULL) {
7501                 SvREFCNT_dec_NN(a);
7502             }
7503         }
7504         if (*output != b) {
7505             *output = invlist_clone(b);
7506             if (complement_b) {
7507                 _invlist_invert(*output);
7508             }
7509         } /* else *output already = b; */
7510         return;
7511     }
7512     else if ((len_b = _invlist_len(b)) == 0) {
7513         if (*output == b) {
7514             SvREFCNT_dec_NN(b);
7515         }
7516
7517         /* The complement of an empty list is a list that has everything in it,
7518          * so the union with <a> includes everything too */
7519         if (complement_b) {
7520             if (a == *output) {
7521                 SvREFCNT_dec_NN(a);
7522             }
7523             *output = _new_invlist(1);
7524             _append_range_to_invlist(*output, 0, UV_MAX);
7525         }
7526         else if (*output != a) {
7527             *output = invlist_clone(a);
7528         }
7529         /* else *output already = a; */
7530         return;
7531     }
7532
7533     /* Here both lists exist and are non-empty */
7534     array_a = invlist_array(a);
7535     array_b = invlist_array(b);
7536
7537     /* If are to take the union of 'a' with the complement of b, set it
7538      * up so are looking at b's complement. */
7539     if (complement_b) {
7540
7541         /* To complement, we invert: if the first element is 0, remove it.  To
7542          * do this, we just pretend the array starts one later, and clear the
7543          * flag as we don't have to do anything else later */
7544         if (array_b[0] == 0) {
7545             array_b++;
7546             len_b--;
7547             complement_b = FALSE;
7548         }
7549         else {
7550
7551             /* But if the first element is not zero, we unshift a 0 before the
7552              * array.  The data structure reserves a space for that 0 (which
7553              * should be a '1' right now), so physical shifting is unneeded,
7554              * but temporarily change that element to 0.  Before exiting the
7555              * routine, we must restore the element to '1' */
7556             array_b--;
7557             len_b++;
7558             array_b[0] = 0;
7559         }
7560     }
7561
7562     /* Size the union for the worst case: that the sets are completely
7563      * disjoint */
7564     u = _new_invlist(len_a + len_b);
7565
7566     /* Will contain U+0000 if either component does */
7567     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7568                                       || (len_b > 0 && array_b[0] == 0));
7569
7570     /* Go through each list item by item, stopping when exhausted one of
7571      * them */
7572     while (i_a < len_a && i_b < len_b) {
7573         UV cp;      /* The element to potentially add to the union's array */
7574         bool cp_in_set;   /* is it in the the input list's set or not */
7575
7576         /* We need to take one or the other of the two inputs for the union.
7577          * Since we are merging two sorted lists, we take the smaller of the
7578          * next items.  In case of a tie, we take the one that is in its set
7579          * first.  If we took one not in the set first, it would decrement the
7580          * count, possibly to 0 which would cause it to be output as ending the
7581          * range, and the next time through we would take the same number, and
7582          * output it again as beginning the next range.  By doing it the
7583          * opposite way, there is no possibility that the count will be
7584          * momentarily decremented to 0, and thus the two adjoining ranges will
7585          * be seamlessly merged.  (In a tie and both are in the set or both not
7586          * in the set, it doesn't matter which we take first.) */
7587         if (array_a[i_a] < array_b[i_b]
7588             || (array_a[i_a] == array_b[i_b]
7589                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7590         {
7591             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7592             cp= array_a[i_a++];
7593         }
7594         else {
7595             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7596             cp = array_b[i_b++];
7597         }
7598
7599         /* Here, have chosen which of the two inputs to look at.  Only output
7600          * if the running count changes to/from 0, which marks the
7601          * beginning/end of a range in that's in the set */
7602         if (cp_in_set) {
7603             if (count == 0) {
7604                 array_u[i_u++] = cp;
7605             }
7606             count++;
7607         }
7608         else {
7609             count--;
7610             if (count == 0) {
7611                 array_u[i_u++] = cp;
7612             }
7613         }
7614     }
7615
7616     /* Here, we are finished going through at least one of the lists, which
7617      * means there is something remaining in at most one.  We check if the list
7618      * that hasn't been exhausted is positioned such that we are in the middle
7619      * of a range in its set or not.  (i_a and i_b point to the element beyond
7620      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7621      * is potentially more to output.
7622      * There are four cases:
7623      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7624      *     in the union is entirely from the non-exhausted set.
7625      *  2) Both were in their sets, count is 2.  Nothing further should
7626      *     be output, as everything that remains will be in the exhausted
7627      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7628      *     that
7629      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7630      *     Nothing further should be output because the union includes
7631      *     everything from the exhausted set.  Not decrementing ensures that.
7632      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7633      *     decrementing to 0 insures that we look at the remainder of the
7634      *     non-exhausted set */
7635     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7636         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7637     {
7638         count--;
7639     }
7640
7641     /* The final length is what we've output so far, plus what else is about to
7642      * be output.  (If 'count' is non-zero, then the input list we exhausted
7643      * has everything remaining up to the machine's limit in its set, and hence
7644      * in the union, so there will be no further output. */
7645     len_u = i_u;
7646     if (count == 0) {
7647         /* At most one of the subexpressions will be non-zero */
7648         len_u += (len_a - i_a) + (len_b - i_b);
7649     }
7650
7651     /* Set result to final length, which can change the pointer to array_u, so
7652      * re-find it */
7653     if (len_u != _invlist_len(u)) {
7654         invlist_set_len(u, len_u);
7655         invlist_trim(u);
7656         array_u = invlist_array(u);
7657     }
7658
7659     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7660      * the other) ended with everything above it not in its set.  That means
7661      * that the remaining part of the union is precisely the same as the
7662      * non-exhausted list, so can just copy it unchanged.  (If both list were
7663      * exhausted at the same time, then the operations below will be both 0.)
7664      */
7665     if (count == 0) {
7666         IV copy_count; /* At most one will have a non-zero copy count */
7667         if ((copy_count = len_a - i_a) > 0) {
7668             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7669         }
7670         else if ((copy_count = len_b - i_b) > 0) {
7671             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7672         }
7673     }
7674
7675     /* If we've changed b, restore it */
7676     if (complement_b) {
7677         array_b[0] = 1;
7678     }
7679
7680     /*  We may be removing a reference to one of the inputs */
7681     if (a == *output || b == *output) {
7682         assert(! invlist_is_iterating(*output));
7683         SvREFCNT_dec_NN(*output);
7684     }
7685
7686     *output = u;
7687     return;
7688 }
7689
7690 void
7691 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7692 {
7693     /* Take the intersection of two inversion lists and point <i> to it.  *i
7694      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7695      * the reference count to that list will be decremented.
7696      * If <complement_b> is TRUE, the result will be the intersection of <a>
7697      * and the complement (or inversion) of <b> instead of <b> directly.
7698      *
7699      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7700      * Richard Gillam, published by Addison-Wesley, and explained at some
7701      * length there.  The preface says to incorporate its examples into your
7702      * code at your own risk.  In fact, it had bugs
7703      *
7704      * The algorithm is like a merge sort, and is essentially the same as the
7705      * union above
7706      */
7707
7708     UV* array_a;                /* a's array */
7709     UV* array_b;
7710     UV len_a;   /* length of a's array */
7711     UV len_b;
7712
7713     SV* r;                   /* the resulting intersection */
7714     UV* array_r;
7715     UV len_r;
7716
7717     UV i_a = 0;             /* current index into a's array */
7718     UV i_b = 0;
7719     UV i_r = 0;
7720
7721     /* running count, as explained in the algorithm source book; items are
7722      * stopped accumulating and are output when the count changes to/from 2.
7723      * The count is incremented when we start a range that's in the set, and
7724      * decremented when we start a range that's not in the set.  So its range
7725      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7726      */
7727     UV count = 0;
7728
7729     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7730     assert(a != b);
7731
7732     /* Special case if either one is empty */
7733     len_a = _invlist_len(a);
7734     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7735
7736         if (len_a != 0 && complement_b) {
7737
7738             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7739              * be empty.  Here, also we are using 'b's complement, which hence
7740              * must be every possible code point.  Thus the intersection is
7741              * simply 'a'. */
7742             if (*i != a) {
7743                 *i = invlist_clone(a);
7744
7745                 if (*i == b) {
7746                     SvREFCNT_dec_NN(b);
7747                 }
7748             }
7749             /* else *i is already 'a' */
7750             return;
7751         }
7752
7753         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7754          * intersection must be empty */
7755         if (*i == a) {
7756             SvREFCNT_dec_NN(a);
7757         }
7758         else if (*i == b) {
7759             SvREFCNT_dec_NN(b);
7760         }
7761         *i = _new_invlist(0);
7762         return;
7763     }
7764
7765     /* Here both lists exist and are non-empty */
7766     array_a = invlist_array(a);
7767     array_b = invlist_array(b);
7768
7769     /* If are to take the intersection of 'a' with the complement of b, set it
7770      * up so are looking at b's complement. */
7771     if (complement_b) {
7772
7773         /* To complement, we invert: if the first element is 0, remove it.  To
7774          * do this, we just pretend the array starts one later, and clear the
7775          * flag as we don't have to do anything else later */
7776         if (array_b[0] == 0) {
7777             array_b++;
7778             len_b--;
7779             complement_b = FALSE;
7780         }
7781         else {
7782
7783             /* But if the first element is not zero, we unshift a 0 before the
7784              * array.  The data structure reserves a space for that 0 (which
7785              * should be a '1' right now), so physical shifting is unneeded,
7786              * but temporarily change that element to 0.  Before exiting the
7787              * routine, we must restore the element to '1' */
7788             array_b--;
7789             len_b++;
7790             array_b[0] = 0;
7791         }
7792     }
7793
7794     /* Size the intersection for the worst case: that the intersection ends up
7795      * fragmenting everything to be completely disjoint */
7796     r= _new_invlist(len_a + len_b);
7797
7798     /* Will contain U+0000 iff both components do */
7799     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7800                                      && len_b > 0 && array_b[0] == 0);
7801
7802     /* Go through each list item by item, stopping when exhausted one of
7803      * them */
7804     while (i_a < len_a && i_b < len_b) {
7805         UV cp;      /* The element to potentially add to the intersection's
7806                        array */
7807         bool cp_in_set; /* Is it in the input list's set or not */
7808
7809         /* We need to take one or the other of the two inputs for the
7810          * intersection.  Since we are merging two sorted lists, we take the
7811          * smaller of the next items.  In case of a tie, we take the one that
7812          * is not in its set first (a difference from the union algorithm).  If
7813          * we took one in the set first, it would increment the count, possibly
7814          * to 2 which would cause it to be output as starting a range in the
7815          * intersection, and the next time through we would take that same
7816          * number, and output it again as ending the set.  By doing it the
7817          * opposite of this, there is no possibility that the count will be
7818          * momentarily incremented to 2.  (In a tie and both are in the set or
7819          * both not in the set, it doesn't matter which we take first.) */
7820         if (array_a[i_a] < array_b[i_b]
7821             || (array_a[i_a] == array_b[i_b]
7822                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7823         {
7824             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7825             cp= array_a[i_a++];
7826         }
7827         else {
7828             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7829             cp= array_b[i_b++];
7830         }
7831
7832         /* Here, have chosen which of the two inputs to look at.  Only output
7833          * if the running count changes to/from 2, which marks the
7834          * beginning/end of a range that's in the intersection */
7835         if (cp_in_set) {
7836             count++;
7837             if (count == 2) {
7838                 array_r[i_r++] = cp;
7839             }
7840         }
7841         else {
7842             if (count == 2) {
7843                 array_r[i_r++] = cp;
7844             }
7845             count--;
7846         }
7847     }
7848
7849     /* Here, we are finished going through at least one of the lists, which
7850      * means there is something remaining in at most one.  We check if the list
7851      * that has been exhausted is positioned such that we are in the middle
7852      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7853      * the ones we care about.)  There are four cases:
7854      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7855      *     nothing left in the intersection.
7856      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7857      *     above 2.  What should be output is exactly that which is in the
7858      *     non-exhausted set, as everything it has is also in the intersection
7859      *     set, and everything it doesn't have can't be in the intersection
7860      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7861      *     gets incremented to 2.  Like the previous case, the intersection is
7862      *     everything that remains in the non-exhausted set.
7863      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7864      *     remains 1.  And the intersection has nothing more. */
7865     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7866         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7867     {
7868         count++;
7869     }
7870
7871     /* The final length is what we've output so far plus what else is in the
7872      * intersection.  At most one of the subexpressions below will be non-zero */
7873     len_r = i_r;
7874     if (count >= 2) {
7875         len_r += (len_a - i_a) + (len_b - i_b);
7876     }
7877
7878     /* Set result to final length, which can change the pointer to array_r, so
7879      * re-find it */
7880     if (len_r != _invlist_len(r)) {
7881         invlist_set_len(r, len_r);
7882         invlist_trim(r);
7883         array_r = invlist_array(r);
7884     }
7885
7886     /* Finish outputting any remaining */
7887     if (count >= 2) { /* At most one will have a non-zero copy count */
7888         IV copy_count;
7889         if ((copy_count = len_a - i_a) > 0) {
7890             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7891         }
7892         else if ((copy_count = len_b - i_b) > 0) {
7893             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7894         }
7895     }
7896
7897     /* If we've changed b, restore it */
7898     if (complement_b) {
7899         array_b[0] = 1;
7900     }
7901
7902     /*  We may be removing a reference to one of the inputs */
7903     if (a == *i || b == *i) {
7904         assert(! invlist_is_iterating(*i));
7905         SvREFCNT_dec_NN(*i);
7906     }
7907
7908     *i = r;
7909     return;
7910 }
7911
7912 SV*
7913 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7914 {
7915     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7916      * set.  A pointer to the inversion list is returned.  This may actually be
7917      * a new list, in which case the passed in one has been destroyed.  The
7918      * passed in inversion list can be NULL, in which case a new one is created
7919      * with just the one range in it */
7920
7921     SV* range_invlist;
7922     UV len;
7923
7924     if (invlist == NULL) {
7925         invlist = _new_invlist(2);
7926         len = 0;
7927     }
7928     else {
7929         len = _invlist_len(invlist);
7930     }
7931
7932     /* If comes after the final entry actually in the list, can just append it
7933      * to the end, */
7934     if (len == 0
7935         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7936             && start >= invlist_array(invlist)[len - 1]))
7937     {
7938         _append_range_to_invlist(invlist, start, end);
7939         return invlist;
7940     }
7941
7942     /* Here, can't just append things, create and return a new inversion list
7943      * which is the union of this range and the existing inversion list */
7944     range_invlist = _new_invlist(2);
7945     _append_range_to_invlist(range_invlist, start, end);
7946
7947     _invlist_union(invlist, range_invlist, &invlist);
7948
7949     /* The temporary can be freed */
7950     SvREFCNT_dec_NN(range_invlist);
7951
7952     return invlist;
7953 }
7954
7955 #endif
7956
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959     return _add_range_to_invlist(invlist, cp, cp);
7960 }
7961
7962 #ifndef PERL_IN_XSUB_RE
7963 void
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7965 {
7966     /* Complement the input inversion list.  This adds a 0 if the list didn't
7967      * have a zero; removes it otherwise.  As described above, the data
7968      * structure is set up so that this is very efficient */
7969
7970     UV* len_pos = _get_invlist_len_addr(invlist);
7971
7972     PERL_ARGS_ASSERT__INVLIST_INVERT;
7973
7974     assert(! invlist_is_iterating(invlist));
7975
7976     /* The inverse of matching nothing is matching everything */
7977     if (*len_pos == 0) {
7978         _append_range_to_invlist(invlist, 0, UV_MAX);
7979         return;
7980     }
7981
7982     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7983      * zero element was a 0, so it is being removed, so the length decrements
7984      * by 1; and vice-versa.  SvCUR is unaffected */
7985     if (*get_invlist_zero_addr(invlist) ^= 1) {
7986         (*len_pos)--;
7987     }
7988     else {
7989         (*len_pos)++;
7990     }
7991 }
7992
7993 void
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7995 {
7996     /* Complement the input inversion list (which must be a Unicode property,
7997      * all of which don't match above the Unicode maximum code point.)  And
7998      * Perl has chosen to not have the inversion match above that either.  This
7999      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8000      */
8001
8002     UV len;
8003     UV* array;
8004
8005     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8006
8007     _invlist_invert(invlist);
8008
8009     len = _invlist_len(invlist);
8010
8011     if (len != 0) { /* If empty do nothing */
8012         array = invlist_array(invlist);
8013         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014             /* Add 0x110000.  First, grow if necessary */
8015             len++;
8016             if (invlist_max(invlist) < len) {
8017                 invlist_extend(invlist, len);
8018                 array = invlist_array(invlist);
8019             }
8020             invlist_set_len(invlist, len);
8021             array[len - 1] = PERL_UNICODE_MAX + 1;
8022         }
8023         else {  /* Remove the 0x110000 */
8024             invlist_set_len(invlist, len - 1);
8025         }
8026     }
8027
8028     return;
8029 }
8030 #endif
8031
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8034 {
8035
8036     /* Return a new inversion list that is a copy of the input one, which is
8037      * unchanged */
8038
8039     /* Need to allocate extra space to accommodate Perl's addition of a
8040      * trailing NUL to SvPV's, since it thinks they are always strings */
8041     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8042     STRLEN length = SvCUR(invlist);
8043
8044     PERL_ARGS_ASSERT_INVLIST_CLONE;
8045
8046     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8048
8049     return new_invlist;
8050 }
8051
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8054 {
8055     /* Return the address of the UV that contains the current iteration
8056      * position */
8057
8058     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8059
8060     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8061 }
8062
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8065 {
8066     /* Return the address of the UV that contains the version id. */
8067
8068     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8069
8070     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8071 }
8072
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8075 {
8076     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8077
8078     *get_invlist_iter_addr(invlist) = 0;
8079 }
8080
8081 PERL_STATIC_INLINE void
8082 S_invlist_iterfinish(pTHX_ SV* invlist)
8083 {
8084     /* Terminate iterator for invlist.  This is to catch development errors.
8085      * Any iteration that is interrupted before completed should call this
8086      * function.  Functions that add code points anywhere else but to the end
8087      * of an inversion list assert that they are not in the middle of an
8088      * iteration.  If they were, the addition would make the iteration
8089      * problematical: if the iteration hadn't reached the place where things
8090      * were being added, it would be ok */
8091
8092     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8093
8094     *get_invlist_iter_addr(invlist) = UV_MAX;
8095 }
8096
8097 STATIC bool
8098 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8099 {
8100     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8101      * This call sets in <*start> and <*end>, the next range in <invlist>.
8102      * Returns <TRUE> if successful and the next call will return the next
8103      * range; <FALSE> if was already at the end of the list.  If the latter,
8104      * <*start> and <*end> are unchanged, and the next call to this function
8105      * will start over at the beginning of the list */
8106
8107     UV* pos = get_invlist_iter_addr(invlist);
8108     UV len = _invlist_len(invlist);
8109     UV *array;
8110
8111     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8112
8113     if (*pos >= len) {
8114         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8115         return FALSE;
8116     }
8117
8118     array = invlist_array(invlist);
8119
8120     *start = array[(*pos)++];
8121
8122     if (*pos >= len) {
8123         *end = UV_MAX;
8124     }
8125     else {
8126         *end = array[(*pos)++] - 1;
8127     }
8128
8129     return TRUE;
8130 }
8131
8132 PERL_STATIC_INLINE bool
8133 S_invlist_is_iterating(pTHX_ SV* const invlist)
8134 {
8135     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8136
8137     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8138 }
8139
8140 PERL_STATIC_INLINE UV
8141 S_invlist_highest(pTHX_ SV* const invlist)
8142 {
8143     /* Returns the highest code point that matches an inversion list.  This API
8144      * has an ambiguity, as it returns 0 under either the highest is actually
8145      * 0, or if the list is empty.  If this distinction matters to you, check
8146      * for emptiness before calling this function */
8147
8148     UV len = _invlist_len(invlist);
8149     UV *array;
8150
8151     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8152
8153     if (len == 0) {
8154         return 0;
8155     }
8156
8157     array = invlist_array(invlist);
8158
8159     /* The last element in the array in the inversion list always starts a
8160      * range that goes to infinity.  That range may be for code points that are
8161      * matched in the inversion list, or it may be for ones that aren't
8162      * matched.  In the latter case, the highest code point in the set is one
8163      * less than the beginning of this range; otherwise it is the final element
8164      * of this range: infinity */
8165     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8166            ? UV_MAX
8167            : array[len - 1] - 1;
8168 }
8169
8170 #ifndef PERL_IN_XSUB_RE
8171 SV *
8172 Perl__invlist_contents(pTHX_ SV* const invlist)
8173 {
8174     /* Get the contents of an inversion list into a string SV so that they can
8175      * be printed out.  It uses the format traditionally done for debug tracing
8176      */
8177
8178     UV start, end;
8179     SV* output = newSVpvs("\n");
8180
8181     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8182
8183     assert(! invlist_is_iterating(invlist));
8184
8185     invlist_iterinit(invlist);
8186     while (invlist_iternext(invlist, &start, &end)) {
8187         if (end == UV_MAX) {
8188             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8189         }
8190         else if (end != start) {
8191             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8192                     start,       end);
8193         }
8194         else {
8195             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8196         }
8197     }
8198
8199     return output;
8200 }
8201 #endif
8202
8203 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8204 void
8205 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8206 {
8207     /* Dumps out the ranges in an inversion list.  The string 'header'
8208      * if present is output on a line before the first range */
8209
8210     UV start, end;
8211
8212     PERL_ARGS_ASSERT__INVLIST_DUMP;
8213
8214     if (header && strlen(header)) {
8215         PerlIO_printf(Perl_debug_log, "%s\n", header);
8216     }
8217     if (invlist_is_iterating(invlist)) {
8218         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8219         return;
8220     }
8221
8222     invlist_iterinit(invlist);
8223     while (invlist_iternext(invlist, &start, &end)) {
8224         if (end == UV_MAX) {
8225             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8226         }
8227         else if (end != start) {
8228             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8229                                                  start,         end);
8230         }
8231         else {
8232             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8233         }
8234     }
8235 }
8236 #endif
8237
8238 #if 0
8239 bool
8240 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8241 {
8242     /* Return a boolean as to if the two passed in inversion lists are
8243      * identical.  The final argument, if TRUE, says to take the complement of
8244      * the second inversion list before doing the comparison */
8245
8246     UV* array_a = invlist_array(a);
8247     UV* array_b = invlist_array(b);
8248     UV len_a = _invlist_len(a);
8249     UV len_b = _invlist_len(b);
8250
8251     UV i = 0;               /* current index into the arrays */
8252     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8253
8254     PERL_ARGS_ASSERT__INVLISTEQ;
8255
8256     /* If are to compare 'a' with the complement of b, set it
8257      * up so are looking at b's complement. */
8258     if (complement_b) {
8259
8260         /* The complement of nothing is everything, so <a> would have to have
8261          * just one element, starting at zero (ending at infinity) */
8262         if (len_b == 0) {
8263             return (len_a == 1 && array_a[0] == 0);
8264         }
8265         else if (array_b[0] == 0) {
8266
8267             /* Otherwise, to complement, we invert.  Here, the first element is
8268              * 0, just remove it.  To do this, we just pretend the array starts
8269              * one later, and clear the flag as we don't have to do anything
8270              * else later */
8271
8272             array_b++;
8273             len_b--;
8274             complement_b = FALSE;
8275         }
8276         else {
8277
8278             /* But if the first element is not zero, we unshift a 0 before the
8279              * array.  The data structure reserves a space for that 0 (which
8280              * should be a '1' right now), so physical shifting is unneeded,
8281              * but temporarily change that element to 0.  Before exiting the
8282              * routine, we must restore the element to '1' */
8283             array_b--;
8284             len_b++;
8285             array_b[0] = 0;
8286         }
8287     }
8288
8289     /* Make sure that the lengths are the same, as well as the final element
8290      * before looping through the remainder.  (Thus we test the length, final,
8291      * and first elements right off the bat) */
8292     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8293         retval = FALSE;
8294     }
8295     else for (i = 0; i < len_a - 1; i++) {
8296         if (array_a[i] != array_b[i]) {
8297             retval = FALSE;
8298             break;
8299         }
8300     }
8301
8302     if (complement_b) {
8303         array_b[0] = 1;
8304     }
8305     return retval;
8306 }
8307 #endif
8308
8309 #undef HEADER_LENGTH
8310 #undef INVLIST_INITIAL_LENGTH
8311 #undef TO_INTERNAL_SIZE
8312 #undef FROM_INTERNAL_SIZE
8313 #undef INVLIST_LEN_OFFSET
8314 #undef INVLIST_ZERO_OFFSET
8315 #undef INVLIST_ITER_OFFSET
8316 #undef INVLIST_VERSION_ID
8317 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8318
8319 /* End of inversion list object */
8320
8321 STATIC void
8322 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8323 {
8324     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8325      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8326      * should point to the first flag; it is updated on output to point to the
8327      * final ')' or ':'.  There needs to be at least one flag, or this will
8328      * abort */
8329
8330     /* for (?g), (?gc), and (?o) warnings; warning
8331        about (?c) will warn about (?g) -- japhy    */
8332
8333 #define WASTED_O  0x01
8334 #define WASTED_G  0x02
8335 #define WASTED_C  0x04
8336 #define WASTED_GC (0x02|0x04)
8337     I32 wastedflags = 0x00;
8338     U32 posflags = 0, negflags = 0;
8339     U32 *flagsp = &posflags;
8340     char has_charset_modifier = '\0';
8341     regex_charset cs;
8342     bool has_use_defaults = FALSE;
8343     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8344
8345     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8346
8347     /* '^' as an initial flag sets certain defaults */
8348     if (UCHARAT(RExC_parse) == '^') {
8349         RExC_parse++;
8350         has_use_defaults = TRUE;
8351         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8352         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8353                                         ? REGEX_UNICODE_CHARSET
8354                                         : REGEX_DEPENDS_CHARSET);
8355     }
8356
8357     cs = get_regex_charset(RExC_flags);
8358     if (cs == REGEX_DEPENDS_CHARSET
8359         && (RExC_utf8 || RExC_uni_semantics))
8360     {
8361         cs = REGEX_UNICODE_CHARSET;
8362     }
8363
8364     while (*RExC_parse) {
8365         /* && strchr("iogcmsx", *RExC_parse) */
8366         /* (?g), (?gc) and (?o) are useless here
8367            and must be globally applied -- japhy */
8368         switch (*RExC_parse) {
8369
8370             /* Code for the imsx flags */
8371             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8372
8373             case LOCALE_PAT_MOD:
8374                 if (has_charset_modifier) {
8375                     goto excess_modifier;
8376                 }
8377                 else if (flagsp == &negflags) {
8378                     goto neg_modifier;
8379                 }
8380                 cs = REGEX_LOCALE_CHARSET;
8381                 has_charset_modifier = LOCALE_PAT_MOD;
8382                 RExC_contains_locale = 1;
8383                 break;
8384             case UNICODE_PAT_MOD:
8385                 if (has_charset_modifier) {
8386                     goto excess_modifier;
8387                 }
8388                 else if (flagsp == &negflags) {
8389                     goto neg_modifier;
8390                 }
8391                 cs = REGEX_UNICODE_CHARSET;
8392                 has_charset_modifier = UNICODE_PAT_MOD;
8393                 break;
8394             case ASCII_RESTRICT_PAT_MOD:
8395                 if (flagsp == &negflags) {
8396                     goto neg_modifier;
8397                 }
8398                 if (has_charset_modifier) {
8399                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8400                         goto excess_modifier;
8401                     }
8402                     /* Doubled modifier implies more restricted */
8403                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8404                 }
8405                 else {
8406                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8407                 }
8408                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8409                 break;
8410             case DEPENDS_PAT_MOD:
8411                 if (has_use_defaults) {
8412                     goto fail_modifiers;
8413                 }
8414                 else if (flagsp == &negflags) {
8415                     goto neg_modifier;
8416                 }
8417                 else if (has_charset_modifier) {
8418                     goto excess_modifier;
8419                 }
8420
8421                 /* The dual charset means unicode semantics if the
8422                  * pattern (or target, not known until runtime) are
8423                  * utf8, or something in the pattern indicates unicode
8424                  * semantics */
8425                 cs = (RExC_utf8 || RExC_uni_semantics)
8426                      ? REGEX_UNICODE_CHARSET
8427                      : REGEX_DEPENDS_CHARSET;
8428                 has_charset_modifier = DEPENDS_PAT_MOD;
8429                 break;
8430             excess_modifier:
8431                 RExC_parse++;
8432                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8433                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8434                 }
8435                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8436                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8437                 }
8438                 else {
8439                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8440                 }
8441                 /*NOTREACHED*/
8442             neg_modifier:
8443                 RExC_parse++;
8444                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8445                 /*NOTREACHED*/
8446             case ONCE_PAT_MOD: /* 'o' */
8447             case GLOBAL_PAT_MOD: /* 'g' */
8448                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8449                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8450                     if (! (wastedflags & wflagbit) ) {
8451                         wastedflags |= wflagbit;
8452                         vWARN5(
8453                             RExC_parse + 1,
8454                             "Useless (%s%c) - %suse /%c modifier",
8455                             flagsp == &negflags ? "?-" : "?",
8456                             *RExC_parse,
8457                             flagsp == &negflags ? "don't " : "",
8458                             *RExC_parse
8459                         );
8460                     }
8461                 }
8462                 break;
8463
8464             case CONTINUE_PAT_MOD: /* 'c' */
8465                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8466                     if (! (wastedflags & WASTED_C) ) {
8467                         wastedflags |= WASTED_GC;
8468                         vWARN3(
8469                             RExC_parse + 1,
8470                             "Useless (%sc) - %suse /gc modifier",
8471                             flagsp == &negflags ? "?-" : "?",
8472                             flagsp == &negflags ? "don't " : ""
8473                         );
8474                     }
8475                 }
8476                 break;
8477             case KEEPCOPY_PAT_MOD: /* 'p' */
8478                 if (flagsp == &negflags) {
8479                     if (SIZE_ONLY)
8480                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8481                 } else {
8482                     *flagsp |= RXf_PMf_KEEPCOPY;
8483                 }
8484                 break;
8485             case '-':
8486                 /* A flag is a default iff it is following a minus, so
8487                  * if there is a minus, it means will be trying to
8488                  * re-specify a default which is an error */
8489                 if (has_use_defaults || flagsp == &negflags) {
8490                     goto fail_modifiers;
8491                 }
8492                 flagsp = &negflags;
8493                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8494                 break;
8495             case ':':
8496             case ')':
8497                 RExC_flags |= posflags;
8498                 RExC_flags &= ~negflags;
8499                 set_regex_charset(&RExC_flags, cs);
8500                 return;
8501                 /*NOTREACHED*/
8502             default:
8503             fail_modifiers:
8504                 RExC_parse++;
8505                 vFAIL3("Sequence (%.*s...) not recognized",
8506                        RExC_parse-seqstart, seqstart);
8507                 /*NOTREACHED*/
8508         }
8509
8510         ++RExC_parse;
8511     }
8512 }
8513
8514 /*
8515  - reg - regular expression, i.e. main body or parenthesized thing
8516  *
8517  * Caller must absorb opening parenthesis.
8518  *
8519  * Combining parenthesis handling with the base level of regular expression
8520  * is a trifle forced, but the need to tie the tails of the branches to what
8521  * follows makes it hard to avoid.
8522  */
8523 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8524 #ifdef DEBUGGING
8525 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8526 #else
8527 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8528 #endif
8529
8530 STATIC regnode *
8531 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8532     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8533 {
8534     dVAR;
8535     regnode *ret;               /* Will be the head of the group. */
8536     regnode *br;
8537     regnode *lastbr;
8538     regnode *ender = NULL;
8539     I32 parno = 0;
8540     I32 flags;
8541     U32 oregflags = RExC_flags;
8542     bool have_branch = 0;
8543     bool is_open = 0;
8544     I32 freeze_paren = 0;
8545     I32 after_freeze = 0;
8546
8547     char * parse_start = RExC_parse; /* MJD */
8548     char * const oregcomp_parse = RExC_parse;
8549
8550     GET_RE_DEBUG_FLAGS_DECL;
8551
8552     PERL_ARGS_ASSERT_REG;
8553     DEBUG_PARSE("reg ");
8554
8555     *flagp = 0;                         /* Tentatively. */
8556
8557
8558     /* Make an OPEN node, if parenthesized. */
8559     if (paren) {
8560         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8561             char *start_verb = RExC_parse;
8562             STRLEN verb_len = 0;
8563             char *start_arg = NULL;
8564             unsigned char op = 0;
8565             int argok = 1;
8566             int internal_argval = 0; /* internal_argval is only useful if !argok */
8567             while ( *RExC_parse && *RExC_parse != ')' ) {
8568                 if ( *RExC_parse == ':' ) {
8569                     start_arg = RExC_parse + 1;
8570                     break;
8571                 }
8572                 RExC_parse++;
8573             }
8574             ++start_verb;
8575             verb_len = RExC_parse - start_verb;
8576             if ( start_arg ) {
8577                 RExC_parse++;
8578                 while ( *RExC_parse && *RExC_parse != ')' )
8579                     RExC_parse++;
8580                 if ( *RExC_parse != ')' )
8581                     vFAIL("Unterminated verb pattern argument");
8582                 if ( RExC_parse == start_arg )
8583                     start_arg = NULL;
8584             } else {
8585                 if ( *RExC_parse != ')' )
8586                     vFAIL("Unterminated verb pattern");
8587             }
8588
8589             switch ( *start_verb ) {
8590             case 'A':  /* (*ACCEPT) */
8591                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8592                     op = ACCEPT;
8593                     internal_argval = RExC_nestroot;
8594                 }
8595                 break;
8596             case 'C':  /* (*COMMIT) */
8597                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8598                     op = COMMIT;
8599                 break;
8600             case 'F':  /* (*FAIL) */
8601                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8602                     op = OPFAIL;
8603                     argok = 0;
8604                 }
8605                 break;
8606             case ':':  /* (*:NAME) */
8607             case 'M':  /* (*MARK:NAME) */
8608                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8609                     op = MARKPOINT;
8610                     argok = -1;
8611                 }
8612                 break;
8613             case 'P':  /* (*PRUNE) */
8614                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8615                     op = PRUNE;
8616                 break;
8617             case 'S':   /* (*SKIP) */
8618                 if ( memEQs(start_verb,verb_len,"SKIP") )
8619                     op = SKIP;
8620                 break;
8621             case 'T':  /* (*THEN) */
8622                 /* [19:06] <TimToady> :: is then */
8623                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8624                     op = CUTGROUP;
8625                     RExC_seen |= REG_SEEN_CUTGROUP;
8626                 }
8627                 break;
8628             }
8629             if ( ! op ) {
8630                 RExC_parse++;
8631                 vFAIL3("Unknown verb pattern '%.*s'",
8632                     verb_len, start_verb);
8633             }
8634             if ( argok ) {
8635                 if ( start_arg && internal_argval ) {
8636                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8637                         verb_len, start_verb);
8638                 } else if ( argok < 0 && !start_arg ) {
8639                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8640                         verb_len, start_verb);
8641                 } else {
8642                     ret = reganode(pRExC_state, op, internal_argval);
8643                     if ( ! internal_argval && ! SIZE_ONLY ) {
8644                         if (start_arg) {
8645                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8646                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8647                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8648                             ret->flags = 0;
8649                         } else {
8650                             ret->flags = 1;
8651                         }
8652                     }
8653                 }
8654                 if (!internal_argval)
8655                     RExC_seen |= REG_SEEN_VERBARG;
8656             } else if ( start_arg ) {
8657                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8658                         verb_len, start_verb);
8659             } else {
8660                 ret = reg_node(pRExC_state, op);
8661             }
8662             nextchar(pRExC_state);
8663             return ret;
8664         } else
8665         if (*RExC_parse == '?') { /* (?...) */
8666             bool is_logical = 0;
8667             const char * const seqstart = RExC_parse;
8668
8669             RExC_parse++;
8670             paren = *RExC_parse++;
8671             ret = NULL;                 /* For look-ahead/behind. */
8672             switch (paren) {
8673
8674             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8675                 paren = *RExC_parse++;
8676                 if ( paren == '<')         /* (?P<...>) named capture */
8677                     goto named_capture;
8678                 else if (paren == '>') {   /* (?P>name) named recursion */
8679                     goto named_recursion;
8680                 }
8681                 else if (paren == '=') {   /* (?P=...)  named backref */
8682                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8683                        you change this make sure you change that */
8684                     char* name_start = RExC_parse;
8685                     U32 num = 0;
8686                     SV *sv_dat = reg_scan_name(pRExC_state,
8687                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8688                     if (RExC_parse == name_start || *RExC_parse != ')')
8689                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8690
8691                     if (!SIZE_ONLY) {
8692                         num = add_data( pRExC_state, 1, "S" );
8693                         RExC_rxi->data->data[num]=(void*)sv_dat;
8694                         SvREFCNT_inc_simple_void(sv_dat);
8695                     }
8696                     RExC_sawback = 1;
8697                     ret = reganode(pRExC_state,
8698                                    ((! FOLD)
8699                                      ? NREF
8700                                      : (ASCII_FOLD_RESTRICTED)
8701                                        ? NREFFA
8702                                        : (AT_LEAST_UNI_SEMANTICS)
8703                                          ? NREFFU
8704                                          : (LOC)
8705                                            ? NREFFL
8706                                            : NREFF),
8707                                     num);
8708                     *flagp |= HASWIDTH;
8709
8710                     Set_Node_Offset(ret, parse_start+1);
8711                     Set_Node_Cur_Length(ret); /* MJD */
8712
8713                     nextchar(pRExC_state);
8714                     return ret;
8715                 }
8716                 RExC_parse++;
8717                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8718                 /*NOTREACHED*/
8719             case '<':           /* (?<...) */
8720                 if (*RExC_parse == '!')
8721                     paren = ',';
8722                 else if (*RExC_parse != '=')
8723               named_capture:
8724                 {               /* (?<...>) */
8725                     char *name_start;
8726                     SV *svname;
8727                     paren= '>';
8728             case '\'':          /* (?'...') */
8729                     name_start= RExC_parse;
8730                     svname = reg_scan_name(pRExC_state,
8731                         SIZE_ONLY ?  /* reverse test from the others */
8732                         REG_RSN_RETURN_NAME :
8733                         REG_RSN_RETURN_NULL);
8734                     if (RExC_parse == name_start) {
8735                         RExC_parse++;
8736                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8737                         /*NOTREACHED*/
8738                     }
8739                     if (*RExC_parse != paren)
8740                         vFAIL2("Sequence (?%c... not terminated",
8741                             paren=='>' ? '<' : paren);
8742                     if (SIZE_ONLY) {
8743                         HE *he_str;
8744                         SV *sv_dat = NULL;
8745                         if (!svname) /* shouldn't happen */
8746                             Perl_croak(aTHX_
8747                                 "panic: reg_scan_name returned NULL");
8748                         if (!RExC_paren_names) {
8749                             RExC_paren_names= newHV();
8750                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8751 #ifdef DEBUGGING
8752                             RExC_paren_name_list= newAV();
8753                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8754 #endif
8755                         }
8756                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8757                         if ( he_str )
8758                             sv_dat = HeVAL(he_str);
8759                         if ( ! sv_dat ) {
8760                             /* croak baby croak */
8761                             Perl_croak(aTHX_
8762                                 "panic: paren_name hash element allocation failed");
8763                         } else if ( SvPOK(sv_dat) ) {
8764                             /* (?|...) can mean we have dupes so scan to check
8765                                its already been stored. Maybe a flag indicating
8766                                we are inside such a construct would be useful,
8767                                but the arrays are likely to be quite small, so
8768                                for now we punt -- dmq */
8769                             IV count = SvIV(sv_dat);
8770                             I32 *pv = (I32*)SvPVX(sv_dat);
8771                             IV i;
8772                             for ( i = 0 ; i < count ; i++ ) {
8773                                 if ( pv[i] == RExC_npar ) {
8774                                     count = 0;
8775                                     break;
8776                                 }
8777                             }
8778                             if ( count ) {
8779                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8780                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8781                                 pv[count] = RExC_npar;
8782                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8783                             }
8784                         } else {
8785                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8786                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8787                             SvIOK_on(sv_dat);
8788                             SvIV_set(sv_dat, 1);
8789                         }
8790 #ifdef DEBUGGING
8791                         /* Yes this does cause a memory leak in debugging Perls */
8792                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8793                             SvREFCNT_dec_NN(svname);
8794 #endif
8795
8796                         /*sv_dump(sv_dat);*/
8797                     }
8798                     nextchar(pRExC_state);
8799                     paren = 1;
8800                     goto capturing_parens;
8801                 }
8802                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8803                 RExC_in_lookbehind++;
8804                 RExC_parse++;
8805             case '=':           /* (?=...) */
8806                 RExC_seen_zerolen++;
8807                 break;
8808             case '!':           /* (?!...) */
8809                 RExC_seen_zerolen++;
8810                 if (*RExC_parse == ')') {
8811                     ret=reg_node(pRExC_state, OPFAIL);
8812                     nextchar(pRExC_state);
8813                     return ret;
8814                 }
8815                 break;
8816             case '|':           /* (?|...) */
8817                 /* branch reset, behave like a (?:...) except that
8818                    buffers in alternations share the same numbers */
8819                 paren = ':';
8820                 after_freeze = freeze_paren = RExC_npar;
8821                 break;
8822             case ':':           /* (?:...) */
8823             case '>':           /* (?>...) */
8824                 break;
8825             case '$':           /* (?$...) */
8826             case '@':           /* (?@...) */
8827                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8828                 break;
8829             case '#':           /* (?#...) */
8830                 while (*RExC_parse && *RExC_parse != ')')
8831                     RExC_parse++;
8832                 if (*RExC_parse != ')')
8833                     FAIL("Sequence (?#... not terminated");
8834                 nextchar(pRExC_state);
8835                 *flagp = TRYAGAIN;
8836                 return NULL;
8837             case '0' :           /* (?0) */
8838             case 'R' :           /* (?R) */
8839                 if (*RExC_parse != ')')
8840                     FAIL("Sequence (?R) not terminated");
8841                 ret = reg_node(pRExC_state, GOSTART);
8842                 *flagp |= POSTPONED;
8843                 nextchar(pRExC_state);
8844                 return ret;
8845                 /*notreached*/
8846             { /* named and numeric backreferences */
8847                 I32 num;
8848             case '&':            /* (?&NAME) */
8849                 parse_start = RExC_parse - 1;
8850               named_recursion:
8851                 {
8852                     SV *sv_dat = reg_scan_name(pRExC_state,
8853                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8854                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8855                 }
8856                 goto gen_recurse_regop;
8857                 assert(0); /* NOT REACHED */
8858             case '+':
8859                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8860                     RExC_parse++;
8861                     vFAIL("Illegal pattern");
8862                 }
8863                 goto parse_recursion;
8864                 /* NOT REACHED*/
8865             case '-': /* (?-1) */
8866                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8867                     RExC_parse--; /* rewind to let it be handled later */
8868                     goto parse_flags;
8869                 }
8870                 /*FALLTHROUGH */
8871             case '1': case '2': case '3': case '4': /* (?1) */
8872             case '5': case '6': case '7': case '8': case '9':
8873                 RExC_parse--;
8874               parse_recursion:
8875                 num = atoi(RExC_parse);
8876                 parse_start = RExC_parse - 1; /* MJD */
8877                 if (*RExC_parse == '-')
8878                     RExC_parse++;
8879                 while (isDIGIT(*RExC_parse))
8880                         RExC_parse++;
8881                 if (*RExC_parse!=')')
8882                     vFAIL("Expecting close bracket");
8883
8884               gen_recurse_regop:
8885                 if ( paren == '-' ) {
8886                     /*
8887                     Diagram of capture buffer numbering.
8888                     Top line is the normal capture buffer numbers
8889                     Bottom line is the negative indexing as from
8890                     the X (the (?-2))
8891
8892                     +   1 2    3 4 5 X          6 7
8893                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8894                     -   5 4    3 2 1 X          x x
8895
8896                     */
8897                     num = RExC_npar + num;
8898                     if (num < 1)  {
8899                         RExC_parse++;
8900                         vFAIL("Reference to nonexistent group");
8901                     }
8902                 } else if ( paren == '+' ) {
8903                     num = RExC_npar + num - 1;
8904                 }
8905
8906                 ret = reganode(pRExC_state, GOSUB, num);
8907                 if (!SIZE_ONLY) {
8908                     if (num > (I32)RExC_rx->nparens) {
8909                         RExC_parse++;
8910                         vFAIL("Reference to nonexistent group");
8911                     }
8912                     ARG2L_SET( ret, RExC_recurse_count++);
8913                     RExC_emit++;
8914                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8915                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8916                 } else {
8917                     RExC_size++;
8918                 }
8919                 RExC_seen |= REG_SEEN_RECURSE;
8920                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8921                 Set_Node_Offset(ret, parse_start); /* MJD */
8922
8923                 *flagp |= POSTPONED;
8924                 nextchar(pRExC_state);
8925                 return ret;
8926             } /* named and numeric backreferences */
8927             assert(0); /* NOT REACHED */
8928
8929             case '?':           /* (??...) */
8930                 is_logical = 1;
8931                 if (*RExC_parse != '{') {
8932                     RExC_parse++;
8933                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8934                     /*NOTREACHED*/
8935                 }
8936                 *flagp |= POSTPONED;
8937                 paren = *RExC_parse++;
8938                 /* FALL THROUGH */
8939             case '{':           /* (?{...}) */
8940             {
8941                 U32 n = 0;
8942                 struct reg_code_block *cb;
8943
8944                 RExC_seen_zerolen++;
8945
8946                 if (   !pRExC_state->num_code_blocks
8947                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8948                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8949                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8950                             - RExC_start)
8951                 ) {
8952                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8953                         FAIL("panic: Sequence (?{...}): no code block found\n");
8954                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8955                 }
8956                 /* this is a pre-compiled code block (?{...}) */
8957                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8958                 RExC_parse = RExC_start + cb->end;
8959                 if (!SIZE_ONLY) {
8960                     OP *o = cb->block;
8961                     if (cb->src_regex) {
8962                         n = add_data(pRExC_state, 2, "rl");
8963                         RExC_rxi->data->data[n] =
8964                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8965                         RExC_rxi->data->data[n+1] = (void*)o;
8966                     }
8967                     else {
8968                         n = add_data(pRExC_state, 1,
8969                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8970                         RExC_rxi->data->data[n] = (void*)o;
8971                     }
8972                 }
8973                 pRExC_state->code_index++;
8974                 nextchar(pRExC_state);
8975
8976                 if (is_logical) {
8977                     regnode *eval;
8978                     ret = reg_node(pRExC_state, LOGICAL);
8979                     eval = reganode(pRExC_state, EVAL, n);
8980                     if (!SIZE_ONLY) {
8981                         ret->flags = 2;
8982                         /* for later propagation into (??{}) return value */
8983                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8984                     }
8985                     REGTAIL(pRExC_state, ret, eval);
8986                     /* deal with the length of this later - MJD */
8987                     return ret;
8988                 }
8989                 ret = reganode(pRExC_state, EVAL, n);
8990                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8991                 Set_Node_Offset(ret, parse_start);
8992                 return ret;
8993             }
8994             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8995             {
8996                 int is_define= 0;
8997                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8998                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8999                         || RExC_parse[1] == '<'
9000                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9001                         I32 flag;
9002
9003                         ret = reg_node(pRExC_state, LOGICAL);
9004                         if (!SIZE_ONLY)
9005                             ret->flags = 1;
9006                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
9007                         goto insert_if;
9008                     }
9009                 }
9010                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9011                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9012                 {
9013                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9014                     char *name_start= RExC_parse++;
9015                     U32 num = 0;
9016                     SV *sv_dat=reg_scan_name(pRExC_state,
9017                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9018                     if (RExC_parse == name_start || *RExC_parse != ch)
9019                         vFAIL2("Sequence (?(%c... not terminated",
9020                             (ch == '>' ? '<' : ch));
9021                     RExC_parse++;
9022                     if (!SIZE_ONLY) {
9023                         num = add_data( pRExC_state, 1, "S" );
9024                         RExC_rxi->data->data[num]=(void*)sv_dat;
9025                         SvREFCNT_inc_simple_void(sv_dat);
9026                     }
9027                     ret = reganode(pRExC_state,NGROUPP,num);
9028                     goto insert_if_check_paren;
9029                 }
9030                 else if (RExC_parse[0] == 'D' &&
9031                          RExC_parse[1] == 'E' &&
9032                          RExC_parse[2] == 'F' &&
9033                          RExC_parse[3] == 'I' &&
9034                          RExC_parse[4] == 'N' &&
9035                          RExC_parse[5] == 'E')
9036                 {
9037                     ret = reganode(pRExC_state,DEFINEP,0);
9038                     RExC_parse +=6 ;
9039                     is_define = 1;
9040                     goto insert_if_check_paren;
9041                 }
9042                 else if (RExC_parse[0] == 'R') {
9043                     RExC_parse++;
9044                     parno = 0;
9045                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9046                         parno = atoi(RExC_parse++);
9047                         while (isDIGIT(*RExC_parse))
9048                             RExC_parse++;
9049                     } else if (RExC_parse[0] == '&') {
9050                         SV *sv_dat;
9051                         RExC_parse++;
9052                         sv_dat = reg_scan_name(pRExC_state,
9053                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9054                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9055                     }
9056                     ret = reganode(pRExC_state,INSUBP,parno);
9057                     goto insert_if_check_paren;
9058                 }
9059                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9060                     /* (?(1)...) */
9061                     char c;
9062                     parno = atoi(RExC_parse++);
9063
9064                     while (isDIGIT(*RExC_parse))
9065                         RExC_parse++;
9066                     ret = reganode(pRExC_state, GROUPP, parno);
9067
9068                  insert_if_check_paren:
9069                     if ((c = *nextchar(pRExC_state)) != ')')
9070                         vFAIL("Switch condition not recognized");
9071                   insert_if:
9072                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9073                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9074                     if (br == NULL)
9075                         br = reganode(pRExC_state, LONGJMP, 0);
9076                     else
9077                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9078                     c = *nextchar(pRExC_state);
9079                     if (flags&HASWIDTH)
9080                         *flagp |= HASWIDTH;
9081                     if (c == '|') {
9082                         if (is_define)
9083                             vFAIL("(?(DEFINE)....) does not allow branches");
9084                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9085                         regbranch(pRExC_state, &flags, 1,depth+1);
9086                         REGTAIL(pRExC_state, ret, lastbr);
9087                         if (flags&HASWIDTH)
9088                             *flagp |= HASWIDTH;
9089                         c = *nextchar(pRExC_state);
9090                     }
9091                     else
9092                         lastbr = NULL;
9093                     if (c != ')')
9094                         vFAIL("Switch (?(condition)... contains too many branches");
9095                     ender = reg_node(pRExC_state, TAIL);
9096                     REGTAIL(pRExC_state, br, ender);
9097                     if (lastbr) {
9098                         REGTAIL(pRExC_state, lastbr, ender);
9099                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9100                     }
9101                     else
9102                         REGTAIL(pRExC_state, ret, ender);
9103                     RExC_size++; /* XXX WHY do we need this?!!
9104                                     For large programs it seems to be required
9105                                     but I can't figure out why. -- dmq*/
9106                     return ret;
9107                 }
9108                 else {
9109                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9110                 }
9111             }
9112             case '[':           /* (?[ ... ]) */
9113                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9114                                          oregcomp_parse);
9115             case 0:
9116                 RExC_parse--; /* for vFAIL to print correctly */
9117                 vFAIL("Sequence (? incomplete");
9118                 break;
9119             default: /* e.g., (?i) */
9120                 --RExC_parse;
9121               parse_flags:
9122                 parse_lparen_question_flags(pRExC_state);
9123                 if (UCHARAT(RExC_parse) != ':') {
9124                     nextchar(pRExC_state);
9125                     *flagp = TRYAGAIN;
9126                     return NULL;
9127                 }
9128                 paren = ':';
9129                 nextchar(pRExC_state);
9130                 ret = NULL;
9131                 goto parse_rest;
9132             } /* end switch */
9133         }
9134         else {                  /* (...) */
9135           capturing_parens:
9136             parno = RExC_npar;
9137             RExC_npar++;
9138
9139             ret = reganode(pRExC_state, OPEN, parno);
9140             if (!SIZE_ONLY ){
9141                 if (!RExC_nestroot)
9142                     RExC_nestroot = parno;
9143                 if (RExC_seen & REG_SEEN_RECURSE
9144                     && !RExC_open_parens[parno-1])
9145                 {
9146                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9147                         "Setting open paren #%"IVdf" to %d\n",
9148                         (IV)parno, REG_NODE_NUM(ret)));
9149                     RExC_open_parens[parno-1]= ret;
9150                 }
9151             }
9152             Set_Node_Length(ret, 1); /* MJD */
9153             Set_Node_Offset(ret, RExC_parse); /* MJD */
9154             is_open = 1;
9155         }
9156     }
9157     else                        /* ! paren */
9158         ret = NULL;
9159
9160    parse_rest:
9161     /* Pick up the branches, linking them together. */
9162     parse_start = RExC_parse;   /* MJD */
9163     br = regbranch(pRExC_state, &flags, 1,depth+1);
9164
9165     /*     branch_len = (paren != 0); */
9166
9167     if (br == NULL)
9168         return(NULL);
9169     if (*RExC_parse == '|') {
9170         if (!SIZE_ONLY && RExC_extralen) {
9171             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9172         }
9173         else {                  /* MJD */
9174             reginsert(pRExC_state, BRANCH, br, depth+1);
9175             Set_Node_Length(br, paren != 0);
9176             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9177         }
9178         have_branch = 1;
9179         if (SIZE_ONLY)
9180             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9181     }
9182     else if (paren == ':') {
9183         *flagp |= flags&SIMPLE;
9184     }
9185     if (is_open) {                              /* Starts with OPEN. */
9186         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9187     }
9188     else if (paren != '?')              /* Not Conditional */
9189         ret = br;
9190     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9191     lastbr = br;
9192     while (*RExC_parse == '|') {
9193         if (!SIZE_ONLY && RExC_extralen) {
9194             ender = reganode(pRExC_state, LONGJMP,0);
9195             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9196         }
9197         if (SIZE_ONLY)
9198             RExC_extralen += 2;         /* Account for LONGJMP. */
9199         nextchar(pRExC_state);
9200         if (freeze_paren) {
9201             if (RExC_npar > after_freeze)
9202                 after_freeze = RExC_npar;
9203             RExC_npar = freeze_paren;
9204         }
9205         br = regbranch(pRExC_state, &flags, 0, depth+1);
9206
9207         if (br == NULL)
9208             return(NULL);
9209         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9210         lastbr = br;
9211         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9212     }
9213
9214     if (have_branch || paren != ':') {
9215         /* Make a closing node, and hook it on the end. */
9216         switch (paren) {
9217         case ':':
9218             ender = reg_node(pRExC_state, TAIL);
9219             break;
9220         case 1:
9221             ender = reganode(pRExC_state, CLOSE, parno);
9222             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9223                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9224                         "Setting close paren #%"IVdf" to %d\n",
9225                         (IV)parno, REG_NODE_NUM(ender)));
9226                 RExC_close_parens[parno-1]= ender;
9227                 if (RExC_nestroot == parno)
9228                     RExC_nestroot = 0;
9229             }
9230             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9231             Set_Node_Length(ender,1); /* MJD */
9232             break;
9233         case '<':
9234         case ',':
9235         case '=':
9236         case '!':
9237             *flagp &= ~HASWIDTH;
9238             /* FALL THROUGH */
9239         case '>':
9240             ender = reg_node(pRExC_state, SUCCEED);
9241             break;
9242         case 0:
9243             ender = reg_node(pRExC_state, END);
9244             if (!SIZE_ONLY) {
9245                 assert(!RExC_opend); /* there can only be one! */
9246                 RExC_opend = ender;
9247             }
9248             break;
9249         }
9250         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9251             SV * const mysv_val1=sv_newmortal();
9252             SV * const mysv_val2=sv_newmortal();
9253             DEBUG_PARSE_MSG("lsbr");
9254             regprop(RExC_rx, mysv_val1, lastbr);
9255             regprop(RExC_rx, mysv_val2, ender);
9256             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9257                           SvPV_nolen_const(mysv_val1),
9258                           (IV)REG_NODE_NUM(lastbr),
9259                           SvPV_nolen_const(mysv_val2),
9260                           (IV)REG_NODE_NUM(ender),
9261                           (IV)(ender - lastbr)
9262             );
9263         });
9264         REGTAIL(pRExC_state, lastbr, ender);
9265
9266         if (have_branch && !SIZE_ONLY) {
9267             char is_nothing= 1;
9268             if (depth==1)
9269                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9270
9271             /* Hook the tails of the branches to the closing node. */
9272             for (br = ret; br; br = regnext(br)) {
9273                 const U8 op = PL_regkind[OP(br)];
9274                 if (op == BRANCH) {
9275                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9276                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9277                         is_nothing= 0;
9278                 }
9279                 else if (op == BRANCHJ) {
9280                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9281                     /* for now we always disable this optimisation * /
9282                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9283                     */
9284                         is_nothing= 0;
9285                 }
9286             }
9287             if (is_nothing) {
9288                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9289                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9290                     SV * const mysv_val1=sv_newmortal();
9291                     SV * const mysv_val2=sv_newmortal();
9292                     DEBUG_PARSE_MSG("NADA");
9293                     regprop(RExC_rx, mysv_val1, ret);
9294                     regprop(RExC_rx, mysv_val2, ender);
9295                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9296                                   SvPV_nolen_const(mysv_val1),
9297                                   (IV)REG_NODE_NUM(ret),
9298                                   SvPV_nolen_const(mysv_val2),
9299                                   (IV)REG_NODE_NUM(ender),
9300                                   (IV)(ender - ret)
9301                     );
9302                 });
9303                 OP(br)= NOTHING;
9304                 if (OP(ender) == TAIL) {
9305                     NEXT_OFF(br)= 0;
9306                     RExC_emit= br + 1;
9307                 } else {
9308                     regnode *opt;
9309                     for ( opt= br + 1; opt < ender ; opt++ )
9310                         OP(opt)= OPTIMIZED;
9311                     NEXT_OFF(br)= ender - br;
9312                 }
9313             }
9314         }
9315     }
9316
9317     {
9318         const char *p;
9319         static const char parens[] = "=!<,>";
9320
9321         if (paren && (p = strchr(parens, paren))) {
9322             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9323             int flag = (p - parens) > 1;
9324
9325             if (paren == '>')
9326                 node = SUSPEND, flag = 0;
9327             reginsert(pRExC_state, node,ret, depth+1);
9328             Set_Node_Cur_Length(ret);
9329             Set_Node_Offset(ret, parse_start + 1);
9330             ret->flags = flag;
9331             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9332         }
9333     }
9334
9335     /* Check for proper termination. */
9336     if (paren) {
9337         RExC_flags = oregflags;
9338         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9339             RExC_parse = oregcomp_parse;
9340             vFAIL("Unmatched (");
9341         }
9342     }
9343     else if (!paren && RExC_parse < RExC_end) {
9344         if (*RExC_parse == ')') {
9345             RExC_parse++;
9346             vFAIL("Unmatched )");
9347         }
9348         else
9349             FAIL("Junk on end of regexp");      /* "Can't happen". */
9350         assert(0); /* NOTREACHED */
9351     }
9352
9353     if (RExC_in_lookbehind) {
9354         RExC_in_lookbehind--;
9355     }
9356     if (after_freeze > RExC_npar)
9357         RExC_npar = after_freeze;
9358     return(ret);
9359 }
9360
9361 /*
9362  - regbranch - one alternative of an | operator
9363  *
9364  * Implements the concatenation operator.
9365  */
9366 STATIC regnode *
9367 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9368 {
9369     dVAR;
9370     regnode *ret;
9371     regnode *chain = NULL;
9372     regnode *latest;
9373     I32 flags = 0, c = 0;
9374     GET_RE_DEBUG_FLAGS_DECL;
9375
9376     PERL_ARGS_ASSERT_REGBRANCH;
9377
9378     DEBUG_PARSE("brnc");
9379
9380     if (first)
9381         ret = NULL;
9382     else {
9383         if (!SIZE_ONLY && RExC_extralen)
9384             ret = reganode(pRExC_state, BRANCHJ,0);
9385         else {
9386             ret = reg_node(pRExC_state, BRANCH);
9387             Set_Node_Length(ret, 1);
9388         }
9389     }
9390
9391     if (!first && SIZE_ONLY)
9392         RExC_extralen += 1;                     /* BRANCHJ */
9393
9394     *flagp = WORST;                     /* Tentatively. */
9395
9396     RExC_parse--;
9397     nextchar(pRExC_state);
9398     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9399         flags &= ~TRYAGAIN;
9400         latest = regpiece(pRExC_state, &flags,depth+1);
9401         if (latest == NULL) {
9402             if (flags & TRYAGAIN)
9403                 continue;
9404             return(NULL);
9405         }
9406         else if (ret == NULL)
9407             ret = latest;
9408         *flagp |= flags&(HASWIDTH|POSTPONED);
9409         if (chain == NULL)      /* First piece. */
9410             *flagp |= flags&SPSTART;
9411         else {
9412             RExC_naughty++;
9413             REGTAIL(pRExC_state, chain, latest);
9414         }
9415         chain = latest;
9416         c++;
9417     }
9418     if (chain == NULL) {        /* Loop ran zero times. */
9419         chain = reg_node(pRExC_state, NOTHING);
9420         if (ret == NULL)
9421             ret = chain;
9422     }
9423     if (c == 1) {
9424         *flagp |= flags&SIMPLE;
9425     }
9426
9427     return ret;
9428 }
9429
9430 /*
9431  - regpiece - something followed by possible [*+?]
9432  *
9433  * Note that the branching code sequences used for ? and the general cases
9434  * of * and + are somewhat optimized:  they use the same NOTHING node as
9435  * both the endmarker for their branch list and the body of the last branch.
9436  * It might seem that this node could be dispensed with entirely, but the
9437  * endmarker role is not redundant.
9438  */
9439 STATIC regnode *
9440 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9441 {
9442     dVAR;
9443     regnode *ret;
9444     char op;
9445     char *next;
9446     I32 flags;
9447     const char * const origparse = RExC_parse;
9448     I32 min;
9449     I32 max = REG_INFTY;
9450 #ifdef RE_TRACK_PATTERN_OFFSETS
9451     char *parse_start;
9452 #endif
9453     const char *maxpos = NULL;
9454
9455     /* Save the original in case we change the emitted regop to a FAIL. */
9456     regnode * const orig_emit = RExC_emit;
9457
9458     GET_RE_DEBUG_FLAGS_DECL;
9459
9460     PERL_ARGS_ASSERT_REGPIECE;
9461
9462     DEBUG_PARSE("piec");
9463
9464     ret = regatom(pRExC_state, &flags,depth+1);
9465     if (ret == NULL) {
9466         if (flags & TRYAGAIN)
9467             *flagp |= TRYAGAIN;
9468         return(NULL);
9469     }
9470
9471     op = *RExC_parse;
9472
9473     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9474         maxpos = NULL;
9475 #ifdef RE_TRACK_PATTERN_OFFSETS
9476         parse_start = RExC_parse; /* MJD */
9477 #endif
9478         next = RExC_parse + 1;
9479         while (isDIGIT(*next) || *next == ',') {
9480             if (*next == ',') {
9481                 if (maxpos)
9482                     break;
9483                 else
9484                     maxpos = next;
9485             }
9486             next++;
9487         }
9488         if (*next == '}') {             /* got one */
9489             if (!maxpos)
9490                 maxpos = next;
9491             RExC_parse++;
9492             min = atoi(RExC_parse);
9493             if (*maxpos == ',')
9494                 maxpos++;
9495             else
9496                 maxpos = RExC_parse;
9497             max = atoi(maxpos);
9498             if (!max && *maxpos != '0')
9499                 max = REG_INFTY;                /* meaning "infinity" */
9500             else if (max >= REG_INFTY)
9501                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9502             RExC_parse = next;
9503             nextchar(pRExC_state);
9504             if (max < min) {    /* If can't match, warn and optimize to fail
9505                                    unconditionally */
9506                 if (SIZE_ONLY) {
9507                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9508
9509                     /* We can't back off the size because we have to reserve
9510                      * enough space for all the things we are about to throw
9511                      * away, but we can shrink it by the ammount we are about
9512                      * to re-use here */
9513                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9514                 }
9515                 else {
9516                     RExC_emit = orig_emit;
9517                 }
9518                 ret = reg_node(pRExC_state, OPFAIL);
9519                 return ret;
9520             }
9521             else if (max == 0) {    /* replace {0} with a nothing node */
9522                 if (SIZE_ONLY) {
9523                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9524                 }
9525                 else {
9526                     RExC_emit = orig_emit;
9527                 }
9528                 ret = reg_node(pRExC_state, NOTHING);
9529                 return ret;
9530             }
9531
9532         do_curly:
9533             if ((flags&SIMPLE)) {
9534                 RExC_naughty += 2 + RExC_naughty / 2;
9535                 reginsert(pRExC_state, CURLY, ret, depth+1);
9536                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9537                 Set_Node_Cur_Length(ret);
9538             }
9539             else {
9540                 regnode * const w = reg_node(pRExC_state, WHILEM);
9541
9542                 w->flags = 0;
9543                 REGTAIL(pRExC_state, ret, w);
9544                 if (!SIZE_ONLY && RExC_extralen) {
9545                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9546                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9547                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9548                 }
9549                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9550                                 /* MJD hk */
9551                 Set_Node_Offset(ret, parse_start+1);
9552                 Set_Node_Length(ret,
9553                                 op == '{' ? (RExC_parse - parse_start) : 1);
9554
9555                 if (!SIZE_ONLY && RExC_extralen)
9556                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9557                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9558                 if (SIZE_ONLY)
9559                     RExC_whilem_seen++, RExC_extralen += 3;
9560                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9561             }
9562             ret->flags = 0;
9563
9564             if (min > 0)
9565                 *flagp = WORST;
9566             if (max > 0)
9567                 *flagp |= HASWIDTH;
9568             if (!SIZE_ONLY) {
9569                 ARG1_SET(ret, (U16)min);
9570                 ARG2_SET(ret, (U16)max);
9571             }
9572
9573             goto nest_check;
9574         }
9575     }
9576
9577     if (!ISMULT1(op)) {
9578         *flagp = flags;
9579         return(ret);
9580     }
9581
9582 #if 0                           /* Now runtime fix should be reliable. */
9583
9584     /* if this is reinstated, don't forget to put this back into perldiag:
9585
9586             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9587
9588            (F) The part of the regexp subject to either the * or + quantifier
9589            could match an empty string. The {#} shows in the regular
9590            expression about where the problem was discovered.
9591
9592     */
9593
9594     if (!(flags&HASWIDTH) && op != '?')
9595       vFAIL("Regexp *+ operand could be empty");
9596 #endif
9597
9598 #ifdef RE_TRACK_PATTERN_OFFSETS
9599     parse_start = RExC_parse;
9600 #endif
9601     nextchar(pRExC_state);
9602
9603     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9604
9605     if (op == '*' && (flags&SIMPLE)) {
9606         reginsert(pRExC_state, STAR, ret, depth+1);
9607         ret->flags = 0;
9608         RExC_naughty += 4;
9609     }
9610     else if (op == '*') {
9611         min = 0;
9612         goto do_curly;
9613     }
9614     else if (op == '+' && (flags&SIMPLE)) {
9615         reginsert(pRExC_state, PLUS, ret, depth+1);
9616         ret->flags = 0;
9617         RExC_naughty += 3;
9618     }
9619     else if (op == '+') {
9620         min = 1;
9621         goto do_curly;
9622     }
9623     else if (op == '?') {
9624         min = 0; max = 1;
9625         goto do_curly;
9626     }
9627   nest_check:
9628     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9629         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9630         ckWARN3reg(RExC_parse,
9631                    "%.*s matches null string many times",
9632                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9633                    origparse);
9634         (void)ReREFCNT_inc(RExC_rx_sv);
9635     }
9636
9637     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9638         nextchar(pRExC_state);
9639         reginsert(pRExC_state, MINMOD, ret, depth+1);
9640         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9641     }
9642 #ifndef REG_ALLOW_MINMOD_SUSPEND
9643     else
9644 #endif
9645     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9646         regnode *ender;
9647         nextchar(pRExC_state);
9648         ender = reg_node(pRExC_state, SUCCEED);
9649         REGTAIL(pRExC_state, ret, ender);
9650         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9651         ret->flags = 0;
9652         ender = reg_node(pRExC_state, TAIL);
9653         REGTAIL(pRExC_state, ret, ender);
9654         /*ret= ender;*/
9655     }
9656
9657     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9658         RExC_parse++;
9659         vFAIL("Nested quantifiers");
9660     }
9661
9662     return(ret);
9663 }
9664
9665 STATIC bool
9666 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9667         const bool strict   /* Apply stricter parsing rules? */
9668     )
9669 {
9670
9671  /* This is expected to be called by a parser routine that has recognized '\N'
9672    and needs to handle the rest. RExC_parse is expected to point at the first
9673    char following the N at the time of the call.  On successful return,
9674    RExC_parse has been updated to point to just after the sequence identified
9675    by this routine, and <*flagp> has been updated.
9676
9677    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9678    character class.
9679
9680    \N may begin either a named sequence, or if outside a character class, mean
9681    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9682    attempted to decide which, and in the case of a named sequence, converted it
9683    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9684    where c1... are the characters in the sequence.  For single-quoted regexes,
9685    the tokenizer passes the \N sequence through unchanged; this code will not
9686    attempt to determine this nor expand those, instead raising a syntax error.
9687    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9688    or there is no '}', it signals that this \N occurrence means to match a
9689    non-newline.
9690
9691    Only the \N{U+...} form should occur in a character class, for the same
9692    reason that '.' inside a character class means to just match a period: it
9693    just doesn't make sense.
9694
9695    The function raises an error (via vFAIL), and doesn't return for various
9696    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9697    success; it returns FALSE otherwise.
9698
9699    If <valuep> is non-null, it means the caller can accept an input sequence
9700    consisting of a just a single code point; <*valuep> is set to that value
9701    if the input is such.
9702
9703    If <node_p> is non-null it signifies that the caller can accept any other
9704    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9705    is set as follows:
9706     1) \N means not-a-NL: points to a newly created REG_ANY node;
9707     2) \N{}:              points to a new NOTHING node;
9708     3) otherwise:         points to a new EXACT node containing the resolved
9709                           string.
9710    Note that FALSE is returned for single code point sequences if <valuep> is
9711    null.
9712  */
9713
9714     char * endbrace;    /* '}' following the name */
9715     char* p;
9716     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9717                            stream */
9718     bool has_multiple_chars; /* true if the input stream contains a sequence of
9719                                 more than one character */
9720
9721     GET_RE_DEBUG_FLAGS_DECL;
9722
9723     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9724
9725     GET_RE_DEBUG_FLAGS;
9726
9727     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9728
9729     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9730      * modifier.  The other meaning does not */
9731     p = (RExC_flags & RXf_PMf_EXTENDED)
9732         ? regwhite( pRExC_state, RExC_parse )
9733         : RExC_parse;
9734
9735     /* Disambiguate between \N meaning a named character versus \N meaning
9736      * [^\n].  The former is assumed when it can't be the latter. */
9737     if (*p != '{' || regcurly(p, FALSE)) {
9738         RExC_parse = p;
9739         if (! node_p) {
9740             /* no bare \N in a charclass */
9741             if (in_char_class) {
9742                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9743             }
9744             return FALSE;
9745         }
9746         nextchar(pRExC_state);
9747         *node_p = reg_node(pRExC_state, REG_ANY);
9748         *flagp |= HASWIDTH|SIMPLE;
9749         RExC_naughty++;
9750         RExC_parse--;
9751         Set_Node_Length(*node_p, 1); /* MJD */
9752         return TRUE;
9753     }
9754
9755     /* Here, we have decided it should be a named character or sequence */
9756
9757     /* The test above made sure that the next real character is a '{', but
9758      * under the /x modifier, it could be separated by space (or a comment and
9759      * \n) and this is not allowed (for consistency with \x{...} and the
9760      * tokenizer handling of \N{NAME}). */
9761     if (*RExC_parse != '{') {
9762         vFAIL("Missing braces on \\N{}");
9763     }
9764
9765     RExC_parse++;       /* Skip past the '{' */
9766
9767     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9768         || ! (endbrace == RExC_parse            /* nothing between the {} */
9769               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9770                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9771     {
9772         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9773         vFAIL("\\N{NAME} must be resolved by the lexer");
9774     }
9775
9776     if (endbrace == RExC_parse) {   /* empty: \N{} */
9777         bool ret = TRUE;
9778         if (node_p) {
9779             *node_p = reg_node(pRExC_state,NOTHING);
9780         }
9781         else if (in_char_class) {
9782             if (SIZE_ONLY && in_char_class) {
9783                 if (strict) {
9784                     RExC_parse++;   /* Position after the "}" */
9785                     vFAIL("Zero length \\N{}");
9786                 }
9787                 else {
9788                     ckWARNreg(RExC_parse,
9789                               "Ignoring zero length \\N{} in character class");
9790                 }
9791             }
9792             ret = FALSE;
9793         }
9794         else {
9795             return FALSE;
9796         }
9797         nextchar(pRExC_state);
9798         return ret;
9799     }
9800
9801     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9802     RExC_parse += 2;    /* Skip past the 'U+' */
9803
9804     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9805
9806     /* Code points are separated by dots.  If none, there is only one code
9807      * point, and is terminated by the brace */
9808     has_multiple_chars = (endchar < endbrace);
9809
9810     if (valuep && (! has_multiple_chars || in_char_class)) {
9811         /* We only pay attention to the first char of
9812         multichar strings being returned in char classes. I kinda wonder
9813         if this makes sense as it does change the behaviour
9814         from earlier versions, OTOH that behaviour was broken
9815         as well. XXX Solution is to recharacterize as
9816         [rest-of-class]|multi1|multi2... */
9817
9818         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9819         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9820             | PERL_SCAN_DISALLOW_PREFIX
9821             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9822
9823         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9824
9825         /* The tokenizer should have guaranteed validity, but it's possible to
9826          * bypass it by using single quoting, so check */
9827         if (length_of_hex == 0
9828             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9829         {
9830             RExC_parse += length_of_hex;        /* Includes all the valid */
9831             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9832                             ? UTF8SKIP(RExC_parse)
9833                             : 1;
9834             /* Guard against malformed utf8 */
9835             if (RExC_parse >= endchar) {
9836                 RExC_parse = endchar;
9837             }
9838             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9839         }
9840
9841         if (in_char_class && has_multiple_chars) {
9842             if (strict) {
9843                 RExC_parse = endbrace;
9844                 vFAIL("\\N{} in character class restricted to one character");
9845             }
9846             else {
9847                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9848             }
9849         }
9850
9851         RExC_parse = endbrace + 1;
9852     }
9853     else if (! node_p || ! has_multiple_chars) {
9854
9855         /* Here, the input is legal, but not according to the caller's
9856          * options.  We fail without advancing the parse, so that the
9857          * caller can try again */
9858         RExC_parse = p;
9859         return FALSE;
9860     }
9861     else {
9862
9863         /* What is done here is to convert this to a sub-pattern of the form
9864          * (?:\x{char1}\x{char2}...)
9865          * and then call reg recursively.  That way, it retains its atomicness,
9866          * while not having to worry about special handling that some code
9867          * points may have.  toke.c has converted the original Unicode values
9868          * to native, so that we can just pass on the hex values unchanged.  We
9869          * do have to set a flag to keep recoding from happening in the
9870          * recursion */
9871
9872         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9873         STRLEN len;
9874         char *orig_end = RExC_end;
9875         I32 flags;
9876
9877         while (RExC_parse < endbrace) {
9878
9879             /* Convert to notation the rest of the code understands */
9880             sv_catpv(substitute_parse, "\\x{");
9881             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9882             sv_catpv(substitute_parse, "}");
9883
9884             /* Point to the beginning of the next character in the sequence. */
9885             RExC_parse = endchar + 1;
9886             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9887         }
9888         sv_catpv(substitute_parse, ")");
9889
9890         RExC_parse = SvPV(substitute_parse, len);
9891
9892         /* Don't allow empty number */
9893         if (len < 8) {
9894             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9895         }
9896         RExC_end = RExC_parse + len;
9897
9898         /* The values are Unicode, and therefore not subject to recoding */
9899         RExC_override_recoding = 1;
9900
9901         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9902         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9903
9904         RExC_parse = endbrace;
9905         RExC_end = orig_end;
9906         RExC_override_recoding = 0;
9907
9908         nextchar(pRExC_state);
9909     }
9910
9911     return TRUE;
9912 }
9913
9914
9915 /*
9916  * reg_recode
9917  *
9918  * It returns the code point in utf8 for the value in *encp.
9919  *    value: a code value in the source encoding
9920  *    encp:  a pointer to an Encode object
9921  *
9922  * If the result from Encode is not a single character,
9923  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9924  */
9925 STATIC UV
9926 S_reg_recode(pTHX_ const char value, SV **encp)
9927 {
9928     STRLEN numlen = 1;
9929     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9930     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9931     const STRLEN newlen = SvCUR(sv);
9932     UV uv = UNICODE_REPLACEMENT;
9933
9934     PERL_ARGS_ASSERT_REG_RECODE;
9935
9936     if (newlen)
9937         uv = SvUTF8(sv)
9938              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9939              : *(U8*)s;
9940
9941     if (!newlen || numlen != newlen) {
9942         uv = UNICODE_REPLACEMENT;
9943         *encp = NULL;
9944     }
9945     return uv;
9946 }
9947
9948 PERL_STATIC_INLINE U8
9949 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9950 {
9951     U8 op;
9952
9953     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9954
9955     if (! FOLD) {
9956         return EXACT;
9957     }
9958
9959     op = get_regex_charset(RExC_flags);
9960     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9961         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9962                  been, so there is no hole */
9963     }
9964
9965     return op + EXACTF;
9966 }
9967
9968 PERL_STATIC_INLINE void
9969 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9970 {
9971     /* This knows the details about sizing an EXACTish node, setting flags for
9972      * it (by setting <*flagp>, and potentially populating it with a single
9973      * character.
9974      *
9975      * If <len> (the length in bytes) is non-zero, this function assumes that
9976      * the node has already been populated, and just does the sizing.  In this
9977      * case <code_point> should be the final code point that has already been
9978      * placed into the node.  This value will be ignored except that under some
9979      * circumstances <*flagp> is set based on it.
9980      *
9981      * If <len> is zero, the function assumes that the node is to contain only
9982      * the single character given by <code_point> and calculates what <len>
9983      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9984      * additionally will populate the node's STRING with <code_point>, if <len>
9985      * is 0.  In both cases <*flagp> is appropriately set
9986      *
9987      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9988      * folded (the latter only when the rules indicate it can match 'ss') */
9989
9990     bool len_passed_in = cBOOL(len != 0);
9991     U8 character[UTF8_MAXBYTES_CASE+1];
9992
9993     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9994
9995     if (! len_passed_in) {
9996         if (UTF) {
9997             if (FOLD) {
9998                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9999             }
10000             else {
10001                 uvchr_to_utf8( character, code_point);
10002                 len = UTF8SKIP(character);
10003             }
10004         }
10005         else if (! FOLD
10006                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10007                  || ASCII_FOLD_RESTRICTED
10008                  || ! AT_LEAST_UNI_SEMANTICS)
10009         {
10010             *character = (U8) code_point;
10011             len = 1;
10012         }
10013         else {
10014             *character = 's';
10015             *(character + 1) = 's';
10016             len = 2;
10017         }
10018     }
10019
10020     if (SIZE_ONLY) {
10021         RExC_size += STR_SZ(len);
10022     }
10023     else {
10024         RExC_emit += STR_SZ(len);
10025         STR_LEN(node) = len;
10026         if (! len_passed_in) {
10027             Copy((char *) character, STRING(node), len, char);
10028         }
10029     }
10030
10031     *flagp |= HASWIDTH;
10032
10033     /* A single character node is SIMPLE, except for the special-cased SHARP S
10034      * under /di. */
10035     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10036         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10037             || ! FOLD || ! DEPENDS_SEMANTICS))
10038     {
10039         *flagp |= SIMPLE;
10040     }
10041 }
10042
10043 /*
10044  - regatom - the lowest level
10045
10046    Try to identify anything special at the start of the pattern. If there
10047    is, then handle it as required. This may involve generating a single regop,
10048    such as for an assertion; or it may involve recursing, such as to
10049    handle a () structure.
10050
10051    If the string doesn't start with something special then we gobble up
10052    as much literal text as we can.
10053
10054    Once we have been able to handle whatever type of thing started the
10055    sequence, we return.
10056
10057    Note: we have to be careful with escapes, as they can be both literal
10058    and special, and in the case of \10 and friends, context determines which.
10059
10060    A summary of the code structure is:
10061
10062    switch (first_byte) {
10063         cases for each special:
10064             handle this special;
10065             break;
10066         case '\\':
10067             switch (2nd byte) {
10068                 cases for each unambiguous special:
10069                     handle this special;
10070                     break;
10071                 cases for each ambigous special/literal:
10072                     disambiguate;
10073                     if (special)  handle here
10074                     else goto defchar;
10075                 default: // unambiguously literal:
10076                     goto defchar;
10077             }
10078         default:  // is a literal char
10079             // FALL THROUGH
10080         defchar:
10081             create EXACTish node for literal;
10082             while (more input and node isn't full) {
10083                 switch (input_byte) {
10084                    cases for each special;
10085                        make sure parse pointer is set so that the next call to
10086                            regatom will see this special first
10087                        goto loopdone; // EXACTish node terminated by prev. char
10088                    default:
10089                        append char to EXACTISH node;
10090                 }
10091                 get next input byte;
10092             }
10093         loopdone:
10094    }
10095    return the generated node;
10096
10097    Specifically there are two separate switches for handling
10098    escape sequences, with the one for handling literal escapes requiring
10099    a dummy entry for all of the special escapes that are actually handled
10100    by the other.
10101 */
10102
10103 STATIC regnode *
10104 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10105 {
10106     dVAR;
10107     regnode *ret = NULL;
10108     I32 flags = 0;
10109     char *parse_start = RExC_parse;
10110     U8 op;
10111     int invert = 0;
10112
10113     GET_RE_DEBUG_FLAGS_DECL;
10114
10115     *flagp = WORST;             /* Tentatively. */
10116
10117     DEBUG_PARSE("atom");
10118
10119     PERL_ARGS_ASSERT_REGATOM;
10120
10121 tryagain:
10122     switch ((U8)*RExC_parse) {
10123     case '^':
10124         RExC_seen_zerolen++;
10125         nextchar(pRExC_state);
10126         if (RExC_flags & RXf_PMf_MULTILINE)
10127             ret = reg_node(pRExC_state, MBOL);
10128         else if (RExC_flags & RXf_PMf_SINGLELINE)
10129             ret = reg_node(pRExC_state, SBOL);
10130         else
10131             ret = reg_node(pRExC_state, BOL);
10132         Set_Node_Length(ret, 1); /* MJD */
10133         break;
10134     case '$':
10135         nextchar(pRExC_state);
10136         if (*RExC_parse)
10137             RExC_seen_zerolen++;
10138         if (RExC_flags & RXf_PMf_MULTILINE)
10139             ret = reg_node(pRExC_state, MEOL);
10140         else if (RExC_flags & RXf_PMf_SINGLELINE)
10141             ret = reg_node(pRExC_state, SEOL);
10142         else
10143             ret = reg_node(pRExC_state, EOL);
10144         Set_Node_Length(ret, 1); /* MJD */
10145         break;
10146     case '.':
10147         nextchar(pRExC_state);
10148         if (RExC_flags & RXf_PMf_SINGLELINE)
10149             ret = reg_node(pRExC_state, SANY);
10150         else
10151             ret = reg_node(pRExC_state, REG_ANY);
10152         *flagp |= HASWIDTH|SIMPLE;
10153         RExC_naughty++;
10154         Set_Node_Length(ret, 1); /* MJD */
10155         break;
10156     case '[':
10157     {
10158         char * const oregcomp_parse = ++RExC_parse;
10159         ret = regclass(pRExC_state, flagp,depth+1,
10160                        FALSE, /* means parse the whole char class */
10161                        TRUE, /* allow multi-char folds */
10162                        FALSE, /* don't silence non-portable warnings. */
10163                        NULL);
10164         if (*RExC_parse != ']') {
10165             RExC_parse = oregcomp_parse;
10166             vFAIL("Unmatched [");
10167         }
10168         nextchar(pRExC_state);
10169         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10170         break;
10171     }
10172     case '(':
10173         nextchar(pRExC_state);
10174         ret = reg(pRExC_state, 1, &flags,depth+1);
10175         if (ret == NULL) {
10176                 if (flags & TRYAGAIN) {
10177                     if (RExC_parse == RExC_end) {
10178                          /* Make parent create an empty node if needed. */
10179                         *flagp |= TRYAGAIN;
10180                         return(NULL);
10181                     }
10182                     goto tryagain;
10183                 }
10184                 return(NULL);
10185         }
10186         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10187         break;
10188     case '|':
10189     case ')':
10190         if (flags & TRYAGAIN) {
10191             *flagp |= TRYAGAIN;
10192             return NULL;
10193         }
10194         vFAIL("Internal urp");
10195                                 /* Supposed to be caught earlier. */
10196         break;
10197     case '{':
10198         if (!regcurly(RExC_parse, FALSE)) {
10199             RExC_parse++;
10200             goto defchar;
10201         }
10202         /* FALL THROUGH */
10203     case '?':
10204     case '+':
10205     case '*':
10206         RExC_parse++;
10207         vFAIL("Quantifier follows nothing");
10208         break;
10209     case '\\':
10210         /* Special Escapes
10211
10212            This switch handles escape sequences that resolve to some kind
10213            of special regop and not to literal text. Escape sequnces that
10214            resolve to literal text are handled below in the switch marked
10215            "Literal Escapes".
10216
10217            Every entry in this switch *must* have a corresponding entry
10218            in the literal escape switch. However, the opposite is not
10219            required, as the default for this switch is to jump to the
10220            literal text handling code.
10221         */
10222         switch ((U8)*++RExC_parse) {
10223             U8 arg;
10224         /* Special Escapes */
10225         case 'A':
10226             RExC_seen_zerolen++;
10227             ret = reg_node(pRExC_state, SBOL);
10228             *flagp |= SIMPLE;
10229             goto finish_meta_pat;
10230         case 'G':
10231             ret = reg_node(pRExC_state, GPOS);
10232             RExC_seen |= REG_SEEN_GPOS;
10233             *flagp |= SIMPLE;
10234             goto finish_meta_pat;
10235         case 'K':
10236             RExC_seen_zerolen++;
10237             ret = reg_node(pRExC_state, KEEPS);
10238             *flagp |= SIMPLE;
10239             /* XXX:dmq : disabling in-place substitution seems to
10240              * be necessary here to avoid cases of memory corruption, as
10241              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10242              */
10243             RExC_seen |= REG_SEEN_LOOKBEHIND;
10244             goto finish_meta_pat;
10245         case 'Z':
10246             ret = reg_node(pRExC_state, SEOL);
10247             *flagp |= SIMPLE;
10248             RExC_seen_zerolen++;                /* Do not optimize RE away */
10249             goto finish_meta_pat;
10250         case 'z':
10251             ret = reg_node(pRExC_state, EOS);
10252             *flagp |= SIMPLE;
10253             RExC_seen_zerolen++;                /* Do not optimize RE away */
10254             goto finish_meta_pat;
10255         case 'C':
10256             ret = reg_node(pRExC_state, CANY);
10257             RExC_seen |= REG_SEEN_CANY;
10258             *flagp |= HASWIDTH|SIMPLE;
10259             goto finish_meta_pat;
10260         case 'X':
10261             ret = reg_node(pRExC_state, CLUMP);
10262             *flagp |= HASWIDTH;
10263             goto finish_meta_pat;
10264
10265         case 'W':
10266             invert = 1;
10267             /* FALLTHROUGH */
10268         case 'w':
10269             arg = ANYOF_WORDCHAR;
10270             goto join_posix;
10271
10272         case 'b':
10273             RExC_seen_zerolen++;
10274             RExC_seen |= REG_SEEN_LOOKBEHIND;
10275             op = BOUND + get_regex_charset(RExC_flags);
10276             if (op > BOUNDA) {  /* /aa is same as /a */
10277                 op = BOUNDA;
10278             }
10279             ret = reg_node(pRExC_state, op);
10280             FLAGS(ret) = get_regex_charset(RExC_flags);
10281             *flagp |= SIMPLE;
10282             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10283                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10284             }
10285             goto finish_meta_pat;
10286         case 'B':
10287             RExC_seen_zerolen++;
10288             RExC_seen |= REG_SEEN_LOOKBEHIND;
10289             op = NBOUND + get_regex_charset(RExC_flags);
10290             if (op > NBOUNDA) { /* /aa is same as /a */
10291                 op = NBOUNDA;
10292             }
10293             ret = reg_node(pRExC_state, op);
10294             FLAGS(ret) = get_regex_charset(RExC_flags);
10295             *flagp |= SIMPLE;
10296             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10297                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10298             }
10299             goto finish_meta_pat;
10300
10301         case 'D':
10302             invert = 1;
10303             /* FALLTHROUGH */
10304         case 'd':
10305             arg = ANYOF_DIGIT;
10306             goto join_posix;
10307
10308         case 'R':
10309             ret = reg_node(pRExC_state, LNBREAK);
10310             *flagp |= HASWIDTH|SIMPLE;
10311             goto finish_meta_pat;
10312
10313         case 'H':
10314             invert = 1;
10315             /* FALLTHROUGH */
10316         case 'h':
10317             arg = ANYOF_BLANK;
10318             op = POSIXU;
10319             goto join_posix_op_known;
10320
10321         case 'V':
10322             invert = 1;
10323             /* FALLTHROUGH */
10324         case 'v':
10325             arg = ANYOF_VERTWS;
10326             op = POSIXU;
10327             goto join_posix_op_known;
10328
10329         case 'S':
10330             invert = 1;
10331             /* FALLTHROUGH */
10332         case 's':
10333             arg = ANYOF_SPACE;
10334
10335         join_posix:
10336
10337             op = POSIXD + get_regex_charset(RExC_flags);
10338             if (op > POSIXA) {  /* /aa is same as /a */
10339                 op = POSIXA;
10340             }
10341
10342         join_posix_op_known:
10343
10344             if (invert) {
10345                 op += NPOSIXD - POSIXD;
10346             }
10347
10348             ret = reg_node(pRExC_state, op);
10349             if (! SIZE_ONLY) {
10350                 FLAGS(ret) = namedclass_to_classnum(arg);
10351             }
10352
10353             *flagp |= HASWIDTH|SIMPLE;
10354             /* FALL THROUGH */
10355
10356          finish_meta_pat:
10357             nextchar(pRExC_state);
10358             Set_Node_Length(ret, 2); /* MJD */
10359             break;
10360         case 'p':
10361         case 'P':
10362             {
10363 #ifdef DEBUGGING
10364                 char* parse_start = RExC_parse - 2;
10365 #endif
10366
10367                 RExC_parse--;
10368
10369                 ret = regclass(pRExC_state, flagp,depth+1,
10370                                TRUE, /* means just parse this element */
10371                                FALSE, /* don't allow multi-char folds */
10372                                FALSE, /* don't silence non-portable warnings.
10373                                          It would be a bug if these returned
10374                                          non-portables */
10375                                NULL);
10376
10377                 RExC_parse--;
10378
10379                 Set_Node_Offset(ret, parse_start + 2);
10380                 Set_Node_Cur_Length(ret);
10381                 nextchar(pRExC_state);
10382             }
10383             break;
10384         case 'N':
10385             /* Handle \N and \N{NAME} with multiple code points here and not
10386              * below because it can be multicharacter. join_exact() will join
10387              * them up later on.  Also this makes sure that things like
10388              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10389              * The options to the grok function call causes it to fail if the
10390              * sequence is just a single code point.  We then go treat it as
10391              * just another character in the current EXACT node, and hence it
10392              * gets uniform treatment with all the other characters.  The
10393              * special treatment for quantifiers is not needed for such single
10394              * character sequences */
10395             ++RExC_parse;
10396             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10397                                 FALSE /* not strict */ )) {
10398                 RExC_parse--;
10399                 goto defchar;
10400             }
10401             break;
10402         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10403         parse_named_seq:
10404         {
10405             char ch= RExC_parse[1];
10406             if (ch != '<' && ch != '\'' && ch != '{') {
10407                 RExC_parse++;
10408                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10409             } else {
10410                 /* this pretty much dupes the code for (?P=...) in reg(), if
10411                    you change this make sure you change that */
10412                 char* name_start = (RExC_parse += 2);
10413                 U32 num = 0;
10414                 SV *sv_dat = reg_scan_name(pRExC_state,
10415                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10416                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10417                 if (RExC_parse == name_start || *RExC_parse != ch)
10418                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10419
10420                 if (!SIZE_ONLY) {
10421                     num = add_data( pRExC_state, 1, "S" );
10422                     RExC_rxi->data->data[num]=(void*)sv_dat;
10423                     SvREFCNT_inc_simple_void(sv_dat);
10424                 }
10425
10426                 RExC_sawback = 1;
10427                 ret = reganode(pRExC_state,
10428                                ((! FOLD)
10429                                  ? NREF
10430                                  : (ASCII_FOLD_RESTRICTED)
10431                                    ? NREFFA
10432                                    : (AT_LEAST_UNI_SEMANTICS)
10433                                      ? NREFFU
10434                                      : (LOC)
10435                                        ? NREFFL
10436                                        : NREFF),
10437                                 num);
10438                 *flagp |= HASWIDTH;
10439
10440                 /* override incorrect value set in reganode MJD */
10441                 Set_Node_Offset(ret, parse_start+1);
10442                 Set_Node_Cur_Length(ret); /* MJD */
10443                 nextchar(pRExC_state);
10444
10445             }
10446             break;
10447         }
10448         case 'g':
10449         case '1': case '2': case '3': case '4':
10450         case '5': case '6': case '7': case '8': case '9':
10451             {
10452                 I32 num;
10453                 bool isg = *RExC_parse == 'g';
10454                 bool isrel = 0;
10455                 bool hasbrace = 0;
10456                 if (isg) {
10457                     RExC_parse++;
10458                     if (*RExC_parse == '{') {
10459                         RExC_parse++;
10460                         hasbrace = 1;
10461                     }
10462                     if (*RExC_parse == '-') {
10463                         RExC_parse++;
10464                         isrel = 1;
10465                     }
10466                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10467                         if (isrel) RExC_parse--;
10468                         RExC_parse -= 2;
10469                         goto parse_named_seq;
10470                 }   }
10471                 num = atoi(RExC_parse);
10472                 if (isg && num == 0)
10473                     vFAIL("Reference to invalid group 0");
10474                 if (isrel) {
10475                     num = RExC_npar - num;
10476                     if (num < 1)
10477                         vFAIL("Reference to nonexistent or unclosed group");
10478                 }
10479                 if (!isg && num > 9 && num >= RExC_npar)
10480                     /* Probably a character specified in octal, e.g. \35 */
10481                     goto defchar;
10482                 else {
10483                     char * const parse_start = RExC_parse - 1; /* MJD */
10484                     while (isDIGIT(*RExC_parse))
10485                         RExC_parse++;
10486                     if (parse_start == RExC_parse - 1)
10487                         vFAIL("Unterminated \\g... pattern");
10488                     if (hasbrace) {
10489                         if (*RExC_parse != '}')
10490                             vFAIL("Unterminated \\g{...} pattern");
10491                         RExC_parse++;
10492                     }
10493                     if (!SIZE_ONLY) {
10494                         if (num > (I32)RExC_rx->nparens)
10495                             vFAIL("Reference to nonexistent group");
10496                     }
10497                     RExC_sawback = 1;
10498                     ret = reganode(pRExC_state,
10499                                    ((! FOLD)
10500                                      ? REF
10501                                      : (ASCII_FOLD_RESTRICTED)
10502                                        ? REFFA
10503                                        : (AT_LEAST_UNI_SEMANTICS)
10504                                          ? REFFU
10505                                          : (LOC)
10506                                            ? REFFL
10507                                            : REFF),
10508                                     num);
10509                     *flagp |= HASWIDTH;
10510
10511                     /* override incorrect value set in reganode MJD */
10512                     Set_Node_Offset(ret, parse_start+1);
10513                     Set_Node_Cur_Length(ret); /* MJD */
10514                     RExC_parse--;
10515                     nextchar(pRExC_state);
10516                 }
10517             }
10518             break;
10519         case '\0':
10520             if (RExC_parse >= RExC_end)
10521                 FAIL("Trailing \\");
10522             /* FALL THROUGH */
10523         default:
10524             /* Do not generate "unrecognized" warnings here, we fall
10525                back into the quick-grab loop below */
10526             parse_start--;
10527             goto defchar;
10528         }
10529         break;
10530
10531     case '#':
10532         if (RExC_flags & RXf_PMf_EXTENDED) {
10533             if ( reg_skipcomment( pRExC_state ) )
10534                 goto tryagain;
10535         }
10536         /* FALL THROUGH */
10537
10538     default:
10539
10540             parse_start = RExC_parse - 1;
10541
10542             RExC_parse++;
10543
10544         defchar: {
10545             STRLEN len = 0;
10546             UV ender;
10547             char *p;
10548             char *s;
10549 #define MAX_NODE_STRING_SIZE 127
10550             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10551             char *s0;
10552             U8 upper_parse = MAX_NODE_STRING_SIZE;
10553             STRLEN foldlen;
10554             U8 node_type;
10555             bool next_is_quantifier;
10556             char * oldp = NULL;
10557
10558             /* If a folding node contains only code points that don't
10559              * participate in folds, it can be changed into an EXACT node,
10560              * which allows the optimizer more things to look for */
10561             bool maybe_exact;
10562
10563             ender = 0;
10564             node_type = compute_EXACTish(pRExC_state);
10565             ret = reg_node(pRExC_state, node_type);
10566
10567             /* In pass1, folded, we use a temporary buffer instead of the
10568              * actual node, as the node doesn't exist yet */
10569             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10570
10571             s0 = s;
10572
10573         reparse:
10574
10575             /* We do the EXACTFish to EXACT node only if folding, and not if in
10576              * locale, as whether a character folds or not isn't known until
10577              * runtime */
10578             maybe_exact = FOLD && ! LOC;
10579
10580             /* XXX The node can hold up to 255 bytes, yet this only goes to
10581              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10582              * 255 allows us to not have to worry about overflow due to
10583              * converting to utf8 and fold expansion, but that value is
10584              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10585              * split up by this limit into a single one using the real max of
10586              * 255.  Even at 127, this breaks under rare circumstances.  If
10587              * folding, we do not want to split a node at a character that is a
10588              * non-final in a multi-char fold, as an input string could just
10589              * happen to want to match across the node boundary.  The join
10590              * would solve that problem if the join actually happens.  But a
10591              * series of more than two nodes in a row each of 127 would cause
10592              * the first join to succeed to get to 254, but then there wouldn't
10593              * be room for the next one, which could at be one of those split
10594              * multi-char folds.  I don't know of any fool-proof solution.  One
10595              * could back off to end with only a code point that isn't such a
10596              * non-final, but it is possible for there not to be any in the
10597              * entire node. */
10598             for (p = RExC_parse - 1;
10599                  len < upper_parse && p < RExC_end;
10600                  len++)
10601             {
10602                 oldp = p;
10603
10604                 if (RExC_flags & RXf_PMf_EXTENDED)
10605                     p = regwhite( pRExC_state, p );
10606                 switch ((U8)*p) {
10607                 case '^':
10608                 case '$':
10609                 case '.':
10610                 case '[':
10611                 case '(':
10612                 case ')':
10613                 case '|':
10614                     goto loopdone;
10615                 case '\\':
10616                     /* Literal Escapes Switch
10617
10618                        This switch is meant to handle escape sequences that
10619                        resolve to a literal character.
10620
10621                        Every escape sequence that represents something
10622                        else, like an assertion or a char class, is handled
10623                        in the switch marked 'Special Escapes' above in this
10624                        routine, but also has an entry here as anything that
10625                        isn't explicitly mentioned here will be treated as
10626                        an unescaped equivalent literal.
10627                     */
10628
10629                     switch ((U8)*++p) {
10630                     /* These are all the special escapes. */
10631                     case 'A':             /* Start assertion */
10632                     case 'b': case 'B':   /* Word-boundary assertion*/
10633                     case 'C':             /* Single char !DANGEROUS! */
10634                     case 'd': case 'D':   /* digit class */
10635                     case 'g': case 'G':   /* generic-backref, pos assertion */
10636                     case 'h': case 'H':   /* HORIZWS */
10637                     case 'k': case 'K':   /* named backref, keep marker */
10638                     case 'p': case 'P':   /* Unicode property */
10639                               case 'R':   /* LNBREAK */
10640                     case 's': case 'S':   /* space class */
10641                     case 'v': case 'V':   /* VERTWS */
10642                     case 'w': case 'W':   /* word class */
10643                     case 'X':             /* eXtended Unicode "combining character sequence" */
10644                     case 'z': case 'Z':   /* End of line/string assertion */
10645                         --p;
10646                         goto loopdone;
10647
10648                     /* Anything after here is an escape that resolves to a
10649                        literal. (Except digits, which may or may not)
10650                      */
10651                     case 'n':
10652                         ender = '\n';
10653                         p++;
10654                         break;
10655                     case 'N': /* Handle a single-code point named character. */
10656                         /* The options cause it to fail if a multiple code
10657                          * point sequence.  Handle those in the switch() above
10658                          * */
10659                         RExC_parse = p + 1;
10660                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10661                                             flagp, depth, FALSE,
10662                                             FALSE /* not strict */ ))
10663                         {
10664                             RExC_parse = p = oldp;
10665                             goto loopdone;
10666                         }
10667                         p = RExC_parse;
10668                         if (ender > 0xff) {
10669                             REQUIRE_UTF8;
10670                         }
10671                         break;
10672                     case 'r':
10673                         ender = '\r';
10674                         p++;
10675                         break;
10676                     case 't':
10677                         ender = '\t';
10678                         p++;
10679                         break;
10680                     case 'f':
10681                         ender = '\f';
10682                         p++;
10683                         break;
10684                     case 'e':
10685                           ender = ASCII_TO_NATIVE('\033');
10686                         p++;
10687                         break;
10688                     case 'a':
10689                           ender = ASCII_TO_NATIVE('\007');
10690                         p++;
10691                         break;
10692                     case 'o':
10693                         {
10694                             UV result;
10695                             const char* error_msg;
10696
10697                             bool valid = grok_bslash_o(&p,
10698                                                        &result,
10699                                                        &error_msg,
10700                                                        TRUE, /* out warnings */
10701                                                        FALSE, /* not strict */
10702                                                        TRUE, /* Output warnings
10703                                                                 for non-
10704                                                                 portables */
10705                                                        UTF);
10706                             if (! valid) {
10707                                 RExC_parse = p; /* going to die anyway; point
10708                                                    to exact spot of failure */
10709                                 vFAIL(error_msg);
10710                             }
10711                             ender = result;
10712                             if (PL_encoding && ender < 0x100) {
10713                                 goto recode_encoding;
10714                             }
10715                             if (ender > 0xff) {
10716                                 REQUIRE_UTF8;
10717                             }
10718                             break;
10719                         }
10720                     case 'x':
10721                         {
10722                             UV result = UV_MAX; /* initialize to erroneous
10723                                                    value */
10724                             const char* error_msg;
10725
10726                             bool valid = grok_bslash_x(&p,
10727                                                        &result,
10728                                                        &error_msg,
10729                                                        TRUE, /* out warnings */
10730                                                        FALSE, /* not strict */
10731                                                        TRUE, /* Output warnings
10732                                                                 for non-
10733                                                                 portables */
10734                                                        UTF);
10735                             if (! valid) {
10736                                 RExC_parse = p; /* going to die anyway; point
10737                                                    to exact spot of failure */
10738                                 vFAIL(error_msg);
10739                             }
10740                             ender = result;
10741
10742                             if (PL_encoding && ender < 0x100) {
10743                                 goto recode_encoding;
10744                             }
10745                             if (ender > 0xff) {
10746                                 REQUIRE_UTF8;
10747                             }
10748                             break;
10749                         }
10750                     case 'c':
10751                         p++;
10752                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10753                         break;
10754                     case '0': case '1': case '2': case '3':case '4':
10755                     case '5': case '6': case '7':
10756                         if (*p == '0' ||
10757                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10758                         {
10759                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10760                             STRLEN numlen = 3;
10761                             ender = grok_oct(p, &numlen, &flags, NULL);
10762                             if (ender > 0xff) {
10763                                 REQUIRE_UTF8;
10764                             }
10765                             p += numlen;
10766                             if (SIZE_ONLY   /* like \08, \178 */
10767                                 && numlen < 3
10768                                 && p < RExC_end
10769                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10770                             {
10771                                 reg_warn_non_literal_string(
10772                                          p + 1,
10773                                          form_short_octal_warning(p, numlen));
10774                             }
10775                         }
10776                         else {  /* Not to be treated as an octal constant, go
10777                                    find backref */
10778                             --p;
10779                             goto loopdone;
10780                         }
10781                         if (PL_encoding && ender < 0x100)
10782                             goto recode_encoding;
10783                         break;
10784                     recode_encoding:
10785                         if (! RExC_override_recoding) {
10786                             SV* enc = PL_encoding;
10787                             ender = reg_recode((const char)(U8)ender, &enc);
10788                             if (!enc && SIZE_ONLY)
10789                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10790                             REQUIRE_UTF8;
10791                         }
10792                         break;
10793                     case '\0':
10794                         if (p >= RExC_end)
10795                             FAIL("Trailing \\");
10796                         /* FALL THROUGH */
10797                     default:
10798                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10799                             /* Include any { following the alpha to emphasize
10800                              * that it could be part of an escape at some point
10801                              * in the future */
10802                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10803                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10804                         }
10805                         goto normal_default;
10806                     } /* End of switch on '\' */
10807                     break;
10808                 default:    /* A literal character */
10809
10810                     if (! SIZE_ONLY
10811                         && RExC_flags & RXf_PMf_EXTENDED
10812                         && ckWARN(WARN_DEPRECATED)
10813                         && is_PATWS_non_low(p, UTF))
10814                     {
10815                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10816                                 "Escape literal pattern white space under /x");
10817                     }
10818
10819                   normal_default:
10820                     if (UTF8_IS_START(*p) && UTF) {
10821                         STRLEN numlen;
10822                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10823                                                &numlen, UTF8_ALLOW_DEFAULT);
10824                         p += numlen;
10825                     }
10826                     else
10827                         ender = (U8) *p++;
10828                     break;
10829                 } /* End of switch on the literal */
10830
10831                 /* Here, have looked at the literal character and <ender>
10832                  * contains its ordinal, <p> points to the character after it
10833                  */
10834
10835                 if ( RExC_flags & RXf_PMf_EXTENDED)
10836                     p = regwhite( pRExC_state, p );
10837
10838                 /* If the next thing is a quantifier, it applies to this
10839                  * character only, which means that this character has to be in
10840                  * its own node and can't just be appended to the string in an
10841                  * existing node, so if there are already other characters in
10842                  * the node, close the node with just them, and set up to do
10843                  * this character again next time through, when it will be the
10844                  * only thing in its new node */
10845                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10846                 {
10847                     p = oldp;
10848                     goto loopdone;
10849                 }
10850
10851                 if (FOLD) {
10852                     if (UTF
10853                             /* See comments for join_exact() as to why we fold
10854                              * this non-UTF at compile time */
10855                         || (node_type == EXACTFU
10856                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10857                     {
10858
10859
10860                         /* Prime the casefolded buffer.  Locale rules, which
10861                          * apply only to code points < 256, aren't known until
10862                          * execution, so for them, just output the original
10863                          * character using utf8.  If we start to fold non-UTF
10864                          * patterns, be sure to update join_exact() */
10865                         if (LOC && ender < 256) {
10866                             if (UNI_IS_INVARIANT(ender)) {
10867                                 *s = (U8) ender;
10868                                 foldlen = 1;
10869                             } else {
10870                                 *s = UTF8_TWO_BYTE_HI(ender);
10871                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10872                                 foldlen = 2;
10873                             }
10874                         }
10875                         else {
10876                             UV folded = _to_uni_fold_flags(
10877                                            ender,
10878                                            (U8 *) s,
10879                                            &foldlen,
10880                                            FOLD_FLAGS_FULL
10881                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10882                                                     : (ASCII_FOLD_RESTRICTED)
10883                                                       ? FOLD_FLAGS_NOMIX_ASCII
10884                                                       : 0)
10885                                             );
10886
10887                             /* If this node only contains non-folding code
10888                              * points so far, see if this new one is also
10889                              * non-folding */
10890                             if (maybe_exact) {
10891                                 if (folded != ender) {
10892                                     maybe_exact = FALSE;
10893                                 }
10894                                 else {
10895                                     /* Here the fold is the original; we have
10896                                      * to check further to see if anything
10897                                      * folds to it */
10898                                     if (! PL_utf8_foldable) {
10899                                         SV* swash = swash_init("utf8",
10900                                                            "_Perl_Any_Folds",
10901                                                            &PL_sv_undef, 1, 0);
10902                                         PL_utf8_foldable =
10903                                                     _get_swash_invlist(swash);
10904                                         SvREFCNT_dec_NN(swash);
10905                                     }
10906                                     if (_invlist_contains_cp(PL_utf8_foldable,
10907                                                              ender))
10908                                     {
10909                                         maybe_exact = FALSE;
10910                                     }
10911                                 }
10912                             }
10913                             ender = folded;
10914                         }
10915                         s += foldlen;
10916
10917                         /* The loop increments <len> each time, as all but this
10918                          * path (and the one just below for UTF) through it add
10919                          * a single byte to the EXACTish node.  But this one
10920                          * has changed len to be the correct final value, so
10921                          * subtract one to cancel out the increment that
10922                          * follows */
10923                         len += foldlen - 1;
10924                     }
10925                     else {
10926                         *(s++) = (char) ender;
10927                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10928                     }
10929                 }
10930                 else if (UTF) {
10931                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10932                     if (unilen > 0) {
10933                        s   += unilen;
10934                        len += unilen;
10935                     }
10936
10937                     /* See comment just above for - 1 */
10938                     len--;
10939                 }
10940                 else {
10941                     REGC((char)ender, s++);
10942                 }
10943
10944                 if (next_is_quantifier) {
10945
10946                     /* Here, the next input is a quantifier, and to get here,
10947                      * the current character is the only one in the node.
10948                      * Also, here <len> doesn't include the final byte for this
10949                      * character */
10950                     len++;
10951                     goto loopdone;
10952                 }
10953
10954             } /* End of loop through literal characters */
10955
10956             /* Here we have either exhausted the input or ran out of room in
10957              * the node.  (If we encountered a character that can't be in the
10958              * node, transfer is made directly to <loopdone>, and so we
10959              * wouldn't have fallen off the end of the loop.)  In the latter
10960              * case, we artificially have to split the node into two, because
10961              * we just don't have enough space to hold everything.  This
10962              * creates a problem if the final character participates in a
10963              * multi-character fold in the non-final position, as a match that
10964              * should have occurred won't, due to the way nodes are matched,
10965              * and our artificial boundary.  So back off until we find a non-
10966              * problematic character -- one that isn't at the beginning or
10967              * middle of such a fold.  (Either it doesn't participate in any
10968              * folds, or appears only in the final position of all the folds it
10969              * does participate in.)  A better solution with far fewer false
10970              * positives, and that would fill the nodes more completely, would
10971              * be to actually have available all the multi-character folds to
10972              * test against, and to back-off only far enough to be sure that
10973              * this node isn't ending with a partial one.  <upper_parse> is set
10974              * further below (if we need to reparse the node) to include just
10975              * up through that final non-problematic character that this code
10976              * identifies, so when it is set to less than the full node, we can
10977              * skip the rest of this */
10978             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10979
10980                 const STRLEN full_len = len;
10981
10982                 assert(len >= MAX_NODE_STRING_SIZE);
10983
10984                 /* Here, <s> points to the final byte of the final character.
10985                  * Look backwards through the string until find a non-
10986                  * problematic character */
10987
10988                 if (! UTF) {
10989
10990                     /* These two have no multi-char folds to non-UTF characters
10991                      */
10992                     if (ASCII_FOLD_RESTRICTED || LOC) {
10993                         goto loopdone;
10994                     }
10995
10996                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10997                     len = s - s0 + 1;
10998                 }
10999                 else {
11000                     if (!  PL_NonL1NonFinalFold) {
11001                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11002                                         NonL1_Perl_Non_Final_Folds_invlist);
11003                     }
11004
11005                     /* Point to the first byte of the final character */
11006                     s = (char *) utf8_hop((U8 *) s, -1);
11007
11008                     while (s >= s0) {   /* Search backwards until find
11009                                            non-problematic char */
11010                         if (UTF8_IS_INVARIANT(*s)) {
11011
11012                             /* There are no ascii characters that participate
11013                              * in multi-char folds under /aa.  In EBCDIC, the
11014                              * non-ascii invariants are all control characters,
11015                              * so don't ever participate in any folds. */
11016                             if (ASCII_FOLD_RESTRICTED
11017                                 || ! IS_NON_FINAL_FOLD(*s))
11018                             {
11019                                 break;
11020                             }
11021                         }
11022                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11023
11024                             /* No Latin1 characters participate in multi-char
11025                              * folds under /l */
11026                             if (LOC
11027                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11028                                                                 *s, *(s+1))))
11029                             {
11030                                 break;
11031                             }
11032                         }
11033                         else if (! _invlist_contains_cp(
11034                                         PL_NonL1NonFinalFold,
11035                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11036                         {
11037                             break;
11038                         }
11039
11040                         /* Here, the current character is problematic in that
11041                          * it does occur in the non-final position of some
11042                          * fold, so try the character before it, but have to
11043                          * special case the very first byte in the string, so
11044                          * we don't read outside the string */
11045                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11046                     } /* End of loop backwards through the string */
11047
11048                     /* If there were only problematic characters in the string,
11049                      * <s> will point to before s0, in which case the length
11050                      * should be 0, otherwise include the length of the
11051                      * non-problematic character just found */
11052                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11053                 }
11054
11055                 /* Here, have found the final character, if any, that is
11056                  * non-problematic as far as ending the node without splitting
11057                  * it across a potential multi-char fold.  <len> contains the
11058                  * number of bytes in the node up-to and including that
11059                  * character, or is 0 if there is no such character, meaning
11060                  * the whole node contains only problematic characters.  In
11061                  * this case, give up and just take the node as-is.  We can't
11062                  * do any better */
11063                 if (len == 0) {
11064                     len = full_len;
11065                 } else {
11066
11067                     /* Here, the node does contain some characters that aren't
11068                      * problematic.  If one such is the final character in the
11069                      * node, we are done */
11070                     if (len == full_len) {
11071                         goto loopdone;
11072                     }
11073                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11074
11075                         /* If the final character is problematic, but the
11076                          * penultimate is not, back-off that last character to
11077                          * later start a new node with it */
11078                         p = oldp;
11079                         goto loopdone;
11080                     }
11081
11082                     /* Here, the final non-problematic character is earlier
11083                      * in the input than the penultimate character.  What we do
11084                      * is reparse from the beginning, going up only as far as
11085                      * this final ok one, thus guaranteeing that the node ends
11086                      * in an acceptable character.  The reason we reparse is
11087                      * that we know how far in the character is, but we don't
11088                      * know how to correlate its position with the input parse.
11089                      * An alternate implementation would be to build that
11090                      * correlation as we go along during the original parse,
11091                      * but that would entail extra work for every node, whereas
11092                      * this code gets executed only when the string is too
11093                      * large for the node, and the final two characters are
11094                      * problematic, an infrequent occurrence.  Yet another
11095                      * possible strategy would be to save the tail of the
11096                      * string, and the next time regatom is called, initialize
11097                      * with that.  The problem with this is that unless you
11098                      * back off one more character, you won't be guaranteed
11099                      * regatom will get called again, unless regbranch,
11100                      * regpiece ... are also changed.  If you do back off that
11101                      * extra character, so that there is input guaranteed to
11102                      * force calling regatom, you can't handle the case where
11103                      * just the first character in the node is acceptable.  I
11104                      * (khw) decided to try this method which doesn't have that
11105                      * pitfall; if performance issues are found, we can do a
11106                      * combination of the current approach plus that one */
11107                     upper_parse = len;
11108                     len = 0;
11109                     s = s0;
11110                     goto reparse;
11111                 }
11112             }   /* End of verifying node ends with an appropriate char */
11113
11114         loopdone:   /* Jumped to when encounters something that shouldn't be in
11115                        the node */
11116
11117             /* If 'maybe_exact' is still set here, means there are no
11118              * code points in the node that participate in folds */
11119             if (FOLD && maybe_exact) {
11120                 OP(ret) = EXACT;
11121             }
11122
11123             /* I (khw) don't know if you can get here with zero length, but the
11124              * old code handled this situation by creating a zero-length EXACT
11125              * node.  Might as well be NOTHING instead */
11126             if (len == 0) {
11127                 OP(ret) = NOTHING;
11128             }
11129             else{
11130                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11131             }
11132
11133             RExC_parse = p - 1;
11134             Set_Node_Cur_Length(ret); /* MJD */
11135             nextchar(pRExC_state);
11136             {
11137                 /* len is STRLEN which is unsigned, need to copy to signed */
11138                 IV iv = len;
11139                 if (iv < 0)
11140                     vFAIL("Internal disaster");
11141             }
11142
11143         } /* End of label 'defchar:' */
11144         break;
11145     } /* End of giant switch on input character */
11146
11147     return(ret);
11148 }
11149
11150 STATIC char *
11151 S_regwhite( RExC_state_t *pRExC_state, char *p )
11152 {
11153     const char *e = RExC_end;
11154
11155     PERL_ARGS_ASSERT_REGWHITE;
11156
11157     while (p < e) {
11158         if (isSPACE(*p))
11159             ++p;
11160         else if (*p == '#') {
11161             bool ended = 0;
11162             do {
11163                 if (*p++ == '\n') {
11164                     ended = 1;
11165                     break;
11166                 }
11167             } while (p < e);
11168             if (!ended)
11169                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11170         }
11171         else
11172             break;
11173     }
11174     return p;
11175 }
11176
11177 STATIC char *
11178 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11179 {
11180     /* Returns the next non-pattern-white space, non-comment character (the
11181      * latter only if 'recognize_comment is true) in the string p, which is
11182      * ended by RExC_end.  If there is no line break ending a comment,
11183      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11184     const char *e = RExC_end;
11185
11186     PERL_ARGS_ASSERT_REGPATWS;
11187
11188     while (p < e) {
11189         STRLEN len;
11190         if ((len = is_PATWS_safe(p, e, UTF))) {
11191             p += len;
11192         }
11193         else if (recognize_comment && *p == '#') {
11194             bool ended = 0;
11195             do {
11196                 p++;
11197                 if (is_LNBREAK_safe(p, e, UTF)) {
11198                     ended = 1;
11199                     break;
11200                 }
11201             } while (p < e);
11202             if (!ended)
11203                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11204         }
11205         else
11206             break;
11207     }
11208     return p;
11209 }
11210
11211 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11212    Character classes ([:foo:]) can also be negated ([:^foo:]).
11213    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11214    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11215    but trigger failures because they are currently unimplemented. */
11216
11217 #define POSIXCC_DONE(c)   ((c) == ':')
11218 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11219 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11220
11221 PERL_STATIC_INLINE I32
11222 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11223                     const bool strict)
11224 {
11225     dVAR;
11226     I32 namedclass = OOB_NAMEDCLASS;
11227
11228     PERL_ARGS_ASSERT_REGPPOSIXCC;
11229
11230     if (value == '[' && RExC_parse + 1 < RExC_end &&
11231         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11232         POSIXCC(UCHARAT(RExC_parse)))
11233     {
11234         const char c = UCHARAT(RExC_parse);
11235         char* const s = RExC_parse++;
11236
11237         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11238             RExC_parse++;
11239         if (RExC_parse == RExC_end) {
11240             if (strict) {
11241
11242                 /* Try to give a better location for the error (than the end of
11243                  * the string) by looking for the matching ']' */
11244                 RExC_parse = s;
11245                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11246                     RExC_parse++;
11247                 }
11248                 vFAIL2("Unmatched '%c' in POSIX class", c);
11249             }
11250             /* Grandfather lone [:, [=, [. */
11251             RExC_parse = s;
11252         }
11253         else {
11254             const char* const t = RExC_parse++; /* skip over the c */
11255             assert(*t == c);
11256
11257             if (UCHARAT(RExC_parse) == ']') {
11258                 const char *posixcc = s + 1;
11259                 RExC_parse++; /* skip over the ending ] */
11260
11261                 if (*s == ':') {
11262                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11263                     const I32 skip = t - posixcc;
11264
11265                     /* Initially switch on the length of the name.  */
11266                     switch (skip) {
11267                     case 4:
11268                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11269                                                           this is the Perl \w
11270                                                         */
11271                             namedclass = ANYOF_WORDCHAR;
11272                         break;
11273                     case 5:
11274                         /* Names all of length 5.  */
11275                         /* alnum alpha ascii blank cntrl digit graph lower
11276                            print punct space upper  */
11277                         /* Offset 4 gives the best switch position.  */
11278                         switch (posixcc[4]) {
11279                         case 'a':
11280                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11281                                 namedclass = ANYOF_ALPHA;
11282                             break;
11283                         case 'e':
11284                             if (memEQ(posixcc, "spac", 4)) /* space */
11285                                 namedclass = ANYOF_PSXSPC;
11286                             break;
11287                         case 'h':
11288                             if (memEQ(posixcc, "grap", 4)) /* graph */
11289                                 namedclass = ANYOF_GRAPH;
11290                             break;
11291                         case 'i':
11292                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11293                                 namedclass = ANYOF_ASCII;
11294                             break;
11295                         case 'k':
11296                             if (memEQ(posixcc, "blan", 4)) /* blank */
11297                                 namedclass = ANYOF_BLANK;
11298                             break;
11299                         case 'l':
11300                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11301                                 namedclass = ANYOF_CNTRL;
11302                             break;
11303                         case 'm':
11304                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11305                                 namedclass = ANYOF_ALPHANUMERIC;
11306                             break;
11307                         case 'r':
11308                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11309                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11310                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11311                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11312                             break;
11313                         case 't':
11314                             if (memEQ(posixcc, "digi", 4)) /* digit */
11315                                 namedclass = ANYOF_DIGIT;
11316                             else if (memEQ(posixcc, "prin", 4)) /* print */
11317                                 namedclass = ANYOF_PRINT;
11318                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11319                                 namedclass = ANYOF_PUNCT;
11320                             break;
11321                         }
11322                         break;
11323                     case 6:
11324                         if (memEQ(posixcc, "xdigit", 6))
11325                             namedclass = ANYOF_XDIGIT;
11326                         break;
11327                     }
11328
11329                     if (namedclass == OOB_NAMEDCLASS)
11330                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11331                                       t - s - 1, s + 1);
11332
11333                     /* The #defines are structured so each complement is +1 to
11334                      * the normal one */
11335                     if (complement) {
11336                         namedclass++;
11337                     }
11338                     assert (posixcc[skip] == ':');
11339                     assert (posixcc[skip+1] == ']');
11340                 } else if (!SIZE_ONLY) {
11341                     /* [[=foo=]] and [[.foo.]] are still future. */
11342
11343                     /* adjust RExC_parse so the warning shows after
11344                        the class closes */
11345                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11346                         RExC_parse++;
11347                     SvREFCNT_dec(free_me);
11348                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11349                 }
11350             } else {
11351                 /* Maternal grandfather:
11352                  * "[:" ending in ":" but not in ":]" */
11353                 if (strict) {
11354                     vFAIL("Unmatched '[' in POSIX class");
11355                 }
11356
11357                 /* Grandfather lone [:, [=, [. */
11358                 RExC_parse = s;
11359             }
11360         }
11361     }
11362
11363     return namedclass;
11364 }
11365
11366 STATIC bool
11367 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11368 {
11369     /* This applies some heuristics at the current parse position (which should
11370      * be at a '[') to see if what follows might be intended to be a [:posix:]
11371      * class.  It returns true if it really is a posix class, of course, but it
11372      * also can return true if it thinks that what was intended was a posix
11373      * class that didn't quite make it.
11374      *
11375      * It will return true for
11376      *      [:alphanumerics:
11377      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11378      *                         ')' indicating the end of the (?[
11379      *      [:any garbage including %^&$ punctuation:]
11380      *
11381      * This is designed to be called only from S_handle_regex_sets; it could be
11382      * easily adapted to be called from the spot at the beginning of regclass()
11383      * that checks to see in a normal bracketed class if the surrounding []
11384      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11385      * change long-standing behavior, so I (khw) didn't do that */
11386     char* p = RExC_parse + 1;
11387     char first_char = *p;
11388
11389     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11390
11391     assert(*(p - 1) == '[');
11392
11393     if (! POSIXCC(first_char)) {
11394         return FALSE;
11395     }
11396
11397     p++;
11398     while (p < RExC_end && isWORDCHAR(*p)) p++;
11399
11400     if (p >= RExC_end) {
11401         return FALSE;
11402     }
11403
11404     if (p - RExC_parse > 2    /* Got at least 1 word character */
11405         && (*p == first_char
11406             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11407     {
11408         return TRUE;
11409     }
11410
11411     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11412
11413     return (p
11414             && p - RExC_parse > 2 /* [:] evaluates to colon;
11415                                       [::] is a bad posix class. */
11416             && first_char == *(p - 1));
11417 }
11418
11419 STATIC regnode *
11420 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11421                    char * const oregcomp_parse)
11422 {
11423     /* Handle the (?[...]) construct to do set operations */
11424
11425     U8 curchar;
11426     UV start, end;      /* End points of code point ranges */
11427     SV* result_string;
11428     char *save_end, *save_parse;
11429     SV* final;
11430     STRLEN len;
11431     regnode* node;
11432     AV* stack;
11433     const bool save_fold = FOLD;
11434
11435     GET_RE_DEBUG_FLAGS_DECL;
11436
11437     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11438
11439     if (LOC) {
11440         vFAIL("(?[...]) not valid in locale");
11441     }
11442     RExC_uni_semantics = 1;
11443
11444     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11445      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11446      * call regclass to handle '[]' so as to not have to reinvent its parsing
11447      * rules here (throwing away the size it computes each time).  And, we exit
11448      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11449      * these things, we need to realize that something preceded by a backslash
11450      * is escaped, so we have to keep track of backslashes */
11451     if (SIZE_ONLY) {
11452
11453         Perl_ck_warner_d(aTHX_
11454             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11455             "The regex_sets feature is experimental" REPORT_LOCATION,
11456             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11457
11458         while (RExC_parse < RExC_end) {
11459             SV* current = NULL;
11460             RExC_parse = regpatws(pRExC_state, RExC_parse,
11461                                 TRUE); /* means recognize comments */
11462             switch (*RExC_parse) {
11463                 default:
11464                     break;
11465                 case '\\':
11466                     /* Skip the next byte (which could cause us to end up in
11467                      * the middle of a UTF-8 character, but since none of those
11468                      * are confusable with anything we currently handle in this
11469                      * switch (invariants all), it's safe.  We'll just hit the
11470                      * default: case next time and keep on incrementing until
11471                      * we find one of the invariants we do handle. */
11472                     RExC_parse++;
11473                     break;
11474                 case '[':
11475                 {
11476                     /* If this looks like it is a [:posix:] class, leave the
11477                      * parse pointer at the '[' to fool regclass() into
11478                      * thinking it is part of a '[[:posix:]]'.  That function
11479                      * will use strict checking to force a syntax error if it
11480                      * doesn't work out to a legitimate class */
11481                     bool is_posix_class
11482                                     = could_it_be_a_POSIX_class(pRExC_state);
11483                     if (! is_posix_class) {
11484                         RExC_parse++;
11485                     }
11486
11487                     (void) regclass(pRExC_state, flagp,depth+1,
11488                                     is_posix_class, /* parse the whole char
11489                                                        class only if not a
11490                                                        posix class */
11491                                     FALSE, /* don't allow multi-char folds */
11492                                     TRUE, /* silence non-portable warnings. */
11493                                     &current);
11494                     /* function call leaves parse pointing to the ']', except
11495                      * if we faked it */
11496                     if (is_posix_class) {
11497                         RExC_parse--;
11498                     }
11499
11500                     SvREFCNT_dec(current);   /* In case it returned something */
11501                     break;
11502                 }
11503
11504                 case ']':
11505                     RExC_parse++;
11506                     if (RExC_parse < RExC_end
11507                         && *RExC_parse == ')')
11508                     {
11509                         node = reganode(pRExC_state, ANYOF, 0);
11510                         RExC_size += ANYOF_SKIP;
11511                         nextchar(pRExC_state);
11512                         Set_Node_Length(node,
11513                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11514                         return node;
11515                     }
11516                     goto no_close;
11517             }
11518             RExC_parse++;
11519         }
11520
11521         no_close:
11522         FAIL("Syntax error in (?[...])");
11523     }
11524
11525     /* Pass 2 only after this.  Everything in this construct is a
11526      * metacharacter.  Operands begin with either a '\' (for an escape
11527      * sequence), or a '[' for a bracketed character class.  Any other
11528      * character should be an operator, or parenthesis for grouping.  Both
11529      * types of operands are handled by calling regclass() to parse them.  It
11530      * is called with a parameter to indicate to return the computed inversion
11531      * list.  The parsing here is implemented via a stack.  Each entry on the
11532      * stack is a single character representing one of the operators, or the
11533      * '('; or else a pointer to an operand inversion list. */
11534
11535 #define IS_OPERAND(a)  (! SvIOK(a))
11536
11537     /* The stack starts empty.  It is a syntax error if the first thing parsed
11538      * is a binary operator; everything else is pushed on the stack.  When an
11539      * operand is parsed, the top of the stack is examined.  If it is a binary
11540      * operator, the item before it should be an operand, and both are replaced
11541      * by the result of doing that operation on the new operand and the one on
11542      * the stack.   Thus a sequence of binary operands is reduced to a single
11543      * one before the next one is parsed.
11544      *
11545      * A unary operator may immediately follow a binary in the input, for
11546      * example
11547      *      [a] + ! [b]
11548      * When an operand is parsed and the top of the stack is a unary operator,
11549      * the operation is performed, and then the stack is rechecked to see if
11550      * this new operand is part of a binary operation; if so, it is handled as
11551      * above.
11552      *
11553      * A '(' is simply pushed on the stack; it is valid only if the stack is
11554      * empty, or the top element of the stack is an operator or another '('
11555      * (for which the parenthesized expression will become an operand).  By the
11556      * time the corresponding ')' is parsed everything in between should have
11557      * been parsed and evaluated to a single operand (or else is a syntax
11558      * error), and is handled as a regular operand */
11559
11560     stack = newAV();
11561
11562     while (RExC_parse < RExC_end) {
11563         I32 top_index = av_tindex(stack);
11564         SV** top_ptr;
11565         SV* current = NULL;
11566
11567         /* Skip white space */
11568         RExC_parse = regpatws(pRExC_state, RExC_parse,
11569                                 TRUE); /* means recognize comments */
11570         if (RExC_parse >= RExC_end) {
11571             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11572         }
11573         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11574             break;
11575         }
11576
11577         switch (curchar) {
11578
11579             case '?':
11580                 if (av_tindex(stack) >= 0   /* This makes sure that we can
11581                                                safely subtract 1 from
11582                                                RExC_parse in the next clause.
11583                                                If we have something on the
11584                                                stack, we have parsed something
11585                                              */
11586                     && UCHARAT(RExC_parse - 1) == '('
11587                     && RExC_parse < RExC_end)
11588                 {
11589                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11590                      * This happens when we have some thing like
11591                      *
11592                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11593                      *   ...
11594                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11595                      *
11596                      * Here we would be handling the interpolated
11597                      * '$thai_or_lao'.  We handle this by a recursive call to
11598                      * ourselves which returns the inversion list the
11599                      * interpolated expression evaluates to.  We use the flags
11600                      * from the interpolated pattern. */
11601                     U32 save_flags = RExC_flags;
11602                     const char * const save_parse = ++RExC_parse;
11603
11604                     parse_lparen_question_flags(pRExC_state);
11605
11606                     if (RExC_parse == save_parse  /* Makes sure there was at
11607                                                      least one flag (or this
11608                                                      embedding wasn't compiled)
11609                                                    */
11610                         || RExC_parse >= RExC_end - 4
11611                         || UCHARAT(RExC_parse) != ':'
11612                         || UCHARAT(++RExC_parse) != '('
11613                         || UCHARAT(++RExC_parse) != '?'
11614                         || UCHARAT(++RExC_parse) != '[')
11615                     {
11616
11617                         /* In combination with the above, this moves the
11618                          * pointer to the point just after the first erroneous
11619                          * character (or if there are no flags, to where they
11620                          * should have been) */
11621                         if (RExC_parse >= RExC_end - 4) {
11622                             RExC_parse = RExC_end;
11623                         }
11624                         else if (RExC_parse != save_parse) {
11625                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11626                         }
11627                         vFAIL("Expecting '(?flags:(?[...'");
11628                     }
11629                     RExC_parse++;
11630                     (void) handle_regex_sets(pRExC_state, &current, flagp,
11631                                                     depth+1, oregcomp_parse);
11632
11633                     /* Here, 'current' contains the embedded expression's
11634                      * inversion list, and RExC_parse points to the trailing
11635                      * ']'; the next character should be the ')' which will be
11636                      * paired with the '(' that has been put on the stack, so
11637                      * the whole embedded expression reduces to '(operand)' */
11638                     RExC_parse++;
11639
11640                     RExC_flags = save_flags;
11641                     goto handle_operand;
11642                 }
11643                 /* FALL THROUGH */
11644
11645             default:
11646                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11647                 vFAIL("Unexpected character");
11648
11649             case '\\':
11650                 (void) regclass(pRExC_state, flagp,depth+1,
11651                                 TRUE, /* means parse just the next thing */
11652                                 FALSE, /* don't allow multi-char folds */
11653                                 FALSE, /* don't silence non-portable warnings.
11654                                         */
11655                                 &current);
11656                 /* regclass() will return with parsing just the \ sequence,
11657                  * leaving the parse pointer at the next thing to parse */
11658                 RExC_parse--;
11659                 goto handle_operand;
11660
11661             case '[':   /* Is a bracketed character class */
11662             {
11663                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11664
11665                 if (! is_posix_class) {
11666                     RExC_parse++;
11667                 }
11668
11669                 (void) regclass(pRExC_state, flagp,depth+1,
11670                                 is_posix_class, /* parse the whole char class
11671                                                    only if not a posix class */
11672                                 FALSE, /* don't allow multi-char folds */
11673                                 FALSE, /* don't silence non-portable warnings.
11674                                         */
11675                                 &current);
11676                 /* function call leaves parse pointing to the ']', except if we
11677                  * faked it */
11678                 if (is_posix_class) {
11679                     RExC_parse--;
11680                 }
11681
11682                 goto handle_operand;
11683             }
11684
11685             case '&':
11686             case '|':
11687             case '+':
11688             case '-':
11689             case '^':
11690                 if (top_index < 0
11691                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11692                     || ! IS_OPERAND(*top_ptr))
11693                 {
11694                     RExC_parse++;
11695                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11696                 }
11697                 av_push(stack, newSVuv(curchar));
11698                 break;
11699
11700             case '!':
11701                 av_push(stack, newSVuv(curchar));
11702                 break;
11703
11704             case '(':
11705                 if (top_index >= 0) {
11706                     top_ptr = av_fetch(stack, top_index, FALSE);
11707                     assert(top_ptr);
11708                     if (IS_OPERAND(*top_ptr)) {
11709                         RExC_parse++;
11710                         vFAIL("Unexpected '(' with no preceding operator");
11711                     }
11712                 }
11713                 av_push(stack, newSVuv(curchar));
11714                 break;
11715
11716             case ')':
11717             {
11718                 SV* lparen;
11719                 if (top_index < 1
11720                     || ! (current = av_pop(stack))
11721                     || ! IS_OPERAND(current)
11722                     || ! (lparen = av_pop(stack))
11723                     || IS_OPERAND(lparen)
11724                     || SvUV(lparen) != '(')
11725                 {
11726                     RExC_parse++;
11727                     vFAIL("Unexpected ')'");
11728                 }
11729                 top_index -= 2;
11730                 SvREFCNT_dec_NN(lparen);
11731
11732                 /* FALL THROUGH */
11733             }
11734
11735               handle_operand:
11736
11737                 /* Here, we have an operand to process, in 'current' */
11738
11739                 if (top_index < 0) {    /* Just push if stack is empty */
11740                     av_push(stack, current);
11741                 }
11742                 else {
11743                     SV* top = av_pop(stack);
11744                     char current_operator;
11745
11746                     if (IS_OPERAND(top)) {
11747                         vFAIL("Operand with no preceding operator");
11748                     }
11749                     current_operator = (char) SvUV(top);
11750                     switch (current_operator) {
11751                         case '(':   /* Push the '(' back on followed by the new
11752                                        operand */
11753                             av_push(stack, top);
11754                             av_push(stack, current);
11755                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11756                                                    just after the 'break', so
11757                                                    it doesn't get wrongly freed
11758                                                  */
11759                             break;
11760
11761                         case '!':
11762                             _invlist_invert(current);
11763
11764                             /* Unlike binary operators, the top of the stack,
11765                              * now that this unary one has been popped off, may
11766                              * legally be an operator, and we now have operand
11767                              * for it. */
11768                             top_index--;
11769                             SvREFCNT_dec_NN(top);
11770                             goto handle_operand;
11771
11772                         case '&':
11773                             _invlist_intersection(av_pop(stack),
11774                                                    current,
11775                                                    &current);
11776                             av_push(stack, current);
11777                             break;
11778
11779                         case '|':
11780                         case '+':
11781                             _invlist_union(av_pop(stack), current, &current);
11782                             av_push(stack, current);
11783                             break;
11784
11785                         case '-':
11786                             _invlist_subtract(av_pop(stack), current, &current);
11787                             av_push(stack, current);
11788                             break;
11789
11790                         case '^':   /* The union minus the intersection */
11791                         {
11792                             SV* i = NULL;
11793                             SV* u = NULL;
11794                             SV* element;
11795
11796                             element = av_pop(stack);
11797                             _invlist_union(element, current, &u);
11798                             _invlist_intersection(element, current, &i);
11799                             _invlist_subtract(u, i, &current);
11800                             av_push(stack, current);
11801                             SvREFCNT_dec_NN(i);
11802                             SvREFCNT_dec_NN(u);
11803                             SvREFCNT_dec_NN(element);
11804                             break;
11805                         }
11806
11807                         default:
11808                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11809                 }
11810                 SvREFCNT_dec_NN(top);
11811             }
11812         }
11813
11814         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11815     }
11816
11817     if (av_tindex(stack) < 0   /* Was empty */
11818         || ((final = av_pop(stack)) == NULL)
11819         || ! IS_OPERAND(final)
11820         || av_tindex(stack) >= 0)  /* More left on stack */
11821     {
11822         vFAIL("Incomplete expression within '(?[ ])'");
11823     }
11824
11825     /* Here, 'final' is the resultant inversion list from evaluating the
11826      * expression.  Return it if so requested */
11827     if (return_invlist) {
11828         *return_invlist = final;
11829         return END;
11830     }
11831
11832     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
11833      * expecting a string of ranges and individual code points */
11834     invlist_iterinit(final);
11835     result_string = newSVpvs("");
11836     while (invlist_iternext(final, &start, &end)) {
11837         if (start == end) {
11838             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11839         }
11840         else {
11841             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11842                                                      start,          end);
11843         }
11844     }
11845
11846     save_parse = RExC_parse;
11847     RExC_parse = SvPV(result_string, len);
11848     save_end = RExC_end;
11849     RExC_end = RExC_parse + len;
11850
11851     /* We turn off folding around the call, as the class we have constructed
11852      * already has all folding taken into consideration, and we don't want
11853      * regclass() to add to that */
11854     RExC_flags &= ~RXf_PMf_FOLD;
11855     node = regclass(pRExC_state, flagp,depth+1,
11856                     FALSE, /* means parse the whole char class */
11857                     FALSE, /* don't allow multi-char folds */
11858                     TRUE, /* silence non-portable warnings.  The above may very
11859                              well have generated non-portable code points, but
11860                              they're valid on this machine */
11861                     NULL);
11862     if (save_fold) {
11863         RExC_flags |= RXf_PMf_FOLD;
11864     }
11865     RExC_parse = save_parse + 1;
11866     RExC_end = save_end;
11867     SvREFCNT_dec_NN(final);
11868     SvREFCNT_dec_NN(result_string);
11869     SvREFCNT_dec_NN(stack);
11870
11871     nextchar(pRExC_state);
11872     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11873     return node;
11874 }
11875 #undef IS_OPERAND
11876
11877 /* The names of properties whose definitions are not known at compile time are
11878  * stored in this SV, after a constant heading.  So if the length has been
11879  * changed since initialization, then there is a run-time definition. */
11880 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11881
11882 STATIC regnode *
11883 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11884                  const bool stop_at_1,  /* Just parse the next thing, don't
11885                                            look for a full character class */
11886                  bool allow_multi_folds,
11887                  const bool silence_non_portable,   /* Don't output warnings
11888                                                        about too large
11889                                                        characters */
11890                  SV** ret_invlist)  /* Return an inversion list, not a node */
11891 {
11892     /* parse a bracketed class specification.  Most of these will produce an
11893      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11894      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
11895      * under /i with multi-character folds: it will be rewritten following the
11896      * paradigm of this example, where the <multi-fold>s are characters which
11897      * fold to multiple character sequences:
11898      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11899      * gets effectively rewritten as:
11900      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11901      * reg() gets called (recursively) on the rewritten version, and this
11902      * function will return what it constructs.  (Actually the <multi-fold>s
11903      * aren't physically removed from the [abcdefghi], it's just that they are
11904      * ignored in the recursion by means of a flag:
11905      * <RExC_in_multi_char_class>.)
11906      *
11907      * ANYOF nodes contain a bit map for the first 256 characters, with the
11908      * corresponding bit set if that character is in the list.  For characters
11909      * above 255, a range list or swash is used.  There are extra bits for \w,
11910      * etc. in locale ANYOFs, as what these match is not determinable at
11911      * compile time */
11912
11913     dVAR;
11914     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11915     IV range = 0;
11916     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11917     regnode *ret;
11918     STRLEN numlen;
11919     IV namedclass = OOB_NAMEDCLASS;
11920     char *rangebegin = NULL;
11921     bool need_class = 0;
11922     SV *listsv = NULL;
11923     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11924                                       than just initialized.  */
11925     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11926     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11927                                extended beyond the Latin1 range */
11928     UV element_count = 0;   /* Number of distinct elements in the class.
11929                                Optimizations may be possible if this is tiny */
11930     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11931                                        character; used under /i */
11932     UV n;
11933     char * stop_ptr = RExC_end;    /* where to stop parsing */
11934     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11935                                                    space? */
11936     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11937
11938     /* Unicode properties are stored in a swash; this holds the current one
11939      * being parsed.  If this swash is the only above-latin1 component of the
11940      * character class, an optimization is to pass it directly on to the
11941      * execution engine.  Otherwise, it is set to NULL to indicate that there
11942      * are other things in the class that have to be dealt with at execution
11943      * time */
11944     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11945
11946     /* Set if a component of this character class is user-defined; just passed
11947      * on to the engine */
11948     bool has_user_defined_property = FALSE;
11949
11950     /* inversion list of code points this node matches only when the target
11951      * string is in UTF-8.  (Because is under /d) */
11952     SV* depends_list = NULL;
11953
11954     /* inversion list of code points this node matches.  For much of the
11955      * function, it includes only those that match regardless of the utf8ness
11956      * of the target string */
11957     SV* cp_list = NULL;
11958
11959 #ifdef EBCDIC
11960     /* In a range, counts how many 0-2 of the ends of it came from literals,
11961      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11962     UV literal_endpoint = 0;
11963 #endif
11964     bool invert = FALSE;    /* Is this class to be complemented */
11965
11966     /* Is there any thing like \W or [:^digit:] that matches above the legal
11967      * Unicode range? */
11968     bool runtime_posix_matches_above_Unicode = FALSE;
11969
11970     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11971         case we need to change the emitted regop to an EXACT. */
11972     const char * orig_parse = RExC_parse;
11973     const I32 orig_size = RExC_size;
11974     GET_RE_DEBUG_FLAGS_DECL;
11975
11976     PERL_ARGS_ASSERT_REGCLASS;
11977 #ifndef DEBUGGING
11978     PERL_UNUSED_ARG(depth);
11979 #endif
11980
11981     DEBUG_PARSE("clas");
11982
11983     /* Assume we are going to generate an ANYOF node. */
11984     ret = reganode(pRExC_state, ANYOF, 0);
11985
11986     if (SIZE_ONLY) {
11987         RExC_size += ANYOF_SKIP;
11988         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11989     }
11990     else {
11991         ANYOF_FLAGS(ret) = 0;
11992
11993         RExC_emit += ANYOF_SKIP;
11994         if (LOC) {
11995             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11996         }
11997         listsv = newSVpvs("# comment\n");
11998         initial_listsv_len = SvCUR(listsv);
11999     }
12000
12001     if (skip_white) {
12002         RExC_parse = regpatws(pRExC_state, RExC_parse,
12003                               FALSE /* means don't recognize comments */);
12004     }
12005
12006     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12007         RExC_parse++;
12008         invert = TRUE;
12009         allow_multi_folds = FALSE;
12010         RExC_naughty++;
12011         if (skip_white) {
12012             RExC_parse = regpatws(pRExC_state, RExC_parse,
12013                                   FALSE /* means don't recognize comments */);
12014         }
12015     }
12016
12017     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12018     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12019         const char *s = RExC_parse;
12020         const char  c = *s++;
12021
12022         while (isWORDCHAR(*s))
12023             s++;
12024         if (*s && c == *s && s[1] == ']') {
12025             SAVEFREESV(RExC_rx_sv);
12026             SAVEFREESV(listsv);
12027             ckWARN3reg(s+2,
12028                        "POSIX syntax [%c %c] belongs inside character classes",
12029                        c, c);
12030             (void)ReREFCNT_inc(RExC_rx_sv);
12031             SvREFCNT_inc_simple_void_NN(listsv);
12032         }
12033     }
12034
12035     /* If the caller wants us to just parse a single element, accomplish this
12036      * by faking the loop ending condition */
12037     if (stop_at_1 && RExC_end > RExC_parse) {
12038         stop_ptr = RExC_parse + 1;
12039     }
12040
12041     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12042     if (UCHARAT(RExC_parse) == ']')
12043         goto charclassloop;
12044
12045 parseit:
12046     while (1) {
12047         if  (RExC_parse >= stop_ptr) {
12048             break;
12049         }
12050
12051         if (skip_white) {
12052             RExC_parse = regpatws(pRExC_state, RExC_parse,
12053                                   FALSE /* means don't recognize comments */);
12054         }
12055
12056         if  (UCHARAT(RExC_parse) == ']') {
12057             break;
12058         }
12059
12060     charclassloop:
12061
12062         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12063         save_value = value;
12064         save_prevvalue = prevvalue;
12065
12066         if (!range) {
12067             rangebegin = RExC_parse;
12068             element_count++;
12069         }
12070         if (UTF) {
12071             value = utf8n_to_uvchr((U8*)RExC_parse,
12072                                    RExC_end - RExC_parse,
12073                                    &numlen, UTF8_ALLOW_DEFAULT);
12074             RExC_parse += numlen;
12075         }
12076         else
12077             value = UCHARAT(RExC_parse++);
12078
12079         if (value == '['
12080             && RExC_parse < RExC_end
12081             && POSIXCC(UCHARAT(RExC_parse)))
12082         {
12083             namedclass = regpposixcc(pRExC_state, value, listsv, strict);
12084         }
12085         else if (value == '\\') {
12086             if (UTF) {
12087                 value = utf8n_to_uvchr((U8*)RExC_parse,
12088                                    RExC_end - RExC_parse,
12089                                    &numlen, UTF8_ALLOW_DEFAULT);
12090                 RExC_parse += numlen;
12091             }
12092             else
12093                 value = UCHARAT(RExC_parse++);
12094
12095             /* Some compilers cannot handle switching on 64-bit integer
12096              * values, therefore value cannot be an UV.  Yes, this will
12097              * be a problem later if we want switch on Unicode.
12098              * A similar issue a little bit later when switching on
12099              * namedclass. --jhi */
12100
12101             /* If the \ is escaping white space when white space is being
12102              * skipped, it means that that white space is wanted literally, and
12103              * is already in 'value'.  Otherwise, need to translate the escape
12104              * into what it signifies. */
12105             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12106
12107             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12108             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12109             case 's':   namedclass = ANYOF_SPACE;       break;
12110             case 'S':   namedclass = ANYOF_NSPACE;      break;
12111             case 'd':   namedclass = ANYOF_DIGIT;       break;
12112             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12113             case 'v':   namedclass = ANYOF_VERTWS;      break;
12114             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12115             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12116             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12117             case 'N':  /* Handle \N{NAME} in class */
12118                 {
12119                     /* We only pay attention to the first char of
12120                     multichar strings being returned. I kinda wonder
12121                     if this makes sense as it does change the behaviour
12122                     from earlier versions, OTOH that behaviour was broken
12123                     as well. */
12124                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12125                                       TRUE, /* => charclass */
12126                                       strict))
12127                     {
12128                         goto parseit;
12129                     }
12130                 }
12131                 break;
12132             case 'p':
12133             case 'P':
12134                 {
12135                 char *e;
12136
12137                 /* We will handle any undefined properties ourselves */
12138                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12139
12140                 if (RExC_parse >= RExC_end)
12141                     vFAIL2("Empty \\%c{}", (U8)value);
12142                 if (*RExC_parse == '{') {
12143                     const U8 c = (U8)value;
12144                     e = strchr(RExC_parse++, '}');
12145                     if (!e)
12146                         vFAIL2("Missing right brace on \\%c{}", c);
12147                     while (isSPACE(UCHARAT(RExC_parse)))
12148                         RExC_parse++;
12149                     if (e == RExC_parse)
12150                         vFAIL2("Empty \\%c{}", c);
12151                     n = e - RExC_parse;
12152                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12153                         n--;
12154                 }
12155                 else {
12156                     e = RExC_parse;
12157                     n = 1;
12158                 }
12159                 if (!SIZE_ONLY) {
12160                     SV* invlist;
12161                     char* name;
12162
12163                     if (UCHARAT(RExC_parse) == '^') {
12164                          RExC_parse++;
12165                          n--;
12166                          /* toggle.  (The rhs xor gets the single bit that
12167                           * differs between P and p; the other xor inverts just
12168                           * that bit) */
12169                          value ^= 'P' ^ 'p';
12170
12171                          while (isSPACE(UCHARAT(RExC_parse))) {
12172                               RExC_parse++;
12173                               n--;
12174                          }
12175                     }
12176                     /* Try to get the definition of the property into
12177                      * <invlist>.  If /i is in effect, the effective property
12178                      * will have its name be <__NAME_i>.  The design is
12179                      * discussed in commit
12180                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12181                     Newx(name, n + sizeof("_i__\n"), char);
12182
12183                     sprintf(name, "%s%.*s%s\n",
12184                                     (FOLD) ? "__" : "",
12185                                     (int)n,
12186                                     RExC_parse,
12187                                     (FOLD) ? "_i" : ""
12188                     );
12189
12190                     /* Look up the property name, and get its swash and
12191                      * inversion list, if the property is found  */
12192                     if (swash) {
12193                         SvREFCNT_dec_NN(swash);
12194                     }
12195                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12196                                              1, /* binary */
12197                                              0, /* not tr/// */
12198                                              NULL, /* No inversion list */
12199                                              &swash_init_flags
12200                                             );
12201                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12202                         if (swash) {
12203                             SvREFCNT_dec_NN(swash);
12204                             swash = NULL;
12205                         }
12206
12207                         /* Here didn't find it.  It could be a user-defined
12208                          * property that will be available at run-time.  If we
12209                          * accept only compile-time properties, is an error;
12210                          * otherwise add it to the list for run-time look up */
12211                         if (ret_invlist) {
12212                             RExC_parse = e + 1;
12213                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12214                         }
12215                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12216                                         (value == 'p' ? '+' : '!'),
12217                                         name);
12218                         has_user_defined_property = TRUE;
12219
12220                         /* We don't know yet, so have to assume that the
12221                          * property could match something in the Latin1 range,
12222                          * hence something that isn't utf8.  Note that this
12223                          * would cause things in <depends_list> to match
12224                          * inappropriately, except that any \p{}, including
12225                          * this one forces Unicode semantics, which means there
12226                          * is <no depends_list> */
12227                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12228                     }
12229                     else {
12230
12231                         /* Here, did get the swash and its inversion list.  If
12232                          * the swash is from a user-defined property, then this
12233                          * whole character class should be regarded as such */
12234                         has_user_defined_property =
12235                                     (swash_init_flags
12236                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12237
12238                         /* Invert if asking for the complement */
12239                         if (value == 'P') {
12240                             _invlist_union_complement_2nd(properties,
12241                                                           invlist,
12242                                                           &properties);
12243
12244                             /* The swash can't be used as-is, because we've
12245                              * inverted things; delay removing it to here after
12246                              * have copied its invlist above */
12247                             SvREFCNT_dec_NN(swash);
12248                             swash = NULL;
12249                         }
12250                         else {
12251                             _invlist_union(properties, invlist, &properties);
12252                         }
12253                     }
12254                     Safefree(name);
12255                 }
12256                 RExC_parse = e + 1;
12257                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12258                                                 named */
12259
12260                 /* \p means they want Unicode semantics */
12261                 RExC_uni_semantics = 1;
12262                 }
12263                 break;
12264             case 'n':   value = '\n';                   break;
12265             case 'r':   value = '\r';                   break;
12266             case 't':   value = '\t';                   break;
12267             case 'f':   value = '\f';                   break;
12268             case 'b':   value = '\b';                   break;
12269             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12270             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12271             case 'o':
12272                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12273                 {
12274                     const char* error_msg;
12275                     bool valid = grok_bslash_o(&RExC_parse,
12276                                                &value,
12277                                                &error_msg,
12278                                                SIZE_ONLY,   /* warnings in pass
12279                                                                1 only */
12280                                                strict,
12281                                                silence_non_portable,
12282                                                UTF);
12283                     if (! valid) {
12284                         vFAIL(error_msg);
12285                     }
12286                 }
12287                 if (PL_encoding && value < 0x100) {
12288                     goto recode_encoding;
12289                 }
12290                 break;
12291             case 'x':
12292                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12293                 {
12294                     const char* error_msg;
12295                     bool valid = grok_bslash_x(&RExC_parse,
12296                                                &value,
12297                                                &error_msg,
12298                                                TRUE, /* Output warnings */
12299                                                strict,
12300                                                silence_non_portable,
12301                                                UTF);
12302                     if (! valid) {
12303                         vFAIL(error_msg);
12304                     }
12305                 }
12306                 if (PL_encoding && value < 0x100)
12307                     goto recode_encoding;
12308                 break;
12309             case 'c':
12310                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12311                 break;
12312             case '0': case '1': case '2': case '3': case '4':
12313             case '5': case '6': case '7':
12314                 {
12315                     /* Take 1-3 octal digits */
12316                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12317                     numlen = (strict) ? 4 : 3;
12318                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12319                     RExC_parse += numlen;
12320                     if (numlen != 3) {
12321                         SAVEFREESV(listsv); /* In case warnings are fatalized */
12322                         if (strict) {
12323                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12324                             vFAIL("Need exactly 3 octal digits");
12325                         }
12326                         else if (! SIZE_ONLY /* like \08, \178 */
12327                                  && numlen < 3
12328                                  && RExC_parse < RExC_end
12329                                  && isDIGIT(*RExC_parse)
12330                                  && ckWARN(WARN_REGEXP))
12331                         {
12332                             SAVEFREESV(RExC_rx_sv);
12333                             reg_warn_non_literal_string(
12334                                  RExC_parse + 1,
12335                                  form_short_octal_warning(RExC_parse, numlen));
12336                             (void)ReREFCNT_inc(RExC_rx_sv);
12337                         }
12338                         SvREFCNT_inc_simple_void_NN(listsv);
12339                     }
12340                     if (PL_encoding && value < 0x100)
12341                         goto recode_encoding;
12342                     break;
12343                 }
12344             recode_encoding:
12345                 if (! RExC_override_recoding) {
12346                     SV* enc = PL_encoding;
12347                     value = reg_recode((const char)(U8)value, &enc);
12348                     if (!enc) {
12349                         if (strict) {
12350                             vFAIL("Invalid escape in the specified encoding");
12351                         }
12352                         else if (SIZE_ONLY) {
12353                             ckWARNreg(RExC_parse,
12354                                   "Invalid escape in the specified encoding");
12355                         }
12356                     }
12357                     break;
12358                 }
12359             default:
12360                 /* Allow \_ to not give an error */
12361                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12362                     SAVEFREESV(listsv);
12363                     if (strict) {
12364                         vFAIL2("Unrecognized escape \\%c in character class",
12365                                (int)value);
12366                     }
12367                     else {
12368                         SAVEFREESV(RExC_rx_sv);
12369                         ckWARN2reg(RExC_parse,
12370                             "Unrecognized escape \\%c in character class passed through",
12371                             (int)value);
12372                         (void)ReREFCNT_inc(RExC_rx_sv);
12373                     }
12374                     SvREFCNT_inc_simple_void_NN(listsv);
12375                 }
12376                 break;
12377             }   /* End of switch on char following backslash */
12378         } /* end of handling backslash escape sequences */
12379 #ifdef EBCDIC
12380         else
12381             literal_endpoint++;
12382 #endif
12383
12384         /* Here, we have the current token in 'value' */
12385
12386         /* What matches in a locale is not known until runtime.  This includes
12387          * what the Posix classes (like \w, [:space:]) match.  Room must be
12388          * reserved (one time per class) to store such classes, either if Perl
12389          * is compiled so that locale nodes always should have this space, or
12390          * if there is such class info to be stored.  The space will contain a
12391          * bit for each named class that is to be matched against.  This isn't
12392          * needed for \p{} and pseudo-classes, as they are not affected by
12393          * locale, and hence are dealt with separately */
12394         if (LOC
12395             && ! need_class
12396             && (ANYOF_LOCALE == ANYOF_CLASS
12397                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12398         {
12399             need_class = 1;
12400             if (SIZE_ONLY) {
12401                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12402             }
12403             else {
12404                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12405                 ANYOF_CLASS_ZERO(ret);
12406             }
12407             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12408         }
12409
12410         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12411
12412             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12413              * literal, as is the character that began the false range, i.e.
12414              * the 'a' in the examples */
12415             if (range) {
12416                 if (!SIZE_ONLY) {
12417                     const int w = (RExC_parse >= rangebegin)
12418                                   ? RExC_parse - rangebegin
12419                                   : 0;
12420                     SAVEFREESV(listsv); /* in case of fatal warnings */
12421                     if (strict) {
12422                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12423                     }
12424                     else {
12425                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12426                         ckWARN4reg(RExC_parse,
12427                                 "False [] range \"%*.*s\"",
12428                                 w, w, rangebegin);
12429                         (void)ReREFCNT_inc(RExC_rx_sv);
12430                         cp_list = add_cp_to_invlist(cp_list, '-');
12431                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12432                     }
12433                     SvREFCNT_inc_simple_void_NN(listsv);
12434                 }
12435
12436                 range = 0; /* this was not a true range */
12437                 element_count += 2; /* So counts for three values */
12438             }
12439
12440             if (! SIZE_ONLY) {
12441                 U8 classnum = namedclass_to_classnum(namedclass);
12442                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12443                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12444
12445                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12446                          * /l make a difference in what these match.  There
12447                          * would be problems if these characters had folds
12448                          * other than themselves, as cp_list is subject to
12449                          * folding. */
12450                         if (classnum != _CC_VERTSPACE) {
12451                             assert(   namedclass == ANYOF_HORIZWS
12452                                    || namedclass == ANYOF_NHORIZWS);
12453
12454                             /* It turns out that \h is just a synonym for
12455                              * XPosixBlank */
12456                             classnum = _CC_BLANK;
12457                         }
12458
12459                         _invlist_union_maybe_complement_2nd(
12460                                 cp_list,
12461                                 PL_XPosix_ptrs[classnum],
12462                                 cBOOL(namedclass % 2), /* Complement if odd
12463                                                           (NHORIZWS, NVERTWS)
12464                                                         */
12465                                 &cp_list);
12466                     }
12467                 }
12468                 else if (classnum == _CC_ASCII) {
12469 #ifdef HAS_ISASCII
12470                     if (LOC) {
12471                         ANYOF_CLASS_SET(ret, namedclass);
12472                     }
12473                     else
12474 #endif  /* Not isascii(); just use the hard-coded definition for it */
12475                         _invlist_union_maybe_complement_2nd(
12476                                 posixes,
12477                                 PL_ASCII,
12478                                 cBOOL(namedclass % 2), /* Complement if odd
12479                                                           (NASCII) */
12480                                 &posixes);
12481                 }
12482                 else {  /* Garden variety class */
12483
12484                     /* The ascii range inversion list */
12485                     SV* ascii_source = PL_Posix_ptrs[classnum];
12486
12487                     /* The full Latin1 range inversion list */
12488                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12489
12490                     /* This code is structured into two major clauses.  The
12491                      * first is for classes whose complete definitions may not
12492                      * already be known.  It not, the Latin1 definition
12493                      * (guaranteed to already known) is used plus code is
12494                      * generated to load the rest at run-time (only if needed).
12495                      * If the complete definition is known, it drops down to
12496                      * the second clause, where the complete definition is
12497                      * known */
12498
12499                     if (classnum < _FIRST_NON_SWASH_CC) {
12500
12501                         /* Here, the class has a swash, which may or not
12502                          * already be loaded */
12503
12504                         /* The name of the property to use to match the full
12505                          * eXtended Unicode range swash for this character
12506                          * class */
12507                         const char *Xname = swash_property_names[classnum];
12508
12509                         /* If returning the inversion list, we can't defer
12510                          * getting this until runtime */
12511                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12512                             PL_utf8_swash_ptrs[classnum] =
12513                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12514                                              1, /* binary */
12515                                              0, /* not tr/// */
12516                                              NULL, /* No inversion list */
12517                                              NULL  /* No flags */
12518                                             );
12519                             assert(PL_utf8_swash_ptrs[classnum]);
12520                         }
12521                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12522                             if (namedclass % 2 == 0) { /* A non-complemented
12523                                                           class */
12524                                 /* If not /a matching, there are code points we
12525                                  * don't know at compile time.  Arrange for the
12526                                  * unknown matches to be loaded at run-time, if
12527                                  * needed */
12528                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12529                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12530                                                                  Xname);
12531                                 }
12532                                 if (LOC) {  /* Under locale, set run-time
12533                                                lookup */
12534                                     ANYOF_CLASS_SET(ret, namedclass);
12535                                 }
12536                                 else {
12537                                     /* Add the current class's code points to
12538                                      * the running total */
12539                                     _invlist_union(posixes,
12540                                                    (AT_LEAST_ASCII_RESTRICTED)
12541                                                         ? ascii_source
12542                                                         : l1_source,
12543                                                    &posixes);
12544                                 }
12545                             }
12546                             else {  /* A complemented class */
12547                                 if (AT_LEAST_ASCII_RESTRICTED) {
12548                                     /* Under /a should match everything above
12549                                      * ASCII, plus the complement of the set's
12550                                      * ASCII matches */
12551                                     _invlist_union_complement_2nd(posixes,
12552                                                                   ascii_source,
12553                                                                   &posixes);
12554                                 }
12555                                 else {
12556                                     /* Arrange for the unknown matches to be
12557                                      * loaded at run-time, if needed */
12558                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12559                                                                  Xname);
12560                                     runtime_posix_matches_above_Unicode = TRUE;
12561                                     if (LOC) {
12562                                         ANYOF_CLASS_SET(ret, namedclass);
12563                                     }
12564                                     else {
12565
12566                                         /* We want to match everything in
12567                                          * Latin1, except those things that
12568                                          * l1_source matches */
12569                                         SV* scratch_list = NULL;
12570                                         _invlist_subtract(PL_Latin1, l1_source,
12571                                                           &scratch_list);
12572
12573                                         /* Add the list from this class to the
12574                                          * running total */
12575                                         if (! posixes) {
12576                                             posixes = scratch_list;
12577                                         }
12578                                         else {
12579                                             _invlist_union(posixes,
12580                                                            scratch_list,
12581                                                            &posixes);
12582                                             SvREFCNT_dec_NN(scratch_list);
12583                                         }
12584                                         if (DEPENDS_SEMANTICS) {
12585                                             ANYOF_FLAGS(ret)
12586                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12587                                         }
12588                                     }
12589                                 }
12590                             }
12591                             goto namedclass_done;
12592                         }
12593
12594                         /* Here, there is a swash loaded for the class.  If no
12595                          * inversion list for it yet, get it */
12596                         if (! PL_XPosix_ptrs[classnum]) {
12597                             PL_XPosix_ptrs[classnum]
12598                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12599                         }
12600                     }
12601
12602                     /* Here there is an inversion list already loaded for the
12603                      * entire class */
12604
12605                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12606                                                    like ANYOF_PUNCT */
12607                         if (! LOC) {
12608                             /* For non-locale, just add it to any existing list
12609                              * */
12610                             _invlist_union(posixes,
12611                                            (AT_LEAST_ASCII_RESTRICTED)
12612                                                ? ascii_source
12613                                                : PL_XPosix_ptrs[classnum],
12614                                            &posixes);
12615                         }
12616                         else {  /* Locale */
12617                             SV* scratch_list = NULL;
12618
12619                             /* For above Latin1 code points, we use the full
12620                              * Unicode range */
12621                             _invlist_intersection(PL_AboveLatin1,
12622                                                   PL_XPosix_ptrs[classnum],
12623                                                   &scratch_list);
12624                             /* And set the output to it, adding instead if
12625                              * there already is an output.  Checking if
12626                              * 'posixes' is NULL first saves an extra clone.
12627                              * Its reference count will be decremented at the
12628                              * next union, etc, or if this is the only
12629                              * instance, at the end of the routine */
12630                             if (! posixes) {
12631                                 posixes = scratch_list;
12632                             }
12633                             else {
12634                                 _invlist_union(posixes, scratch_list, &posixes);
12635                                 SvREFCNT_dec_NN(scratch_list);
12636                             }
12637
12638 #ifndef HAS_ISBLANK
12639                             if (namedclass != ANYOF_BLANK) {
12640 #endif
12641                                 /* Set this class in the node for runtime
12642                                  * matching */
12643                                 ANYOF_CLASS_SET(ret, namedclass);
12644 #ifndef HAS_ISBLANK
12645                             }
12646                             else {
12647                                 /* No isblank(), use the hard-coded ASCII-range
12648                                  * blanks, adding them to the running total. */
12649
12650                                 _invlist_union(posixes, ascii_source, &posixes);
12651                             }
12652 #endif
12653                         }
12654                     }
12655                     else {  /* A complemented class, like ANYOF_NPUNCT */
12656                         if (! LOC) {
12657                             _invlist_union_complement_2nd(
12658                                                 posixes,
12659                                                 (AT_LEAST_ASCII_RESTRICTED)
12660                                                     ? ascii_source
12661                                                     : PL_XPosix_ptrs[classnum],
12662                                                 &posixes);
12663                             /* Under /d, everything in the upper half of the
12664                              * Latin1 range matches this complement */
12665                             if (DEPENDS_SEMANTICS) {
12666                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12667                             }
12668                         }
12669                         else {  /* Locale */
12670                             SV* scratch_list = NULL;
12671                             _invlist_subtract(PL_AboveLatin1,
12672                                               PL_XPosix_ptrs[classnum],
12673                                               &scratch_list);
12674                             if (! posixes) {
12675                                 posixes = scratch_list;
12676                             }
12677                             else {
12678                                 _invlist_union(posixes, scratch_list, &posixes);
12679                                 SvREFCNT_dec_NN(scratch_list);
12680                             }
12681 #ifndef HAS_ISBLANK
12682                             if (namedclass != ANYOF_NBLANK) {
12683 #endif
12684                                 ANYOF_CLASS_SET(ret, namedclass);
12685 #ifndef HAS_ISBLANK
12686                             }
12687                             else {
12688                                 /* Get the list of all code points in Latin1
12689                                  * that are not ASCII blanks, and add them to
12690                                  * the running total */
12691                                 _invlist_subtract(PL_Latin1, ascii_source,
12692                                                   &scratch_list);
12693                                 _invlist_union(posixes, scratch_list, &posixes);
12694                                 SvREFCNT_dec_NN(scratch_list);
12695                             }
12696 #endif
12697                         }
12698                     }
12699                 }
12700               namedclass_done:
12701                 continue;   /* Go get next character */
12702             }
12703         } /* end of namedclass \blah */
12704
12705         /* Here, we have a single value.  If 'range' is set, it is the ending
12706          * of a range--check its validity.  Later, we will handle each
12707          * individual code point in the range.  If 'range' isn't set, this
12708          * could be the beginning of a range, so check for that by looking
12709          * ahead to see if the next real character to be processed is the range
12710          * indicator--the minus sign */
12711
12712         if (skip_white) {
12713             RExC_parse = regpatws(pRExC_state, RExC_parse,
12714                                 FALSE /* means don't recognize comments */);
12715         }
12716
12717         if (range) {
12718             if (prevvalue > value) /* b-a */ {
12719                 const int w = RExC_parse - rangebegin;
12720                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12721                 range = 0; /* not a valid range */
12722             }
12723         }
12724         else {
12725             prevvalue = value; /* save the beginning of the potential range */
12726             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12727                 && *RExC_parse == '-')
12728             {
12729                 char* next_char_ptr = RExC_parse + 1;
12730                 if (skip_white) {   /* Get the next real char after the '-' */
12731                     next_char_ptr = regpatws(pRExC_state,
12732                                              RExC_parse + 1,
12733                                              FALSE); /* means don't recognize
12734                                                         comments */
12735                 }
12736
12737                 /* If the '-' is at the end of the class (just before the ']',
12738                  * it is a literal minus; otherwise it is a range */
12739                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12740                     RExC_parse = next_char_ptr;
12741
12742                     /* a bad range like \w-, [:word:]- ? */
12743                     if (namedclass > OOB_NAMEDCLASS) {
12744                         if (strict || ckWARN(WARN_REGEXP)) {
12745                             const int w =
12746                                 RExC_parse >= rangebegin ?
12747                                 RExC_parse - rangebegin : 0;
12748                             if (strict) {
12749                                 vFAIL4("False [] range \"%*.*s\"",
12750                                     w, w, rangebegin);
12751                             }
12752                             else {
12753                                 vWARN4(RExC_parse,
12754                                     "False [] range \"%*.*s\"",
12755                                     w, w, rangebegin);
12756                             }
12757                         }
12758                         if (!SIZE_ONLY) {
12759                             cp_list = add_cp_to_invlist(cp_list, '-');
12760                         }
12761                         element_count++;
12762                     } else
12763                         range = 1;      /* yeah, it's a range! */
12764                     continue;   /* but do it the next time */
12765                 }
12766             }
12767         }
12768
12769         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12770          * if not */
12771
12772         /* non-Latin1 code point implies unicode semantics.  Must be set in
12773          * pass1 so is there for the whole of pass 2 */
12774         if (value > 255) {
12775             RExC_uni_semantics = 1;
12776         }
12777
12778         /* Ready to process either the single value, or the completed range.
12779          * For single-valued non-inverted ranges, we consider the possibility
12780          * of multi-char folds.  (We made a conscious decision to not do this
12781          * for the other cases because it can often lead to non-intuitive
12782          * results.  For example, you have the peculiar case that:
12783          *  "s s" =~ /^[^\xDF]+$/i => Y
12784          *  "ss"  =~ /^[^\xDF]+$/i => N
12785          *
12786          * See [perl #89750] */
12787         if (FOLD && allow_multi_folds && value == prevvalue) {
12788             if (value == LATIN_SMALL_LETTER_SHARP_S
12789                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12790                                                         value)))
12791             {
12792                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12793
12794                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12795                 STRLEN foldlen;
12796
12797                 UV folded = _to_uni_fold_flags(
12798                                 value,
12799                                 foldbuf,
12800                                 &foldlen,
12801                                 FOLD_FLAGS_FULL
12802                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12803                                             : (ASCII_FOLD_RESTRICTED)
12804                                               ? FOLD_FLAGS_NOMIX_ASCII
12805                                               : 0)
12806                                 );
12807
12808                 /* Here, <folded> should be the first character of the
12809                  * multi-char fold of <value>, with <foldbuf> containing the
12810                  * whole thing.  But, if this fold is not allowed (because of
12811                  * the flags), <fold> will be the same as <value>, and should
12812                  * be processed like any other character, so skip the special
12813                  * handling */
12814                 if (folded != value) {
12815
12816                     /* Skip if we are recursed, currently parsing the class
12817                      * again.  Otherwise add this character to the list of
12818                      * multi-char folds. */
12819                     if (! RExC_in_multi_char_class) {
12820                         AV** this_array_ptr;
12821                         AV* this_array;
12822                         STRLEN cp_count = utf8_length(foldbuf,
12823                                                       foldbuf + foldlen);
12824                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12825
12826                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12827
12828
12829                         if (! multi_char_matches) {
12830                             multi_char_matches = newAV();
12831                         }
12832
12833                         /* <multi_char_matches> is actually an array of arrays.
12834                          * There will be one or two top-level elements: [2],
12835                          * and/or [3].  The [2] element is an array, each
12836                          * element thereof is a character which folds to two
12837                          * characters; likewise for [3].  (Unicode guarantees a
12838                          * maximum of 3 characters in any fold.)  When we
12839                          * rewrite the character class below, we will do so
12840                          * such that the longest folds are written first, so
12841                          * that it prefers the longest matching strings first.
12842                          * This is done even if it turns out that any
12843                          * quantifier is non-greedy, out of programmer
12844                          * laziness.  Tom Christiansen has agreed that this is
12845                          * ok.  This makes the test for the ligature 'ffi' come
12846                          * before the test for 'ff' */
12847                         if (av_exists(multi_char_matches, cp_count)) {
12848                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12849                                                              cp_count, FALSE);
12850                             this_array = *this_array_ptr;
12851                         }
12852                         else {
12853                             this_array = newAV();
12854                             av_store(multi_char_matches, cp_count,
12855                                      (SV*) this_array);
12856                         }
12857                         av_push(this_array, multi_fold);
12858                     }
12859
12860                     /* This element should not be processed further in this
12861                      * class */
12862                     element_count--;
12863                     value = save_value;
12864                     prevvalue = save_prevvalue;
12865                     continue;
12866                 }
12867             }
12868         }
12869
12870         /* Deal with this element of the class */
12871         if (! SIZE_ONLY) {
12872 #ifndef EBCDIC
12873             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12874 #else
12875             UV* this_range = _new_invlist(1);
12876             _append_range_to_invlist(this_range, prevvalue, value);
12877
12878             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12879              * If this range was specified using something like 'i-j', we want
12880              * to include only the 'i' and the 'j', and not anything in
12881              * between, so exclude non-ASCII, non-alphabetics from it.
12882              * However, if the range was specified with something like
12883              * [\x89-\x91] or [\x89-j], all code points within it should be
12884              * included.  literal_endpoint==2 means both ends of the range used
12885              * a literal character, not \x{foo} */
12886             if (literal_endpoint == 2
12887                 && (prevvalue >= 'a' && value <= 'z')
12888                     || (prevvalue >= 'A' && value <= 'Z'))
12889             {
12890                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12891                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12892             }
12893             _invlist_union(cp_list, this_range, &cp_list);
12894             literal_endpoint = 0;
12895 #endif
12896         }
12897
12898         range = 0; /* this range (if it was one) is done now */
12899     } /* End of loop through all the text within the brackets */
12900
12901     /* If anything in the class expands to more than one character, we have to
12902      * deal with them by building up a substitute parse string, and recursively
12903      * calling reg() on it, instead of proceeding */
12904     if (multi_char_matches) {
12905         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12906         I32 cp_count;
12907         STRLEN len;
12908         char *save_end = RExC_end;
12909         char *save_parse = RExC_parse;
12910         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12911                                        a "|" */
12912         I32 reg_flags;
12913
12914         assert(! invert);
12915 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12916            because too confusing */
12917         if (invert) {
12918             sv_catpv(substitute_parse, "(?:");
12919         }
12920 #endif
12921
12922         /* Look at the longest folds first */
12923         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12924
12925             if (av_exists(multi_char_matches, cp_count)) {
12926                 AV** this_array_ptr;
12927                 SV* this_sequence;
12928
12929                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12930                                                  cp_count, FALSE);
12931                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12932                                                                 &PL_sv_undef)
12933                 {
12934                     if (! first_time) {
12935                         sv_catpv(substitute_parse, "|");
12936                     }
12937                     first_time = FALSE;
12938
12939                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12940                 }
12941             }
12942         }
12943
12944         /* If the character class contains anything else besides these
12945          * multi-character folds, have to include it in recursive parsing */
12946         if (element_count) {
12947             sv_catpv(substitute_parse, "|[");
12948             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12949             sv_catpv(substitute_parse, "]");
12950         }
12951
12952         sv_catpv(substitute_parse, ")");
12953 #if 0
12954         if (invert) {
12955             /* This is a way to get the parse to skip forward a whole named
12956              * sequence instead of matching the 2nd character when it fails the
12957              * first */
12958             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12959         }
12960 #endif
12961
12962         RExC_parse = SvPV(substitute_parse, len);
12963         RExC_end = RExC_parse + len;
12964         RExC_in_multi_char_class = 1;
12965         RExC_emit = (regnode *)orig_emit;
12966
12967         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12968
12969         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12970
12971         RExC_parse = save_parse;
12972         RExC_end = save_end;
12973         RExC_in_multi_char_class = 0;
12974         SvREFCNT_dec_NN(multi_char_matches);
12975         SvREFCNT_dec_NN(listsv);
12976         return ret;
12977     }
12978
12979     /* If the character class contains only a single element, it may be
12980      * optimizable into another node type which is smaller and runs faster.
12981      * Check if this is the case for this class */
12982     if (element_count == 1 && ! ret_invlist) {
12983         U8 op = END;
12984         U8 arg = 0;
12985
12986         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12987                                               [:digit:] or \p{foo} */
12988
12989             /* All named classes are mapped into POSIXish nodes, with its FLAG
12990              * argument giving which class it is */
12991             switch ((I32)namedclass) {
12992                 case ANYOF_UNIPROP:
12993                     break;
12994
12995                 /* These don't depend on the charset modifiers.  They always
12996                  * match under /u rules */
12997                 case ANYOF_NHORIZWS:
12998                 case ANYOF_HORIZWS:
12999                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13000                     /* FALLTHROUGH */
13001
13002                 case ANYOF_NVERTWS:
13003                 case ANYOF_VERTWS:
13004                     op = POSIXU;
13005                     goto join_posix;
13006
13007                 /* The actual POSIXish node for all the rest depends on the
13008                  * charset modifier.  The ones in the first set depend only on
13009                  * ASCII or, if available on this platform, locale */
13010                 case ANYOF_ASCII:
13011                 case ANYOF_NASCII:
13012 #ifdef HAS_ISASCII
13013                     op = (LOC) ? POSIXL : POSIXA;
13014 #else
13015                     op = POSIXA;
13016 #endif
13017                     goto join_posix;
13018
13019                 case ANYOF_NCASED:
13020                 case ANYOF_LOWER:
13021                 case ANYOF_NLOWER:
13022                 case ANYOF_UPPER:
13023                 case ANYOF_NUPPER:
13024                     /* under /a could be alpha */
13025                     if (FOLD) {
13026                         if (ASCII_RESTRICTED) {
13027                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13028                         }
13029                         else if (! LOC) {
13030                             break;
13031                         }
13032                     }
13033                     /* FALLTHROUGH */
13034
13035                 /* The rest have more possibilities depending on the charset.
13036                  * We take advantage of the enum ordering of the charset
13037                  * modifiers to get the exact node type, */
13038                 default:
13039                     op = POSIXD + get_regex_charset(RExC_flags);
13040                     if (op > POSIXA) { /* /aa is same as /a */
13041                         op = POSIXA;
13042                     }
13043 #ifndef HAS_ISBLANK
13044                     if (op == POSIXL
13045                         && (namedclass == ANYOF_BLANK
13046                             || namedclass == ANYOF_NBLANK))
13047                     {
13048                         op = POSIXA;
13049                     }
13050 #endif
13051
13052                 join_posix:
13053                     /* The odd numbered ones are the complements of the
13054                      * next-lower even number one */
13055                     if (namedclass % 2 == 1) {
13056                         invert = ! invert;
13057                         namedclass--;
13058                     }
13059                     arg = namedclass_to_classnum(namedclass);
13060                     break;
13061             }
13062         }
13063         else if (value == prevvalue) {
13064
13065             /* Here, the class consists of just a single code point */
13066
13067             if (invert) {
13068                 if (! LOC && value == '\n') {
13069                     op = REG_ANY; /* Optimize [^\n] */
13070                     *flagp |= HASWIDTH|SIMPLE;
13071                     RExC_naughty++;
13072                 }
13073             }
13074             else if (value < 256 || UTF) {
13075
13076                 /* Optimize a single value into an EXACTish node, but not if it
13077                  * would require converting the pattern to UTF-8. */
13078                 op = compute_EXACTish(pRExC_state);
13079             }
13080         } /* Otherwise is a range */
13081         else if (! LOC) {   /* locale could vary these */
13082             if (prevvalue == '0') {
13083                 if (value == '9') {
13084                     arg = _CC_DIGIT;
13085                     op = POSIXA;
13086                 }
13087             }
13088         }
13089
13090         /* Here, we have changed <op> away from its initial value iff we found
13091          * an optimization */
13092         if (op != END) {
13093
13094             /* Throw away this ANYOF regnode, and emit the calculated one,
13095              * which should correspond to the beginning, not current, state of
13096              * the parse */
13097             const char * cur_parse = RExC_parse;
13098             RExC_parse = (char *)orig_parse;
13099             if ( SIZE_ONLY) {
13100                 if (! LOC) {
13101
13102                     /* To get locale nodes to not use the full ANYOF size would
13103                      * require moving the code above that writes the portions
13104                      * of it that aren't in other nodes to after this point.
13105                      * e.g.  ANYOF_CLASS_SET */
13106                     RExC_size = orig_size;
13107                 }
13108             }
13109             else {
13110                 RExC_emit = (regnode *)orig_emit;
13111                 if (PL_regkind[op] == POSIXD) {
13112                     if (invert) {
13113                         op += NPOSIXD - POSIXD;
13114                     }
13115                 }
13116             }
13117
13118             ret = reg_node(pRExC_state, op);
13119
13120             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13121                 if (! SIZE_ONLY) {
13122                     FLAGS(ret) = arg;
13123                 }
13124                 *flagp |= HASWIDTH|SIMPLE;
13125             }
13126             else if (PL_regkind[op] == EXACT) {
13127                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13128             }
13129
13130             RExC_parse = (char *) cur_parse;
13131
13132             SvREFCNT_dec(posixes);
13133             SvREFCNT_dec_NN(listsv);
13134             SvREFCNT_dec(cp_list);
13135             return ret;
13136         }
13137     }
13138
13139     if (SIZE_ONLY)
13140         return ret;
13141     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13142
13143     /* If folding, we calculate all characters that could fold to or from the
13144      * ones already on the list */
13145     if (FOLD && cp_list) {
13146         UV start, end;  /* End points of code point ranges */
13147
13148         SV* fold_intersection = NULL;
13149
13150         /* If the highest code point is within Latin1, we can use the
13151          * compiled-in Alphas list, and not have to go out to disk.  This
13152          * yields two false positives, the masculine and feminine ordinal
13153          * indicators, which are weeded out below using the
13154          * IS_IN_SOME_FOLD_L1() macro */
13155         if (invlist_highest(cp_list) < 256) {
13156             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13157                                                            &fold_intersection);
13158         }
13159         else {
13160
13161             /* Here, there are non-Latin1 code points, so we will have to go
13162              * fetch the list of all the characters that participate in folds
13163              */
13164             if (! PL_utf8_foldable) {
13165                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13166                                        &PL_sv_undef, 1, 0);
13167                 PL_utf8_foldable = _get_swash_invlist(swash);
13168                 SvREFCNT_dec_NN(swash);
13169             }
13170
13171             /* This is a hash that for a particular fold gives all characters
13172              * that are involved in it */
13173             if (! PL_utf8_foldclosures) {
13174
13175                 /* If we were unable to find any folds, then we likely won't be
13176                  * able to find the closures.  So just create an empty list.
13177                  * Folding will effectively be restricted to the non-Unicode
13178                  * rules hard-coded into Perl.  (This case happens legitimately
13179                  * during compilation of Perl itself before the Unicode tables
13180                  * are generated) */
13181                 if (_invlist_len(PL_utf8_foldable) == 0) {
13182                     PL_utf8_foldclosures = newHV();
13183                 }
13184                 else {
13185                     /* If the folds haven't been read in, call a fold function
13186                      * to force that */
13187                     if (! PL_utf8_tofold) {
13188                         U8 dummy[UTF8_MAXBYTES+1];
13189
13190                         /* This string is just a short named one above \xff */
13191                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13192                         assert(PL_utf8_tofold); /* Verify that worked */
13193                     }
13194                     PL_utf8_foldclosures =
13195                                     _swash_inversion_hash(PL_utf8_tofold);
13196                 }
13197             }
13198
13199             /* Only the characters in this class that participate in folds need
13200              * be checked.  Get the intersection of this class and all the
13201              * possible characters that are foldable.  This can quickly narrow
13202              * down a large class */
13203             _invlist_intersection(PL_utf8_foldable, cp_list,
13204                                   &fold_intersection);
13205         }
13206
13207         /* Now look at the foldable characters in this class individually */
13208         invlist_iterinit(fold_intersection);
13209         while (invlist_iternext(fold_intersection, &start, &end)) {
13210             UV j;
13211
13212             /* Locale folding for Latin1 characters is deferred until runtime */
13213             if (LOC && start < 256) {
13214                 start = 256;
13215             }
13216
13217             /* Look at every character in the range */
13218             for (j = start; j <= end; j++) {
13219
13220                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13221                 STRLEN foldlen;
13222                 SV** listp;
13223
13224                 if (j < 256) {
13225
13226                     /* We have the latin1 folding rules hard-coded here so that
13227                      * an innocent-looking character class, like /[ks]/i won't
13228                      * have to go out to disk to find the possible matches.
13229                      * XXX It would be better to generate these via regen, in
13230                      * case a new version of the Unicode standard adds new
13231                      * mappings, though that is not really likely, and may be
13232                      * caught by the default: case of the switch below. */
13233
13234                     if (IS_IN_SOME_FOLD_L1(j)) {
13235
13236                         /* ASCII is always matched; non-ASCII is matched only
13237                          * under Unicode rules */
13238                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13239                             cp_list =
13240                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13241                         }
13242                         else {
13243                             depends_list =
13244                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13245                         }
13246                     }
13247
13248                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13249                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13250                     {
13251                         /* Certain Latin1 characters have matches outside
13252                          * Latin1.  To get here, <j> is one of those
13253                          * characters.   None of these matches is valid for
13254                          * ASCII characters under /aa, which is why the 'if'
13255                          * just above excludes those.  These matches only
13256                          * happen when the target string is utf8.  The code
13257                          * below adds the single fold closures for <j> to the
13258                          * inversion list. */
13259                         switch (j) {
13260                             case 'k':
13261                             case 'K':
13262                                 cp_list =
13263                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13264                                 break;
13265                             case 's':
13266                             case 'S':
13267                                 cp_list = add_cp_to_invlist(cp_list,
13268                                                     LATIN_SMALL_LETTER_LONG_S);
13269                                 break;
13270                             case MICRO_SIGN:
13271                                 cp_list = add_cp_to_invlist(cp_list,
13272                                                     GREEK_CAPITAL_LETTER_MU);
13273                                 cp_list = add_cp_to_invlist(cp_list,
13274                                                     GREEK_SMALL_LETTER_MU);
13275                                 break;
13276                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13277                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13278                                 cp_list =
13279                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13280                                 break;
13281                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13282                                 cp_list = add_cp_to_invlist(cp_list,
13283                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13284                                 break;
13285                             case LATIN_SMALL_LETTER_SHARP_S:
13286                                 cp_list = add_cp_to_invlist(cp_list,
13287                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13288                                 break;
13289                             case 'F': case 'f':
13290                             case 'I': case 'i':
13291                             case 'L': case 'l':
13292                             case 'T': case 't':
13293                             case 'A': case 'a':
13294                             case 'H': case 'h':
13295                             case 'J': case 'j':
13296                             case 'N': case 'n':
13297                             case 'W': case 'w':
13298                             case 'Y': case 'y':
13299                                 /* These all are targets of multi-character
13300                                  * folds from code points that require UTF8 to
13301                                  * express, so they can't match unless the
13302                                  * target string is in UTF-8, so no action here
13303                                  * is necessary, as regexec.c properly handles
13304                                  * the general case for UTF-8 matching and
13305                                  * multi-char folds */
13306                                 break;
13307                             default:
13308                                 /* Use deprecated warning to increase the
13309                                  * chances of this being output */
13310                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13311                                 break;
13312                         }
13313                     }
13314                     continue;
13315                 }
13316
13317                 /* Here is an above Latin1 character.  We don't have the rules
13318                  * hard-coded for it.  First, get its fold.  This is the simple
13319                  * fold, as the multi-character folds have been handled earlier
13320                  * and separated out */
13321                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13322                                                ((LOC)
13323                                                ? FOLD_FLAGS_LOCALE
13324                                                : (ASCII_FOLD_RESTRICTED)
13325                                                   ? FOLD_FLAGS_NOMIX_ASCII
13326                                                   : 0));
13327
13328                 /* Single character fold of above Latin1.  Add everything in
13329                  * its fold closure to the list that this node should match.
13330                  * The fold closures data structure is a hash with the keys
13331                  * being the UTF-8 of every character that is folded to, like
13332                  * 'k', and the values each an array of all code points that
13333                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13334                  * Multi-character folds are not included */
13335                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13336                                       (char *) foldbuf, foldlen, FALSE)))
13337                 {
13338                     AV* list = (AV*) *listp;
13339                     IV k;
13340                     for (k = 0; k <= av_len(list); k++) {
13341                         SV** c_p = av_fetch(list, k, FALSE);
13342                         UV c;
13343                         if (c_p == NULL) {
13344                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13345                         }
13346                         c = SvUV(*c_p);
13347
13348                         /* /aa doesn't allow folds between ASCII and non-; /l
13349                          * doesn't allow them between above and below 256 */
13350                         if ((ASCII_FOLD_RESTRICTED
13351                                   && (isASCII(c) != isASCII(j)))
13352                             || (LOC && ((c < 256) != (j < 256))))
13353                         {
13354                             continue;
13355                         }
13356
13357                         /* Folds involving non-ascii Latin1 characters
13358                          * under /d are added to a separate list */
13359                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13360                         {
13361                             cp_list = add_cp_to_invlist(cp_list, c);
13362                         }
13363                         else {
13364                           depends_list = add_cp_to_invlist(depends_list, c);
13365                         }
13366                     }
13367                 }
13368             }
13369         }
13370         SvREFCNT_dec_NN(fold_intersection);
13371     }
13372
13373     /* And combine the result (if any) with any inversion list from posix
13374      * classes.  The lists are kept separate up to now because we don't want to
13375      * fold the classes (folding of those is automatically handled by the swash
13376      * fetching code) */
13377     if (posixes) {
13378         if (! DEPENDS_SEMANTICS) {
13379             if (cp_list) {
13380                 _invlist_union(cp_list, posixes, &cp_list);
13381                 SvREFCNT_dec_NN(posixes);
13382             }
13383             else {
13384                 cp_list = posixes;
13385             }
13386         }
13387         else {
13388             /* Under /d, we put into a separate list the Latin1 things that
13389              * match only when the target string is utf8 */
13390             SV* nonascii_but_latin1_properties = NULL;
13391             _invlist_intersection(posixes, PL_Latin1,
13392                                   &nonascii_but_latin1_properties);
13393             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13394                               &nonascii_but_latin1_properties);
13395             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13396                               &posixes);
13397             if (cp_list) {
13398                 _invlist_union(cp_list, posixes, &cp_list);
13399                 SvREFCNT_dec_NN(posixes);
13400             }
13401             else {
13402                 cp_list = posixes;
13403             }
13404
13405             if (depends_list) {
13406                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13407                                &depends_list);
13408                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13409             }
13410             else {
13411                 depends_list = nonascii_but_latin1_properties;
13412             }
13413         }
13414     }
13415
13416     /* And combine the result (if any) with any inversion list from properties.
13417      * The lists are kept separate up to now so that we can distinguish the two
13418      * in regards to matching above-Unicode.  A run-time warning is generated
13419      * if a Unicode property is matched against a non-Unicode code point. But,
13420      * we allow user-defined properties to match anything, without any warning,
13421      * and we also suppress the warning if there is a portion of the character
13422      * class that isn't a Unicode property, and which matches above Unicode, \W
13423      * or [\x{110000}] for example.
13424      * (Note that in this case, unlike the Posix one above, there is no
13425      * <depends_list>, because having a Unicode property forces Unicode
13426      * semantics */
13427     if (properties) {
13428         bool warn_super = ! has_user_defined_property;
13429         if (cp_list) {
13430
13431             /* If it matters to the final outcome, see if a non-property
13432              * component of the class matches above Unicode.  If so, the
13433              * warning gets suppressed.  This is true even if just a single
13434              * such code point is specified, as though not strictly correct if
13435              * another such code point is matched against, the fact that they
13436              * are using above-Unicode code points indicates they should know
13437              * the issues involved */
13438             if (warn_super) {
13439                 bool non_prop_matches_above_Unicode =
13440                             runtime_posix_matches_above_Unicode
13441                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13442                 if (invert) {
13443                     non_prop_matches_above_Unicode =
13444                                             !  non_prop_matches_above_Unicode;
13445                 }
13446                 warn_super = ! non_prop_matches_above_Unicode;
13447             }
13448
13449             _invlist_union(properties, cp_list, &cp_list);
13450             SvREFCNT_dec_NN(properties);
13451         }
13452         else {
13453             cp_list = properties;
13454         }
13455
13456         if (warn_super) {
13457             OP(ret) = ANYOF_WARN_SUPER;
13458         }
13459     }
13460
13461     /* Here, we have calculated what code points should be in the character
13462      * class.
13463      *
13464      * Now we can see about various optimizations.  Fold calculation (which we
13465      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13466      * would invert to include K, which under /i would match k, which it
13467      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13468      * folded until runtime */
13469
13470     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13471      * at compile time.  Besides not inverting folded locale now, we can't
13472      * invert if there are things such as \w, which aren't known until runtime
13473      * */
13474     if (invert
13475         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13476         && ! depends_list
13477         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13478     {
13479         _invlist_invert(cp_list);
13480
13481         /* Any swash can't be used as-is, because we've inverted things */
13482         if (swash) {
13483             SvREFCNT_dec_NN(swash);
13484             swash = NULL;
13485         }
13486
13487         /* Clear the invert flag since have just done it here */
13488         invert = FALSE;
13489     }
13490
13491     if (ret_invlist) {
13492         *ret_invlist = cp_list;
13493
13494         /* Discard the generated node */
13495         if (SIZE_ONLY) {
13496             RExC_size = orig_size;
13497         }
13498         else {
13499             RExC_emit = orig_emit;
13500         }
13501         return END;
13502     }
13503
13504     /* If we didn't do folding, it's because some information isn't available
13505      * until runtime; set the run-time fold flag for these.  (We don't have to
13506      * worry about properties folding, as that is taken care of by the swash
13507      * fetching) */
13508     if (FOLD && LOC)
13509     {
13510        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13511     }
13512
13513     /* Some character classes are equivalent to other nodes.  Such nodes take
13514      * up less room and generally fewer operations to execute than ANYOF nodes.
13515      * Above, we checked for and optimized into some such equivalents for
13516      * certain common classes that are easy to test.  Getting to this point in
13517      * the code means that the class didn't get optimized there.  Since this
13518      * code is only executed in Pass 2, it is too late to save space--it has
13519      * been allocated in Pass 1, and currently isn't given back.  But turning
13520      * things into an EXACTish node can allow the optimizer to join it to any
13521      * adjacent such nodes.  And if the class is equivalent to things like /./,
13522      * expensive run-time swashes can be avoided.  Now that we have more
13523      * complete information, we can find things necessarily missed by the
13524      * earlier code.  I (khw) am not sure how much to look for here.  It would
13525      * be easy, but perhaps too slow, to check any candidates against all the
13526      * node types they could possibly match using _invlistEQ(). */
13527
13528     if (cp_list
13529         && ! invert
13530         && ! depends_list
13531         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13532         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13533     {
13534         UV start, end;
13535         U8 op = END;  /* The optimzation node-type */
13536         const char * cur_parse= RExC_parse;
13537
13538         invlist_iterinit(cp_list);
13539         if (! invlist_iternext(cp_list, &start, &end)) {
13540
13541             /* Here, the list is empty.  This happens, for example, when a
13542              * Unicode property is the only thing in the character class, and
13543              * it doesn't match anything.  (perluniprops.pod notes such
13544              * properties) */
13545             op = OPFAIL;
13546             *flagp |= HASWIDTH|SIMPLE;
13547         }
13548         else if (start == end) {    /* The range is a single code point */
13549             if (! invlist_iternext(cp_list, &start, &end)
13550
13551                     /* Don't do this optimization if it would require changing
13552                      * the pattern to UTF-8 */
13553                 && (start < 256 || UTF))
13554             {
13555                 /* Here, the list contains a single code point.  Can optimize
13556                  * into an EXACT node */
13557
13558                 value = start;
13559
13560                 if (! FOLD) {
13561                     op = EXACT;
13562                 }
13563                 else if (LOC) {
13564
13565                     /* A locale node under folding with one code point can be
13566                      * an EXACTFL, as its fold won't be calculated until
13567                      * runtime */
13568                     op = EXACTFL;
13569                 }
13570                 else {
13571
13572                     /* Here, we are generally folding, but there is only one
13573                      * code point to match.  If we have to, we use an EXACT
13574                      * node, but it would be better for joining with adjacent
13575                      * nodes in the optimization pass if we used the same
13576                      * EXACTFish node that any such are likely to be.  We can
13577                      * do this iff the code point doesn't participate in any
13578                      * folds.  For example, an EXACTF of a colon is the same as
13579                      * an EXACT one, since nothing folds to or from a colon. */
13580                     if (value < 256) {
13581                         if (IS_IN_SOME_FOLD_L1(value)) {
13582                             op = EXACT;
13583                         }
13584                     }
13585                     else {
13586                         if (! PL_utf8_foldable) {
13587                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13588                                                 &PL_sv_undef, 1, 0);
13589                             PL_utf8_foldable = _get_swash_invlist(swash);
13590                             SvREFCNT_dec_NN(swash);
13591                         }
13592                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13593                             op = EXACT;
13594                         }
13595                     }
13596
13597                     /* If we haven't found the node type, above, it means we
13598                      * can use the prevailing one */
13599                     if (op == END) {
13600                         op = compute_EXACTish(pRExC_state);
13601                     }
13602                 }
13603             }
13604         }
13605         else if (start == 0) {
13606             if (end == UV_MAX) {
13607                 op = SANY;
13608                 *flagp |= HASWIDTH|SIMPLE;
13609                 RExC_naughty++;
13610             }
13611             else if (end == '\n' - 1
13612                     && invlist_iternext(cp_list, &start, &end)
13613                     && start == '\n' + 1 && end == UV_MAX)
13614             {
13615                 op = REG_ANY;
13616                 *flagp |= HASWIDTH|SIMPLE;
13617                 RExC_naughty++;
13618             }
13619         }
13620         invlist_iterfinish(cp_list);
13621
13622         if (op != END) {
13623             RExC_parse = (char *)orig_parse;
13624             RExC_emit = (regnode *)orig_emit;
13625
13626             ret = reg_node(pRExC_state, op);
13627
13628             RExC_parse = (char *)cur_parse;
13629
13630             if (PL_regkind[op] == EXACT) {
13631                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13632             }
13633
13634             SvREFCNT_dec_NN(cp_list);
13635             SvREFCNT_dec_NN(listsv);
13636             return ret;
13637         }
13638     }
13639
13640     /* Here, <cp_list> contains all the code points we can determine at
13641      * compile time that match under all conditions.  Go through it, and
13642      * for things that belong in the bitmap, put them there, and delete from
13643      * <cp_list>.  While we are at it, see if everything above 255 is in the
13644      * list, and if so, set a flag to speed up execution */
13645     ANYOF_BITMAP_ZERO(ret);
13646     if (cp_list) {
13647
13648         /* This gets set if we actually need to modify things */
13649         bool change_invlist = FALSE;
13650
13651         UV start, end;
13652
13653         /* Start looking through <cp_list> */
13654         invlist_iterinit(cp_list);
13655         while (invlist_iternext(cp_list, &start, &end)) {
13656             UV high;
13657             int i;
13658
13659             if (end == UV_MAX && start <= 256) {
13660                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13661             }
13662
13663             /* Quit if are above what we should change */
13664             if (start > 255) {
13665                 break;
13666             }
13667
13668             change_invlist = TRUE;
13669
13670             /* Set all the bits in the range, up to the max that we are doing */
13671             high = (end < 255) ? end : 255;
13672             for (i = start; i <= (int) high; i++) {
13673                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13674                     ANYOF_BITMAP_SET(ret, i);
13675                     prevvalue = value;
13676                     value = i;
13677                 }
13678             }
13679         }
13680         invlist_iterfinish(cp_list);
13681
13682         /* Done with loop; remove any code points that are in the bitmap from
13683          * <cp_list> */
13684         if (change_invlist) {
13685             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13686         }
13687
13688         /* If have completely emptied it, remove it completely */
13689         if (_invlist_len(cp_list) == 0) {
13690             SvREFCNT_dec_NN(cp_list);
13691             cp_list = NULL;
13692         }
13693     }
13694
13695     if (invert) {
13696         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13697     }
13698
13699     /* Here, the bitmap has been populated with all the Latin1 code points that
13700      * always match.  Can now add to the overall list those that match only
13701      * when the target string is UTF-8 (<depends_list>). */
13702     if (depends_list) {
13703         if (cp_list) {
13704             _invlist_union(cp_list, depends_list, &cp_list);
13705             SvREFCNT_dec_NN(depends_list);
13706         }
13707         else {
13708             cp_list = depends_list;
13709         }
13710     }
13711
13712     /* If there is a swash and more than one element, we can't use the swash in
13713      * the optimization below. */
13714     if (swash && element_count > 1) {
13715         SvREFCNT_dec_NN(swash);
13716         swash = NULL;
13717     }
13718
13719     if (! cp_list
13720         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13721     {
13722         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13723         SvREFCNT_dec_NN(listsv);
13724     }
13725     else {
13726         /* av[0] stores the character class description in its textual form:
13727          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13728          *       appropriate swash, and is also useful for dumping the regnode.
13729          * av[1] if NULL, is a placeholder to later contain the swash computed
13730          *       from av[0].  But if no further computation need be done, the
13731          *       swash is stored there now.
13732          * av[2] stores the cp_list inversion list for use in addition or
13733          *       instead of av[0]; used only if av[1] is NULL
13734          * av[3] is set if any component of the class is from a user-defined
13735          *       property; used only if av[1] is NULL */
13736         AV * const av = newAV();
13737         SV *rv;
13738
13739         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13740                         ? listsv
13741                         : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13742         if (swash) {
13743             av_store(av, 1, swash);
13744             SvREFCNT_dec_NN(cp_list);
13745         }
13746         else {
13747             av_store(av, 1, NULL);
13748             if (cp_list) {
13749                 av_store(av, 2, cp_list);
13750                 av_store(av, 3, newSVuv(has_user_defined_property));
13751             }
13752         }
13753
13754         rv = newRV_noinc(MUTABLE_SV(av));
13755         n = add_data(pRExC_state, 1, "s");
13756         RExC_rxi->data->data[n] = (void*)rv;
13757         ARG_SET(ret, n);
13758     }
13759
13760     *flagp |= HASWIDTH|SIMPLE;
13761     return ret;
13762 }
13763 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13764
13765
13766 /* reg_skipcomment()
13767
13768    Absorbs an /x style # comments from the input stream.
13769    Returns true if there is more text remaining in the stream.
13770    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13771    terminates the pattern without including a newline.
13772
13773    Note its the callers responsibility to ensure that we are
13774    actually in /x mode
13775
13776 */
13777
13778 STATIC bool
13779 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13780 {
13781     bool ended = 0;
13782
13783     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13784
13785     while (RExC_parse < RExC_end)
13786         if (*RExC_parse++ == '\n') {
13787             ended = 1;
13788             break;
13789         }
13790     if (!ended) {
13791         /* we ran off the end of the pattern without ending
13792            the comment, so we have to add an \n when wrapping */
13793         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13794         return 0;
13795     } else
13796         return 1;
13797 }
13798
13799 /* nextchar()
13800
13801    Advances the parse position, and optionally absorbs
13802    "whitespace" from the inputstream.
13803
13804    Without /x "whitespace" means (?#...) style comments only,
13805    with /x this means (?#...) and # comments and whitespace proper.
13806
13807    Returns the RExC_parse point from BEFORE the scan occurs.
13808
13809    This is the /x friendly way of saying RExC_parse++.
13810 */
13811
13812 STATIC char*
13813 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13814 {
13815     char* const retval = RExC_parse++;
13816
13817     PERL_ARGS_ASSERT_NEXTCHAR;
13818
13819     for (;;) {
13820         if (RExC_end - RExC_parse >= 3
13821             && *RExC_parse == '('
13822             && RExC_parse[1] == '?'
13823             && RExC_parse[2] == '#')
13824         {
13825             while (*RExC_parse != ')') {
13826                 if (RExC_parse == RExC_end)
13827                     FAIL("Sequence (?#... not terminated");
13828                 RExC_parse++;
13829             }
13830             RExC_parse++;
13831             continue;
13832         }
13833         if (RExC_flags & RXf_PMf_EXTENDED) {
13834             if (isSPACE(*RExC_parse)) {
13835                 RExC_parse++;
13836                 continue;
13837             }
13838             else if (*RExC_parse == '#') {
13839                 if ( reg_skipcomment( pRExC_state ) )
13840                     continue;
13841             }
13842         }
13843         return retval;
13844     }
13845 }
13846
13847 /*
13848 - reg_node - emit a node
13849 */
13850 STATIC regnode *                        /* Location. */
13851 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13852 {
13853     dVAR;
13854     regnode *ptr;
13855     regnode * const ret = RExC_emit;
13856     GET_RE_DEBUG_FLAGS_DECL;
13857
13858     PERL_ARGS_ASSERT_REG_NODE;
13859
13860     if (SIZE_ONLY) {
13861         SIZE_ALIGN(RExC_size);
13862         RExC_size += 1;
13863         return(ret);
13864     }
13865     if (RExC_emit >= RExC_emit_bound)
13866         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13867                    op, RExC_emit, RExC_emit_bound);
13868
13869     NODE_ALIGN_FILL(ret);
13870     ptr = ret;
13871     FILL_ADVANCE_NODE(ptr, op);
13872 #ifdef RE_TRACK_PATTERN_OFFSETS
13873     if (RExC_offsets) {         /* MJD */
13874         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13875               "reg_node", __LINE__,
13876               PL_reg_name[op],
13877               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13878                 ? "Overwriting end of array!\n" : "OK",
13879               (UV)(RExC_emit - RExC_emit_start),
13880               (UV)(RExC_parse - RExC_start),
13881               (UV)RExC_offsets[0]));
13882         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13883     }
13884 #endif
13885     RExC_emit = ptr;
13886     return(ret);
13887 }
13888
13889 /*
13890 - reganode - emit a node with an argument
13891 */
13892 STATIC regnode *                        /* Location. */
13893 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13894 {
13895     dVAR;
13896     regnode *ptr;
13897     regnode * const ret = RExC_emit;
13898     GET_RE_DEBUG_FLAGS_DECL;
13899
13900     PERL_ARGS_ASSERT_REGANODE;
13901
13902     if (SIZE_ONLY) {
13903         SIZE_ALIGN(RExC_size);
13904         RExC_size += 2;
13905         /*
13906            We can't do this:
13907
13908            assert(2==regarglen[op]+1);
13909
13910            Anything larger than this has to allocate the extra amount.
13911            If we changed this to be:
13912
13913            RExC_size += (1 + regarglen[op]);
13914
13915            then it wouldn't matter. Its not clear what side effect
13916            might come from that so its not done so far.
13917            -- dmq
13918         */
13919         return(ret);
13920     }
13921     if (RExC_emit >= RExC_emit_bound)
13922         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13923                    op, RExC_emit, RExC_emit_bound);
13924
13925     NODE_ALIGN_FILL(ret);
13926     ptr = ret;
13927     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13928 #ifdef RE_TRACK_PATTERN_OFFSETS
13929     if (RExC_offsets) {         /* MJD */
13930         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13931               "reganode",
13932               __LINE__,
13933               PL_reg_name[op],
13934               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13935               "Overwriting end of array!\n" : "OK",
13936               (UV)(RExC_emit - RExC_emit_start),
13937               (UV)(RExC_parse - RExC_start),
13938               (UV)RExC_offsets[0]));
13939         Set_Cur_Node_Offset;
13940     }
13941 #endif
13942     RExC_emit = ptr;
13943     return(ret);
13944 }
13945
13946 /*
13947 - reguni - emit (if appropriate) a Unicode character
13948 */
13949 STATIC STRLEN
13950 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13951 {
13952     dVAR;
13953
13954     PERL_ARGS_ASSERT_REGUNI;
13955
13956     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13957 }
13958
13959 /*
13960 - reginsert - insert an operator in front of already-emitted operand
13961 *
13962 * Means relocating the operand.
13963 */
13964 STATIC void
13965 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13966 {
13967     dVAR;
13968     regnode *src;
13969     regnode *dst;
13970     regnode *place;
13971     const int offset = regarglen[(U8)op];
13972     const int size = NODE_STEP_REGNODE + offset;
13973     GET_RE_DEBUG_FLAGS_DECL;
13974
13975     PERL_ARGS_ASSERT_REGINSERT;
13976     PERL_UNUSED_ARG(depth);
13977 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13978     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13979     if (SIZE_ONLY) {
13980         RExC_size += size;
13981         return;
13982     }
13983
13984     src = RExC_emit;
13985     RExC_emit += size;
13986     dst = RExC_emit;
13987     if (RExC_open_parens) {
13988         int paren;
13989         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13990         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13991             if ( RExC_open_parens[paren] >= opnd ) {
13992                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13993                 RExC_open_parens[paren] += size;
13994             } else {
13995                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13996             }
13997             if ( RExC_close_parens[paren] >= opnd ) {
13998                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13999                 RExC_close_parens[paren] += size;
14000             } else {
14001                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14002             }
14003         }
14004     }
14005
14006     while (src > opnd) {
14007         StructCopy(--src, --dst, regnode);
14008 #ifdef RE_TRACK_PATTERN_OFFSETS
14009         if (RExC_offsets) {     /* MJD 20010112 */
14010             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14011                   "reg_insert",
14012                   __LINE__,
14013                   PL_reg_name[op],
14014                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14015                     ? "Overwriting end of array!\n" : "OK",
14016                   (UV)(src - RExC_emit_start),
14017                   (UV)(dst - RExC_emit_start),
14018                   (UV)RExC_offsets[0]));
14019             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14020             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14021         }
14022 #endif
14023     }
14024
14025
14026     place = opnd;               /* Op node, where operand used to be. */
14027 #ifdef RE_TRACK_PATTERN_OFFSETS
14028     if (RExC_offsets) {         /* MJD */
14029         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14030               "reginsert",
14031               __LINE__,
14032               PL_reg_name[op],
14033               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14034               ? "Overwriting end of array!\n" : "OK",
14035               (UV)(place - RExC_emit_start),
14036               (UV)(RExC_parse - RExC_start),
14037               (UV)RExC_offsets[0]));
14038         Set_Node_Offset(place, RExC_parse);
14039         Set_Node_Length(place, 1);
14040     }
14041 #endif
14042     src = NEXTOPER(place);
14043     FILL_ADVANCE_NODE(place, op);
14044     Zero(src, offset, regnode);
14045 }
14046
14047 /*
14048 - regtail - set the next-pointer at the end of a node chain of p to val.
14049 - SEE ALSO: regtail_study
14050 */
14051 /* TODO: All three parms should be const */
14052 STATIC void
14053 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14054 {
14055     dVAR;
14056     regnode *scan;
14057     GET_RE_DEBUG_FLAGS_DECL;
14058
14059     PERL_ARGS_ASSERT_REGTAIL;
14060 #ifndef DEBUGGING
14061     PERL_UNUSED_ARG(depth);
14062 #endif
14063
14064     if (SIZE_ONLY)
14065         return;
14066
14067     /* Find last node. */
14068     scan = p;
14069     for (;;) {
14070         regnode * const temp = regnext(scan);
14071         DEBUG_PARSE_r({
14072             SV * const mysv=sv_newmortal();
14073             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14074             regprop(RExC_rx, mysv, scan);
14075             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14076                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14077                     (temp == NULL ? "->" : ""),
14078                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14079             );
14080         });
14081         if (temp == NULL)
14082             break;
14083         scan = temp;
14084     }
14085
14086     if (reg_off_by_arg[OP(scan)]) {
14087         ARG_SET(scan, val - scan);
14088     }
14089     else {
14090         NEXT_OFF(scan) = val - scan;
14091     }
14092 }
14093
14094 #ifdef DEBUGGING
14095 /*
14096 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14097 - Look for optimizable sequences at the same time.
14098 - currently only looks for EXACT chains.
14099
14100 This is experimental code. The idea is to use this routine to perform
14101 in place optimizations on branches and groups as they are constructed,
14102 with the long term intention of removing optimization from study_chunk so
14103 that it is purely analytical.
14104
14105 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14106 to control which is which.
14107
14108 */
14109 /* TODO: All four parms should be const */
14110
14111 STATIC U8
14112 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14113 {
14114     dVAR;
14115     regnode *scan;
14116     U8 exact = PSEUDO;
14117 #ifdef EXPERIMENTAL_INPLACESCAN
14118     I32 min = 0;
14119 #endif
14120     GET_RE_DEBUG_FLAGS_DECL;
14121
14122     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14123
14124
14125     if (SIZE_ONLY)
14126         return exact;
14127
14128     /* Find last node. */
14129
14130     scan = p;
14131     for (;;) {
14132         regnode * const temp = regnext(scan);
14133 #ifdef EXPERIMENTAL_INPLACESCAN
14134         if (PL_regkind[OP(scan)] == EXACT) {
14135             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14136             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14137                 return EXACT;
14138         }
14139 #endif
14140         if ( exact ) {
14141             switch (OP(scan)) {
14142                 case EXACT:
14143                 case EXACTF:
14144                 case EXACTFA:
14145                 case EXACTFU:
14146                 case EXACTFU_SS:
14147                 case EXACTFU_TRICKYFOLD:
14148                 case EXACTFL:
14149                         if( exact == PSEUDO )
14150                             exact= OP(scan);
14151                         else if ( exact != OP(scan) )
14152                             exact= 0;
14153                 case NOTHING:
14154                     break;
14155                 default:
14156                     exact= 0;
14157             }
14158         }
14159         DEBUG_PARSE_r({
14160             SV * const mysv=sv_newmortal();
14161             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14162             regprop(RExC_rx, mysv, scan);
14163             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14164                 SvPV_nolen_const(mysv),
14165                 REG_NODE_NUM(scan),
14166                 PL_reg_name[exact]);
14167         });
14168         if (temp == NULL)
14169             break;
14170         scan = temp;
14171     }
14172     DEBUG_PARSE_r({
14173         SV * const mysv_val=sv_newmortal();
14174         DEBUG_PARSE_MSG("");
14175         regprop(RExC_rx, mysv_val, val);
14176         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14177                       SvPV_nolen_const(mysv_val),
14178                       (IV)REG_NODE_NUM(val),
14179                       (IV)(val - scan)
14180         );
14181     });
14182     if (reg_off_by_arg[OP(scan)]) {
14183         ARG_SET(scan, val - scan);
14184     }
14185     else {
14186         NEXT_OFF(scan) = val - scan;
14187     }
14188
14189     return exact;
14190 }
14191 #endif
14192
14193 /*
14194  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14195  */
14196 #ifdef DEBUGGING
14197 static void
14198 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14199 {
14200     int bit;
14201     int set=0;
14202     regex_charset cs;
14203
14204     for (bit=0; bit<32; bit++) {
14205         if (flags & (1<<bit)) {
14206             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14207                 continue;
14208             }
14209             if (!set++ && lead)
14210                 PerlIO_printf(Perl_debug_log, "%s",lead);
14211             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14212         }
14213     }
14214     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14215             if (!set++ && lead) {
14216                 PerlIO_printf(Perl_debug_log, "%s",lead);
14217             }
14218             switch (cs) {
14219                 case REGEX_UNICODE_CHARSET:
14220                     PerlIO_printf(Perl_debug_log, "UNICODE");
14221                     break;
14222                 case REGEX_LOCALE_CHARSET:
14223                     PerlIO_printf(Perl_debug_log, "LOCALE");
14224                     break;
14225                 case REGEX_ASCII_RESTRICTED_CHARSET:
14226                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14227                     break;
14228                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14229                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14230                     break;
14231                 default:
14232                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14233                     break;
14234             }
14235     }
14236     if (lead)  {
14237         if (set)
14238             PerlIO_printf(Perl_debug_log, "\n");
14239         else
14240             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14241     }
14242 }
14243 #endif
14244
14245 void
14246 Perl_regdump(pTHX_ const regexp *r)
14247 {
14248 #ifdef DEBUGGING
14249     dVAR;
14250     SV * const sv = sv_newmortal();
14251     SV *dsv= sv_newmortal();
14252     RXi_GET_DECL(r,ri);
14253     GET_RE_DEBUG_FLAGS_DECL;
14254
14255     PERL_ARGS_ASSERT_REGDUMP;
14256
14257     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14258
14259     /* Header fields of interest. */
14260     if (r->anchored_substr) {
14261         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14262             RE_SV_DUMPLEN(r->anchored_substr), 30);
14263         PerlIO_printf(Perl_debug_log,
14264                       "anchored %s%s at %"IVdf" ",
14265                       s, RE_SV_TAIL(r->anchored_substr),
14266                       (IV)r->anchored_offset);
14267     } else if (r->anchored_utf8) {
14268         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14269             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14270         PerlIO_printf(Perl_debug_log,
14271                       "anchored utf8 %s%s at %"IVdf" ",
14272                       s, RE_SV_TAIL(r->anchored_utf8),
14273                       (IV)r->anchored_offset);
14274     }
14275     if (r->float_substr) {
14276         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14277             RE_SV_DUMPLEN(r->float_substr), 30);
14278         PerlIO_printf(Perl_debug_log,
14279                       "floating %s%s at %"IVdf"..%"UVuf" ",
14280                       s, RE_SV_TAIL(r->float_substr),
14281                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14282     } else if (r->float_utf8) {
14283         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14284             RE_SV_DUMPLEN(r->float_utf8), 30);
14285         PerlIO_printf(Perl_debug_log,
14286                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14287                       s, RE_SV_TAIL(r->float_utf8),
14288                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14289     }
14290     if (r->check_substr || r->check_utf8)
14291         PerlIO_printf(Perl_debug_log,
14292                       (const char *)
14293                       (r->check_substr == r->float_substr
14294                        && r->check_utf8 == r->float_utf8
14295                        ? "(checking floating" : "(checking anchored"));
14296     if (r->extflags & RXf_NOSCAN)
14297         PerlIO_printf(Perl_debug_log, " noscan");
14298     if (r->extflags & RXf_CHECK_ALL)
14299         PerlIO_printf(Perl_debug_log, " isall");
14300     if (r->check_substr || r->check_utf8)
14301         PerlIO_printf(Perl_debug_log, ") ");
14302
14303     if (ri->regstclass) {
14304         regprop(r, sv, ri->regstclass);
14305         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14306     }
14307     if (r->extflags & RXf_ANCH) {
14308         PerlIO_printf(Perl_debug_log, "anchored");
14309         if (r->extflags & RXf_ANCH_BOL)
14310             PerlIO_printf(Perl_debug_log, "(BOL)");
14311         if (r->extflags & RXf_ANCH_MBOL)
14312             PerlIO_printf(Perl_debug_log, "(MBOL)");
14313         if (r->extflags & RXf_ANCH_SBOL)
14314             PerlIO_printf(Perl_debug_log, "(SBOL)");
14315         if (r->extflags & RXf_ANCH_GPOS)
14316             PerlIO_printf(Perl_debug_log, "(GPOS)");
14317         PerlIO_putc(Perl_debug_log, ' ');
14318     }
14319     if (r->extflags & RXf_GPOS_SEEN)
14320         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14321     if (r->intflags & PREGf_SKIP)
14322         PerlIO_printf(Perl_debug_log, "plus ");
14323     if (r->intflags & PREGf_IMPLICIT)
14324         PerlIO_printf(Perl_debug_log, "implicit ");
14325     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14326     if (r->extflags & RXf_EVAL_SEEN)
14327         PerlIO_printf(Perl_debug_log, "with eval ");
14328     PerlIO_printf(Perl_debug_log, "\n");
14329     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14330 #else
14331     PERL_ARGS_ASSERT_REGDUMP;
14332     PERL_UNUSED_CONTEXT;
14333     PERL_UNUSED_ARG(r);
14334 #endif  /* DEBUGGING */
14335 }
14336
14337 /*
14338 - regprop - printable representation of opcode
14339 */
14340 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14341 STMT_START { \
14342         if (do_sep) {                           \
14343             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14344             if (flags & ANYOF_INVERT)           \
14345                 /*make sure the invert info is in each */ \
14346                 sv_catpvs(sv, "^");             \
14347             do_sep = 0;                         \
14348         }                                       \
14349 } STMT_END
14350
14351 void
14352 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14353 {
14354 #ifdef DEBUGGING
14355     dVAR;
14356     int k;
14357
14358     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14359     static const char * const anyofs[] = {
14360 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14361     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14362     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14363     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14364     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14365     || _CC_VERTSPACE != 16
14366   #error Need to adjust order of anyofs[]
14367 #endif
14368         "[\\w]",
14369         "[\\W]",
14370         "[\\d]",
14371         "[\\D]",
14372         "[:alpha:]",
14373         "[:^alpha:]",
14374         "[:lower:]",
14375         "[:^lower:]",
14376         "[:upper:]",
14377         "[:^upper:]",
14378         "[:punct:]",
14379         "[:^punct:]",
14380         "[:print:]",
14381         "[:^print:]",
14382         "[:alnum:]",
14383         "[:^alnum:]",
14384         "[:graph:]",
14385         "[:^graph:]",
14386         "[:cased:]",
14387         "[:^cased:]",
14388         "[\\s]",
14389         "[\\S]",
14390         "[:blank:]",
14391         "[:^blank:]",
14392         "[:xdigit:]",
14393         "[:^xdigit:]",
14394         "[:space:]",
14395         "[:^space:]",
14396         "[:cntrl:]",
14397         "[:^cntrl:]",
14398         "[:ascii:]",
14399         "[:^ascii:]",
14400         "[\\v]",
14401         "[\\V]"
14402     };
14403     RXi_GET_DECL(prog,progi);
14404     GET_RE_DEBUG_FLAGS_DECL;
14405
14406     PERL_ARGS_ASSERT_REGPROP;
14407
14408     sv_setpvs(sv, "");
14409
14410     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14411         /* It would be nice to FAIL() here, but this may be called from
14412            regexec.c, and it would be hard to supply pRExC_state. */
14413         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14414     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14415
14416     k = PL_regkind[OP(o)];
14417
14418     if (k == EXACT) {
14419         sv_catpvs(sv, " ");
14420         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14421          * is a crude hack but it may be the best for now since
14422          * we have no flag "this EXACTish node was UTF-8"
14423          * --jhi */
14424         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14425                   PERL_PV_ESCAPE_UNI_DETECT |
14426                   PERL_PV_ESCAPE_NONASCII   |
14427                   PERL_PV_PRETTY_ELLIPSES   |
14428                   PERL_PV_PRETTY_LTGT       |
14429                   PERL_PV_PRETTY_NOCLEAR
14430                   );
14431     } else if (k == TRIE) {
14432         /* print the details of the trie in dumpuntil instead, as
14433          * progi->data isn't available here */
14434         const char op = OP(o);
14435         const U32 n = ARG(o);
14436         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14437                (reg_ac_data *)progi->data->data[n] :
14438                NULL;
14439         const reg_trie_data * const trie
14440             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14441
14442         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14443         DEBUG_TRIE_COMPILE_r(
14444             Perl_sv_catpvf(aTHX_ sv,
14445                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14446                 (UV)trie->startstate,
14447                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14448                 (UV)trie->wordcount,
14449                 (UV)trie->minlen,
14450                 (UV)trie->maxlen,
14451                 (UV)TRIE_CHARCOUNT(trie),
14452                 (UV)trie->uniquecharcount
14453             )
14454         );
14455         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14456             int i;
14457             int rangestart = -1;
14458             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14459             sv_catpvs(sv, "[");
14460             for (i = 0; i <= 256; i++) {
14461                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14462                     if (rangestart == -1)
14463                         rangestart = i;
14464                 } else if (rangestart != -1) {
14465                     if (i <= rangestart + 3)
14466                         for (; rangestart < i; rangestart++)
14467                             put_byte(sv, rangestart);
14468                     else {
14469                         put_byte(sv, rangestart);
14470                         sv_catpvs(sv, "-");
14471                         put_byte(sv, i - 1);
14472                     }
14473                     rangestart = -1;
14474                 }
14475             }
14476             sv_catpvs(sv, "]");
14477         }
14478
14479     } else if (k == CURLY) {
14480         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14481             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14482         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14483     }
14484     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14485         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14486     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14487         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14488         if ( RXp_PAREN_NAMES(prog) ) {
14489             if ( k != REF || (OP(o) < NREF)) {
14490                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14491                 SV **name= av_fetch(list, ARG(o), 0 );
14492                 if (name)
14493                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14494             }
14495             else {
14496                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14497                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14498                 I32 *nums=(I32*)SvPVX(sv_dat);
14499                 SV **name= av_fetch(list, nums[0], 0 );
14500                 I32 n;
14501                 if (name) {
14502                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14503                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14504                                     (n ? "," : ""), (IV)nums[n]);
14505                     }
14506                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14507                 }
14508             }
14509         }
14510     } else if (k == GOSUB)
14511         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14512     else if (k == VERB) {
14513         if (!o->flags)
14514             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14515                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14516     } else if (k == LOGICAL)
14517         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14518     else if (k == ANYOF) {
14519         int i, rangestart = -1;
14520         const U8 flags = ANYOF_FLAGS(o);
14521         int do_sep = 0;
14522
14523
14524         if (flags & ANYOF_LOCALE)
14525             sv_catpvs(sv, "{loc}");
14526         if (flags & ANYOF_LOC_FOLD)
14527             sv_catpvs(sv, "{i}");
14528         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14529         if (flags & ANYOF_INVERT)
14530             sv_catpvs(sv, "^");
14531
14532         /* output what the standard cp 0-255 bitmap matches */
14533         for (i = 0; i <= 256; i++) {
14534             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14535                 if (rangestart == -1)
14536                     rangestart = i;
14537             } else if (rangestart != -1) {
14538                 if (i <= rangestart + 3)
14539                     for (; rangestart < i; rangestart++)
14540                         put_byte(sv, rangestart);
14541                 else {
14542                     put_byte(sv, rangestart);
14543                     sv_catpvs(sv, "-");
14544                     put_byte(sv, i - 1);
14545                 }
14546                 do_sep = 1;
14547                 rangestart = -1;
14548             }
14549         }
14550
14551         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14552         /* output any special charclass tests (used entirely under use locale) */
14553         if (ANYOF_CLASS_TEST_ANY_SET(o))
14554             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14555                 if (ANYOF_CLASS_TEST(o,i)) {
14556                     sv_catpv(sv, anyofs[i]);
14557                     do_sep = 1;
14558                 }
14559
14560         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14561
14562         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14563             sv_catpvs(sv, "{non-utf8-latin1-all}");
14564         }
14565
14566         /* output information about the unicode matching */
14567         if (flags & ANYOF_UNICODE_ALL)
14568             sv_catpvs(sv, "{unicode_all}");
14569         else if (ANYOF_NONBITMAP(o))
14570             sv_catpvs(sv, "{unicode}");
14571         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14572             sv_catpvs(sv, "{outside bitmap}");
14573
14574         if (ANYOF_NONBITMAP(o)) {
14575             SV *lv; /* Set if there is something outside the bit map */
14576             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14577             bool byte_output = FALSE;   /* If something in the bitmap has been
14578                                            output */
14579
14580             if (lv && lv != &PL_sv_undef) {
14581                 if (sw) {
14582                     U8 s[UTF8_MAXBYTES_CASE+1];
14583
14584                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14585                         uvchr_to_utf8(s, i);
14586
14587                         if (i < 256
14588                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14589                                                                things already
14590                                                                output as part
14591                                                                of the bitmap */
14592                             && swash_fetch(sw, s, TRUE))
14593                         {
14594                             if (rangestart == -1)
14595                                 rangestart = i;
14596                         } else if (rangestart != -1) {
14597                             byte_output = TRUE;
14598                             if (i <= rangestart + 3)
14599                                 for (; rangestart < i; rangestart++) {
14600                                     put_byte(sv, rangestart);
14601                                 }
14602                             else {
14603                                 put_byte(sv, rangestart);
14604                                 sv_catpvs(sv, "-");
14605                                 put_byte(sv, i-1);
14606                             }
14607                             rangestart = -1;
14608                         }
14609                     }
14610                 }
14611
14612                 {
14613                     char *s = savesvpv(lv);
14614                     char * const origs = s;
14615
14616                     while (*s && *s != '\n')
14617                         s++;
14618
14619                     if (*s == '\n') {
14620                         const char * const t = ++s;
14621
14622                         if (byte_output) {
14623                             sv_catpvs(sv, " ");
14624                         }
14625
14626                         while (*s) {
14627                             if (*s == '\n') {
14628
14629                                 /* Truncate very long output */
14630                                 if (s - origs > 256) {
14631                                     Perl_sv_catpvf(aTHX_ sv,
14632                                                    "%.*s...",
14633                                                    (int) (s - origs - 1),
14634                                                    t);
14635                                     goto out_dump;
14636                                 }
14637                                 *s = ' ';
14638                             }
14639                             else if (*s == '\t') {
14640                                 *s = '-';
14641                             }
14642                             s++;
14643                         }
14644                         if (s[-1] == ' ')
14645                             s[-1] = 0;
14646
14647                         sv_catpv(sv, t);
14648                     }
14649
14650                 out_dump:
14651
14652                     Safefree(origs);
14653                 }
14654                 SvREFCNT_dec_NN(lv);
14655             }
14656         }
14657
14658         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14659     }
14660     else if (k == POSIXD || k == NPOSIXD) {
14661         U8 index = FLAGS(o) * 2;
14662         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14663             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14664         }
14665         else {
14666             sv_catpv(sv, anyofs[index]);
14667         }
14668     }
14669     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14670         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14671 #else
14672     PERL_UNUSED_CONTEXT;
14673     PERL_UNUSED_ARG(sv);
14674     PERL_UNUSED_ARG(o);
14675     PERL_UNUSED_ARG(prog);
14676 #endif  /* DEBUGGING */
14677 }
14678
14679 SV *
14680 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14681 {                               /* Assume that RE_INTUIT is set */
14682     dVAR;
14683     struct regexp *const prog = ReANY(r);
14684     GET_RE_DEBUG_FLAGS_DECL;
14685
14686     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14687     PERL_UNUSED_CONTEXT;
14688
14689     DEBUG_COMPILE_r(
14690         {
14691             const char * const s = SvPV_nolen_const(prog->check_substr
14692                       ? prog->check_substr : prog->check_utf8);
14693
14694             if (!PL_colorset) reginitcolors();
14695             PerlIO_printf(Perl_debug_log,
14696                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14697                       PL_colors[4],
14698                       prog->check_substr ? "" : "utf8 ",
14699                       PL_colors[5],PL_colors[0],
14700                       s,
14701                       PL_colors[1],
14702                       (strlen(s) > 60 ? "..." : ""));
14703         } );
14704
14705     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14706 }
14707
14708 /*
14709    pregfree()
14710
14711    handles refcounting and freeing the perl core regexp structure. When
14712    it is necessary to actually free the structure the first thing it
14713    does is call the 'free' method of the regexp_engine associated to
14714    the regexp, allowing the handling of the void *pprivate; member
14715    first. (This routine is not overridable by extensions, which is why
14716    the extensions free is called first.)
14717
14718    See regdupe and regdupe_internal if you change anything here.
14719 */
14720 #ifndef PERL_IN_XSUB_RE
14721 void
14722 Perl_pregfree(pTHX_ REGEXP *r)
14723 {
14724     SvREFCNT_dec(r);
14725 }
14726
14727 void
14728 Perl_pregfree2(pTHX_ REGEXP *rx)
14729 {
14730     dVAR;
14731     struct regexp *const r = ReANY(rx);
14732     GET_RE_DEBUG_FLAGS_DECL;
14733
14734     PERL_ARGS_ASSERT_PREGFREE2;
14735
14736     if (r->mother_re) {
14737         ReREFCNT_dec(r->mother_re);
14738     } else {
14739         CALLREGFREE_PVT(rx); /* free the private data */
14740         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14741         Safefree(r->xpv_len_u.xpvlenu_pv);
14742     }
14743     if (r->substrs) {
14744         SvREFCNT_dec(r->anchored_substr);
14745         SvREFCNT_dec(r->anchored_utf8);
14746         SvREFCNT_dec(r->float_substr);
14747         SvREFCNT_dec(r->float_utf8);
14748         Safefree(r->substrs);
14749     }
14750     RX_MATCH_COPY_FREE(rx);
14751 #ifdef PERL_ANY_COW
14752     SvREFCNT_dec(r->saved_copy);
14753 #endif
14754     Safefree(r->offs);
14755     SvREFCNT_dec(r->qr_anoncv);
14756     rx->sv_u.svu_rx = 0;
14757 }
14758
14759 /*  reg_temp_copy()
14760
14761     This is a hacky workaround to the structural issue of match results
14762     being stored in the regexp structure which is in turn stored in
14763     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14764     could be PL_curpm in multiple contexts, and could require multiple
14765     result sets being associated with the pattern simultaneously, such
14766     as when doing a recursive match with (??{$qr})
14767
14768     The solution is to make a lightweight copy of the regexp structure
14769     when a qr// is returned from the code executed by (??{$qr}) this
14770     lightweight copy doesn't actually own any of its data except for
14771     the starp/end and the actual regexp structure itself.
14772
14773 */
14774
14775
14776 REGEXP *
14777 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14778 {
14779     struct regexp *ret;
14780     struct regexp *const r = ReANY(rx);
14781     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14782
14783     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14784
14785     if (!ret_x)
14786         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14787     else {
14788         SvOK_off((SV *)ret_x);
14789         if (islv) {
14790             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14791                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14792                made both spots point to the same regexp body.) */
14793             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14794             assert(!SvPVX(ret_x));
14795             ret_x->sv_u.svu_rx = temp->sv_any;
14796             temp->sv_any = NULL;
14797             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14798             SvREFCNT_dec_NN(temp);
14799             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14800                ing below will not set it. */
14801             SvCUR_set(ret_x, SvCUR(rx));
14802         }
14803     }
14804     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14805        sv_force_normal(sv) is called.  */
14806     SvFAKE_on(ret_x);
14807     ret = ReANY(ret_x);
14808
14809     SvFLAGS(ret_x) |= SvUTF8(rx);
14810     /* We share the same string buffer as the original regexp, on which we
14811        hold a reference count, incremented when mother_re is set below.
14812        The string pointer is copied here, being part of the regexp struct.
14813      */
14814     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14815            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14816     if (r->offs) {
14817         const I32 npar = r->nparens+1;
14818         Newx(ret->offs, npar, regexp_paren_pair);
14819         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14820     }
14821     if (r->substrs) {
14822         Newx(ret->substrs, 1, struct reg_substr_data);
14823         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14824
14825         SvREFCNT_inc_void(ret->anchored_substr);
14826         SvREFCNT_inc_void(ret->anchored_utf8);
14827         SvREFCNT_inc_void(ret->float_substr);
14828         SvREFCNT_inc_void(ret->float_utf8);
14829
14830         /* check_substr and check_utf8, if non-NULL, point to either their
14831            anchored or float namesakes, and don't hold a second reference.  */
14832     }
14833     RX_MATCH_COPIED_off(ret_x);
14834 #ifdef PERL_ANY_COW
14835     ret->saved_copy = NULL;
14836 #endif
14837     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14838     SvREFCNT_inc_void(ret->qr_anoncv);
14839
14840     return ret_x;
14841 }
14842 #endif
14843
14844 /* regfree_internal()
14845
14846    Free the private data in a regexp. This is overloadable by
14847    extensions. Perl takes care of the regexp structure in pregfree(),
14848    this covers the *pprivate pointer which technically perl doesn't
14849    know about, however of course we have to handle the
14850    regexp_internal structure when no extension is in use.
14851
14852    Note this is called before freeing anything in the regexp
14853    structure.
14854  */
14855
14856 void
14857 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14858 {
14859     dVAR;
14860     struct regexp *const r = ReANY(rx);
14861     RXi_GET_DECL(r,ri);
14862     GET_RE_DEBUG_FLAGS_DECL;
14863
14864     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14865
14866     DEBUG_COMPILE_r({
14867         if (!PL_colorset)
14868             reginitcolors();
14869         {
14870             SV *dsv= sv_newmortal();
14871             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14872                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14873             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14874                 PL_colors[4],PL_colors[5],s);
14875         }
14876     });
14877 #ifdef RE_TRACK_PATTERN_OFFSETS
14878     if (ri->u.offsets)
14879         Safefree(ri->u.offsets);             /* 20010421 MJD */
14880 #endif
14881     if (ri->code_blocks) {
14882         int n;
14883         for (n = 0; n < ri->num_code_blocks; n++)
14884             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14885         Safefree(ri->code_blocks);
14886     }
14887
14888     if (ri->data) {
14889         int n = ri->data->count;
14890
14891         while (--n >= 0) {
14892           /* If you add a ->what type here, update the comment in regcomp.h */
14893             switch (ri->data->what[n]) {
14894             case 'a':
14895             case 'r':
14896             case 's':
14897             case 'S':
14898             case 'u':
14899                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14900                 break;
14901             case 'f':
14902                 Safefree(ri->data->data[n]);
14903                 break;
14904             case 'l':
14905             case 'L':
14906                 break;
14907             case 'T':
14908                 { /* Aho Corasick add-on structure for a trie node.
14909                      Used in stclass optimization only */
14910                     U32 refcount;
14911                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14912                     OP_REFCNT_LOCK;
14913                     refcount = --aho->refcount;
14914                     OP_REFCNT_UNLOCK;
14915                     if ( !refcount ) {
14916                         PerlMemShared_free(aho->states);
14917                         PerlMemShared_free(aho->fail);
14918                          /* do this last!!!! */
14919                         PerlMemShared_free(ri->data->data[n]);
14920                         PerlMemShared_free(ri->regstclass);
14921                     }
14922                 }
14923                 break;
14924             case 't':
14925                 {
14926                     /* trie structure. */
14927                     U32 refcount;
14928                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14929                     OP_REFCNT_LOCK;
14930                     refcount = --trie->refcount;
14931                     OP_REFCNT_UNLOCK;
14932                     if ( !refcount ) {
14933                         PerlMemShared_free(trie->charmap);
14934                         PerlMemShared_free(trie->states);
14935                         PerlMemShared_free(trie->trans);
14936                         if (trie->bitmap)
14937                             PerlMemShared_free(trie->bitmap);
14938                         if (trie->jump)
14939                             PerlMemShared_free(trie->jump);
14940                         PerlMemShared_free(trie->wordinfo);
14941                         /* do this last!!!! */
14942                         PerlMemShared_free(ri->data->data[n]);
14943                     }
14944                 }
14945                 break;
14946             default:
14947                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14948             }
14949         }
14950         Safefree(ri->data->what);
14951         Safefree(ri->data);
14952     }
14953
14954     Safefree(ri);
14955 }
14956
14957 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14958 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14959 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14960
14961 /*
14962    re_dup - duplicate a regexp.
14963
14964    This routine is expected to clone a given regexp structure. It is only
14965    compiled under USE_ITHREADS.
14966
14967    After all of the core data stored in struct regexp is duplicated
14968    the regexp_engine.dupe method is used to copy any private data
14969    stored in the *pprivate pointer. This allows extensions to handle
14970    any duplication it needs to do.
14971
14972    See pregfree() and regfree_internal() if you change anything here.
14973 */
14974 #if defined(USE_ITHREADS)
14975 #ifndef PERL_IN_XSUB_RE
14976 void
14977 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14978 {
14979     dVAR;
14980     I32 npar;
14981     const struct regexp *r = ReANY(sstr);
14982     struct regexp *ret = ReANY(dstr);
14983
14984     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14985
14986     npar = r->nparens+1;
14987     Newx(ret->offs, npar, regexp_paren_pair);
14988     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14989
14990     if (ret->substrs) {
14991         /* Do it this way to avoid reading from *r after the StructCopy().
14992            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14993            cache, it doesn't matter.  */
14994         const bool anchored = r->check_substr
14995             ? r->check_substr == r->anchored_substr
14996             : r->check_utf8 == r->anchored_utf8;
14997         Newx(ret->substrs, 1, struct reg_substr_data);
14998         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14999
15000         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15001         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15002         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15003         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15004
15005         /* check_substr and check_utf8, if non-NULL, point to either their
15006            anchored or float namesakes, and don't hold a second reference.  */
15007
15008         if (ret->check_substr) {
15009             if (anchored) {
15010                 assert(r->check_utf8 == r->anchored_utf8);
15011                 ret->check_substr = ret->anchored_substr;
15012                 ret->check_utf8 = ret->anchored_utf8;
15013             } else {
15014                 assert(r->check_substr == r->float_substr);
15015                 assert(r->check_utf8 == r->float_utf8);
15016                 ret->check_substr = ret->float_substr;
15017                 ret->check_utf8 = ret->float_utf8;
15018             }
15019         } else if (ret->check_utf8) {
15020             if (anchored) {
15021                 ret->check_utf8 = ret->anchored_utf8;
15022             } else {
15023                 ret->check_utf8 = ret->float_utf8;
15024             }
15025         }
15026     }
15027
15028     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15029     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15030
15031     if (ret->pprivate)
15032         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15033
15034     if (RX_MATCH_COPIED(dstr))
15035         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15036     else
15037         ret->subbeg = NULL;
15038 #ifdef PERL_ANY_COW
15039     ret->saved_copy = NULL;
15040 #endif
15041
15042     /* Whether mother_re be set or no, we need to copy the string.  We
15043        cannot refrain from copying it when the storage points directly to
15044        our mother regexp, because that's
15045                1: a buffer in a different thread
15046                2: something we no longer hold a reference on
15047                so we need to copy it locally.  */
15048     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15049     ret->mother_re   = NULL;
15050     ret->gofs = 0;
15051 }
15052 #endif /* PERL_IN_XSUB_RE */
15053
15054 /*
15055    regdupe_internal()
15056
15057    This is the internal complement to regdupe() which is used to copy
15058    the structure pointed to by the *pprivate pointer in the regexp.
15059    This is the core version of the extension overridable cloning hook.
15060    The regexp structure being duplicated will be copied by perl prior
15061    to this and will be provided as the regexp *r argument, however
15062    with the /old/ structures pprivate pointer value. Thus this routine
15063    may override any copying normally done by perl.
15064
15065    It returns a pointer to the new regexp_internal structure.
15066 */
15067
15068 void *
15069 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15070 {
15071     dVAR;
15072     struct regexp *const r = ReANY(rx);
15073     regexp_internal *reti;
15074     int len;
15075     RXi_GET_DECL(r,ri);
15076
15077     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15078
15079     len = ProgLen(ri);
15080
15081     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15082     Copy(ri->program, reti->program, len+1, regnode);
15083
15084     reti->num_code_blocks = ri->num_code_blocks;
15085     if (ri->code_blocks) {
15086         int n;
15087         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15088                 struct reg_code_block);
15089         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15090                 struct reg_code_block);
15091         for (n = 0; n < ri->num_code_blocks; n++)
15092              reti->code_blocks[n].src_regex = (REGEXP*)
15093                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15094     }
15095     else
15096         reti->code_blocks = NULL;
15097
15098     reti->regstclass = NULL;
15099
15100     if (ri->data) {
15101         struct reg_data *d;
15102         const int count = ri->data->count;
15103         int i;
15104
15105         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15106                 char, struct reg_data);
15107         Newx(d->what, count, U8);
15108
15109         d->count = count;
15110         for (i = 0; i < count; i++) {
15111             d->what[i] = ri->data->what[i];
15112             switch (d->what[i]) {
15113                 /* see also regcomp.h and regfree_internal() */
15114             case 'a': /* actually an AV, but the dup function is identical.  */
15115             case 'r':
15116             case 's':
15117             case 'S':
15118             case 'u': /* actually an HV, but the dup function is identical.  */
15119                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15120                 break;
15121             case 'f':
15122                 /* This is cheating. */
15123                 Newx(d->data[i], 1, struct regnode_charclass_class);
15124                 StructCopy(ri->data->data[i], d->data[i],
15125                             struct regnode_charclass_class);
15126                 reti->regstclass = (regnode*)d->data[i];
15127                 break;
15128             case 'T':
15129                 /* Trie stclasses are readonly and can thus be shared
15130                  * without duplication. We free the stclass in pregfree
15131                  * when the corresponding reg_ac_data struct is freed.
15132                  */
15133                 reti->regstclass= ri->regstclass;
15134                 /* Fall through */
15135             case 't':
15136                 OP_REFCNT_LOCK;
15137                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15138                 OP_REFCNT_UNLOCK;
15139                 /* Fall through */
15140             case 'l':
15141             case 'L':
15142                 d->data[i] = ri->data->data[i];
15143                 break;
15144             default:
15145                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15146             }
15147         }
15148
15149         reti->data = d;
15150     }
15151     else
15152         reti->data = NULL;
15153
15154     reti->name_list_idx = ri->name_list_idx;
15155
15156 #ifdef RE_TRACK_PATTERN_OFFSETS
15157     if (ri->u.offsets) {
15158         Newx(reti->u.offsets, 2*len+1, U32);
15159         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15160     }
15161 #else
15162     SetProgLen(reti,len);
15163 #endif
15164
15165     return (void*)reti;
15166 }
15167
15168 #endif    /* USE_ITHREADS */
15169
15170 #ifndef PERL_IN_XSUB_RE
15171
15172 /*
15173  - regnext - dig the "next" pointer out of a node
15174  */
15175 regnode *
15176 Perl_regnext(pTHX_ regnode *p)
15177 {
15178     dVAR;
15179     I32 offset;
15180
15181     if (!p)
15182         return(NULL);
15183
15184     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15185         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15186     }
15187
15188     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15189     if (offset == 0)
15190         return(NULL);
15191
15192     return(p+offset);
15193 }
15194 #endif
15195
15196 STATIC void
15197 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15198 {
15199     va_list args;
15200     STRLEN l1 = strlen(pat1);
15201     STRLEN l2 = strlen(pat2);
15202     char buf[512];
15203     SV *msv;
15204     const char *message;
15205
15206     PERL_ARGS_ASSERT_RE_CROAK2;
15207
15208     if (l1 > 510)
15209         l1 = 510;
15210     if (l1 + l2 > 510)
15211         l2 = 510 - l1;
15212     Copy(pat1, buf, l1 , char);
15213     Copy(pat2, buf + l1, l2 , char);
15214     buf[l1 + l2] = '\n';
15215     buf[l1 + l2 + 1] = '\0';
15216 #ifdef I_STDARG
15217     /* ANSI variant takes additional second argument */
15218     va_start(args, pat2);
15219 #else
15220     va_start(args);
15221 #endif
15222     msv = vmess(buf, &args);
15223     va_end(args);
15224     message = SvPV_const(msv,l1);
15225     if (l1 > 512)
15226         l1 = 512;
15227     Copy(message, buf, l1 , char);
15228     buf[l1-1] = '\0';                   /* Overwrite \n */
15229     Perl_croak(aTHX_ "%s", buf);
15230 }
15231
15232 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15233
15234 #ifndef PERL_IN_XSUB_RE
15235 void
15236 Perl_save_re_context(pTHX)
15237 {
15238     dVAR;
15239
15240     struct re_save_state *state;
15241
15242     SAVEVPTR(PL_curcop);
15243     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15244
15245     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15246     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15247     SSPUSHUV(SAVEt_RE_STATE);
15248
15249     Copy(&PL_reg_state, state, 1, struct re_save_state);
15250
15251     PL_reg_oldsaved = NULL;
15252     PL_reg_oldsavedlen = 0;
15253     PL_reg_oldsavedoffset = 0;
15254     PL_reg_oldsavedcoffset = 0;
15255     PL_reg_maxiter = 0;
15256     PL_reg_leftiter = 0;
15257     PL_reg_poscache = NULL;
15258     PL_reg_poscache_size = 0;
15259 #ifdef PERL_ANY_COW
15260     PL_nrs = NULL;
15261 #endif
15262
15263     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15264     if (PL_curpm) {
15265         const REGEXP * const rx = PM_GETRE(PL_curpm);
15266         if (rx) {
15267             U32 i;
15268             for (i = 1; i <= RX_NPARENS(rx); i++) {
15269                 char digits[TYPE_CHARS(long)];
15270                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15271                 GV *const *const gvp
15272                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15273
15274                 if (gvp) {
15275                     GV * const gv = *gvp;
15276                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15277                         save_scalar(gv);
15278                 }
15279             }
15280         }
15281     }
15282 }
15283 #endif
15284
15285 #ifdef DEBUGGING
15286
15287 STATIC void
15288 S_put_byte(pTHX_ SV *sv, int c)
15289 {
15290     PERL_ARGS_ASSERT_PUT_BYTE;
15291
15292     /* Our definition of isPRINT() ignores locales, so only bytes that are
15293        not part of UTF-8 are considered printable. I assume that the same
15294        holds for UTF-EBCDIC.
15295        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15296        which Wikipedia says:
15297
15298        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15299        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15300        identical, to the ASCII delete (DEL) or rubout control character.
15301        ) So the old condition can be simplified to !isPRINT(c)  */
15302     if (!isPRINT(c)) {
15303         if (c < 256) {
15304             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15305         }
15306         else {
15307             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15308         }
15309     }
15310     else {
15311         const char string = c;
15312         if (c == '-' || c == ']' || c == '\\' || c == '^')
15313             sv_catpvs(sv, "\\");
15314         sv_catpvn(sv, &string, 1);
15315     }
15316 }
15317
15318
15319 #define CLEAR_OPTSTART \
15320     if (optstart) STMT_START { \
15321             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15322             optstart=NULL; \
15323     } STMT_END
15324
15325 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15326
15327 STATIC const regnode *
15328 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15329             const regnode *last, const regnode *plast,
15330             SV* sv, I32 indent, U32 depth)
15331 {
15332     dVAR;
15333     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15334     const regnode *next;
15335     const regnode *optstart= NULL;
15336
15337     RXi_GET_DECL(r,ri);
15338     GET_RE_DEBUG_FLAGS_DECL;
15339
15340     PERL_ARGS_ASSERT_DUMPUNTIL;
15341
15342 #ifdef DEBUG_DUMPUNTIL
15343     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15344         last ? last-start : 0,plast ? plast-start : 0);
15345 #endif
15346
15347     if (plast && plast < last)
15348         last= plast;
15349
15350     while (PL_regkind[op] != END && (!last || node < last)) {
15351         /* While that wasn't END last time... */
15352         NODE_ALIGN(node);
15353         op = OP(node);
15354         if (op == CLOSE || op == WHILEM)
15355             indent--;
15356         next = regnext((regnode *)node);
15357
15358         /* Where, what. */
15359         if (OP(node) == OPTIMIZED) {
15360             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15361                 optstart = node;
15362             else
15363                 goto after_print;
15364         } else
15365             CLEAR_OPTSTART;
15366
15367         regprop(r, sv, node);
15368         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15369                       (int)(2*indent + 1), "", SvPVX_const(sv));
15370
15371         if (OP(node) != OPTIMIZED) {
15372             if (next == NULL)           /* Next ptr. */
15373                 PerlIO_printf(Perl_debug_log, " (0)");
15374             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15375                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15376             else
15377                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15378             (void)PerlIO_putc(Perl_debug_log, '\n');
15379         }
15380
15381       after_print:
15382         if (PL_regkind[(U8)op] == BRANCHJ) {
15383             assert(next);
15384             {
15385                 const regnode *nnode = (OP(next) == LONGJMP
15386                                        ? regnext((regnode *)next)
15387                                        : next);
15388                 if (last && nnode > last)
15389                     nnode = last;
15390                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15391             }
15392         }
15393         else if (PL_regkind[(U8)op] == BRANCH) {
15394             assert(next);
15395             DUMPUNTIL(NEXTOPER(node), next);
15396         }
15397         else if ( PL_regkind[(U8)op]  == TRIE ) {
15398             const regnode *this_trie = node;
15399             const char op = OP(node);
15400             const U32 n = ARG(node);
15401             const reg_ac_data * const ac = op>=AHOCORASICK ?
15402                (reg_ac_data *)ri->data->data[n] :
15403                NULL;
15404             const reg_trie_data * const trie =
15405                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15406 #ifdef DEBUGGING
15407             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15408 #endif
15409             const regnode *nextbranch= NULL;
15410             I32 word_idx;
15411             sv_setpvs(sv, "");
15412             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15413                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15414
15415                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15416                    (int)(2*(indent+3)), "",
15417                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15418                             PL_colors[0], PL_colors[1],
15419                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15420                             PERL_PV_PRETTY_ELLIPSES    |
15421                             PERL_PV_PRETTY_LTGT
15422                             )
15423                             : "???"
15424                 );
15425                 if (trie->jump) {
15426                     U16 dist= trie->jump[word_idx+1];
15427                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15428                                   (UV)((dist ? this_trie + dist : next) - start));
15429                     if (dist) {
15430                         if (!nextbranch)
15431                             nextbranch= this_trie + trie->jump[0];
15432                         DUMPUNTIL(this_trie + dist, nextbranch);
15433                     }
15434                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15435                         nextbranch= regnext((regnode *)nextbranch);
15436                 } else {
15437                     PerlIO_printf(Perl_debug_log, "\n");
15438                 }
15439             }
15440             if (last && next > last)
15441                 node= last;
15442             else
15443                 node= next;
15444         }
15445         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15446             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15447                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15448         }
15449         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15450             assert(next);
15451             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15452         }
15453         else if ( op == PLUS || op == STAR) {
15454             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15455         }
15456         else if (PL_regkind[(U8)op] == ANYOF) {
15457             /* arglen 1 + class block */
15458             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15459                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15460             node = NEXTOPER(node);
15461         }
15462         else if (PL_regkind[(U8)op] == EXACT) {
15463             /* Literal string, where present. */
15464             node += NODE_SZ_STR(node) - 1;
15465             node = NEXTOPER(node);
15466         }
15467         else {
15468             node = NEXTOPER(node);
15469             node += regarglen[(U8)op];
15470         }
15471         if (op == CURLYX || op == OPEN)
15472             indent++;
15473     }
15474     CLEAR_OPTSTART;
15475 #ifdef DEBUG_DUMPUNTIL
15476     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15477 #endif
15478     return node;
15479 }
15480
15481 #endif  /* DEBUGGING */
15482
15483 /*
15484  * Local variables:
15485  * c-indentation-style: bsd
15486  * c-basic-offset: 4
15487  * indent-tabs-mode: nil
15488  * End:
15489  *
15490  * ex: set ts=8 sts=4 sw=4 et:
15491  */