regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #ifdef op
  94 #undef op
  95 #endif /* op */
  96
  97 #ifdef MSDOS
  98 #  if defined(BUGGY_MSC6)
  99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 100 #    pragma optimize("a",off)
 101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 102 #    pragma optimize("w",on )
 103 #  endif /* BUGGY_MSC6 */
 104 #endif /* MSDOS */
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 typedef struct RExC_state_t {
 111     U32         flags;                  /* are we folding, multilining? */
 112     char        *precomp;               /* uncompiled string. */
 113     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 114     regexp      *rx;                    /* perl core regexp structure */
 115     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 116     char        *start;                 /* Start of input for compile */
 117     char        *end;                   /* End of input for compile */
 118     char        *parse;                 /* Input-scan pointer. */
 119     I32         whilem_seen;            /* number of WHILEM in this expr */
 120     regnode     *emit_start;            /* Start of emitted-code area */
 121     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 122     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 123     I32         naughty;                /* How bad is this pattern? */
 124     I32         sawback;                /* Did we see \1, ...? */
 125     U32         seen;
 126     I32         size;                   /* Code size. */
 127     I32         npar;                   /* Capture buffer count, (OPEN). */
 128     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 129     I32         nestroot;               /* root parens we are in - used by accept */
 130     I32         extralen;
 131     I32         seen_zerolen;
 132     I32         seen_evals;
 133     regnode     **open_parens;          /* pointers to open parens */
 134     regnode     **close_parens;         /* pointers to close parens */
 135     regnode     *opend;                 /* END node in program */
 136     I32         utf8;           /* whether the pattern is utf8 or not */
 137     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 138                                 /* XXX use this for future optimisation of case
 139                                  * where pattern must be upgraded to utf8. */
 140     I32         uni_semantics;  /* If a d charset modifier should use unicode
 141                                    rules, even if the pattern is not in
 142                                    utf8 */
 143     HV          *paren_names;           /* Paren names */
 144
 145     regnode     **recurse;              /* Recurse regops */
 146     I32         recurse_count;          /* Number of recurse regops */
 147     I32         in_lookbehind;
 148     I32         contains_locale;
 149     I32         override_recoding;
 150 #if ADD_TO_REGEXEC
 151     char        *starttry;              /* -Dr: where regtry was called. */
 152 #define RExC_starttry   (pRExC_state->starttry)
 153 #endif
 154 #ifdef DEBUGGING
 155     const char  *lastparse;
 156     I32         lastnum;
 157     AV          *paren_name_list;       /* idx -> name */
 158 #define RExC_lastparse  (pRExC_state->lastparse)
 159 #define RExC_lastnum    (pRExC_state->lastnum)
 160 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 161 #endif
 162 } RExC_state_t;
 163
 164 #define RExC_flags      (pRExC_state->flags)
 165 #define RExC_precomp    (pRExC_state->precomp)
 166 #define RExC_rx_sv      (pRExC_state->rx_sv)
 167 #define RExC_rx         (pRExC_state->rx)
 168 #define RExC_rxi        (pRExC_state->rxi)
 169 #define RExC_start      (pRExC_state->start)
 170 #define RExC_end        (pRExC_state->end)
 171 #define RExC_parse      (pRExC_state->parse)
 172 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 173 #ifdef RE_TRACK_PATTERN_OFFSETS
 174 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 175 #endif
 176 #define RExC_emit       (pRExC_state->emit)
 177 #define RExC_emit_start (pRExC_state->emit_start)
 178 #define RExC_emit_bound (pRExC_state->emit_bound)
 179 #define RExC_naughty    (pRExC_state->naughty)
 180 #define RExC_sawback    (pRExC_state->sawback)
 181 #define RExC_seen       (pRExC_state->seen)
 182 #define RExC_size       (pRExC_state->size)
 183 #define RExC_npar       (pRExC_state->npar)
 184 #define RExC_nestroot   (pRExC_state->nestroot)
 185 #define RExC_extralen   (pRExC_state->extralen)
 186 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 187 #define RExC_seen_evals (pRExC_state->seen_evals)
 188 #define RExC_utf8       (pRExC_state->utf8)
 189 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 190 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 191 #define RExC_open_parens        (pRExC_state->open_parens)
 192 #define RExC_close_parens       (pRExC_state->close_parens)
 193 #define RExC_opend      (pRExC_state->opend)
 194 #define RExC_paren_names        (pRExC_state->paren_names)
 195 #define RExC_recurse    (pRExC_state->recurse)
 196 #define RExC_recurse_count      (pRExC_state->recurse_count)
 197 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 198 #define RExC_contains_locale    (pRExC_state->contains_locale)
 199 #define RExC_override_recoding  (pRExC_state->override_recoding)
 200
 201
 202 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 203 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 204         ((*s) == '{' && regcurly(s)))
 205
 206 #ifdef SPSTART
 207 #undef SPSTART          /* dratted cpp namespace... */
 208 #endif
 209 /*
 210  * Flags to be passed up and down.
 211  */
 212 #define WORST           0       /* Worst case. */
 213 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 214
 215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 216  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 217 #define SIMPLE          0x02
 218 #define SPSTART         0x04    /* Starts with * or +. */
 219 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 220 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 221
 222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 223
 224 /* whether trie related optimizations are enabled */
 225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 226 #define TRIE_STUDY_OPT
 227 #define FULL_TRIE_STUDY
 228 #define TRIE_STCLASS
 229 #endif
 230
 231
 232
 233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 234 #define PBITVAL(paren) (1 << ((paren) & 7))
 235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 238
 239 /* If not already in utf8, do a longjmp back to the beginning */
 240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 241 #define REQUIRE_UTF8    STMT_START {                                       \
 242                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 243                         } STMT_END
 244
 245 /* About scan_data_t.
 246
 247   During optimisation we recurse through the regexp program performing
 248   various inplace (keyhole style) optimisations. In addition study_chunk
 249   and scan_commit populate this data structure with information about
 250   what strings MUST appear in the pattern. We look for the longest
 251   string that must appear at a fixed location, and we look for the
 252   longest string that may appear at a floating location. So for instance
 253   in the pattern:
 254
 255     /FOO[xX]A.*B[xX]BAR/
 256
 257   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 258   strings (because they follow a .* construct). study_chunk will identify
 259   both FOO and BAR as being the longest fixed and floating strings respectively.
 260
 261   The strings can be composites, for instance
 262
 263      /(f)(o)(o)/
 264
 265   will result in a composite fixed substring 'foo'.
 266
 267   For each string some basic information is maintained:
 268
 269   - offset or min_offset
 270     This is the position the string must appear at, or not before.
 271     It also implicitly (when combined with minlenp) tells us how many
 272     characters must match before the string we are searching for.
 273     Likewise when combined with minlenp and the length of the string it
 274     tells us how many characters must appear after the string we have
 275     found.
 276
 277   - max_offset
 278     Only used for floating strings. This is the rightmost point that
 279     the string can appear at. If set to I32 max it indicates that the
 280     string can occur infinitely far to the right.
 281
 282   - minlenp
 283     A pointer to the minimum length of the pattern that the string
 284     was found inside. This is important as in the case of positive
 285     lookahead or positive lookbehind we can have multiple patterns
 286     involved. Consider
 287
 288     /(?=FOO).*F/
 289
 290     The minimum length of the pattern overall is 3, the minimum length
 291     of the lookahead part is 3, but the minimum length of the part that
 292     will actually match is 1. So 'FOO's minimum length is 3, but the
 293     minimum length for the F is 1. This is important as the minimum length
 294     is used to determine offsets in front of and behind the string being
 295     looked for.  Since strings can be composites this is the length of the
 296     pattern at the time it was committed with a scan_commit. Note that
 297     the length is calculated by study_chunk, so that the minimum lengths
 298     are not known until the full pattern has been compiled, thus the
 299     pointer to the value.
 300
 301   - lookbehind
 302
 303     In the case of lookbehind the string being searched for can be
 304     offset past the start point of the final matching string.
 305     If this value was just blithely removed from the min_offset it would
 306     invalidate some of the calculations for how many chars must match
 307     before or after (as they are derived from min_offset and minlen and
 308     the length of the string being searched for).
 309     When the final pattern is compiled and the data is moved from the
 310     scan_data_t structure into the regexp structure the information
 311     about lookbehind is factored in, with the information that would
 312     have been lost precalculated in the end_shift field for the
 313     associated string.
 314
 315   The fields pos_min and pos_delta are used to store the minimum offset
 316   and the delta to the maximum offset at the current point in the pattern.
 317
 318 */
 319
 320 typedef struct scan_data_t {
 321     /*I32 len_min;      unused */
 322     /*I32 len_delta;    unused */
 323     I32 pos_min;
 324     I32 pos_delta;
 325     SV *last_found;
 326     I32 last_end;           /* min value, <0 unless valid. */
 327     I32 last_start_min;
 328     I32 last_start_max;
 329     SV **longest;           /* Either &l_fixed, or &l_float. */
 330     SV *longest_fixed;      /* longest fixed string found in pattern */
 331     I32 offset_fixed;       /* offset where it starts */
 332     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 334     SV *longest_float;      /* longest floating string found in pattern */
 335     I32 offset_float_min;   /* earliest point in string it can appear */
 336     I32 offset_float_max;   /* latest point in string it can appear */
 337     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 338     I32 lookbehind_float;   /* is the position of the string modified by LB */
 339     I32 flags;
 340     I32 whilem_c;
 341     I32 *last_closep;
 342     struct regnode_charclass_class *start_class;
 343 } scan_data_t;
 344
 345 /*
 346  * Forward declarations for pregcomp()'s friends.
 347  */
 348
 349 static const scan_data_t zero_scan_data =
 350   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 351
 352 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 353 #define SF_BEFORE_SEOL          0x0001
 354 #define SF_BEFORE_MEOL          0x0002
 355 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 356 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 357
 358 #ifdef NO_UNARY_PLUS
 359 #  define SF_FIX_SHIFT_EOL      (0+2)
 360 #  define SF_FL_SHIFT_EOL               (0+4)
 361 #else
 362 #  define SF_FIX_SHIFT_EOL      (+2)
 363 #  define SF_FL_SHIFT_EOL               (+4)
 364 #endif
 365
 366 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 367 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 368
 369 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 370 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 371 #define SF_IS_INF               0x0040
 372 #define SF_HAS_PAR              0x0080
 373 #define SF_IN_PAR               0x0100
 374 #define SF_HAS_EVAL             0x0200
 375 #define SCF_DO_SUBSTR           0x0400
 376 #define SCF_DO_STCLASS_AND      0x0800
 377 #define SCF_DO_STCLASS_OR       0x1000
 378 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 379 #define SCF_WHILEM_VISITED_POS  0x2000
 380
 381 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 382 #define SCF_SEEN_ACCEPT         0x8000
 383
 384 #define UTF cBOOL(RExC_utf8)
 385
 386 /* The enums for all these are ordered so things work out correctly */
 387 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 388 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 389 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 390 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 391 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 392 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 393 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 394
 395 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 396
 397 #define OOB_UNICODE             12345678
 398 #define OOB_NAMEDCLASS          -1
 399
 400 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 401 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 402
 403
 404 /* length of regex to show in messages that don't mark a position within */
 405 #define RegexLengthToShowInErrorMessages 127
 406
 407 /*
 408  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 409  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 410  * op/pragma/warn/regcomp.
 411  */
 412 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 413 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 414
 415 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 416
 417 /*
 418  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 419  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 420  * "...".
 421  */
 422 #define _FAIL(code) STMT_START {                                        \
 423     const char *ellipses = "";                                          \
 424     IV len = RExC_end - RExC_precomp;                                   \
 425                                                                         \
 426     if (!SIZE_ONLY)                                                     \
 427         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 428     if (len > RegexLengthToShowInErrorMessages) {                       \
 429         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 430         len = RegexLengthToShowInErrorMessages - 10;                    \
 431         ellipses = "...";                                               \
 432     }                                                                   \
 433     code;                                                               \
 434 } STMT_END
 435
 436 #define FAIL(msg) _FAIL(                            \
 437     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 438             msg, (int)len, RExC_precomp, ellipses))
 439
 440 #define FAIL2(msg,arg) _FAIL(                       \
 441     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 442             arg, (int)len, RExC_precomp, ellipses))
 443
 444 /*
 445  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 446  */
 447 #define Simple_vFAIL(m) STMT_START {                                    \
 448     const IV offset = RExC_parse - RExC_precomp;                        \
 449     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 450             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 451 } STMT_END
 452
 453 /*
 454  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 455  */
 456 #define vFAIL(m) STMT_START {                           \
 457     if (!SIZE_ONLY)                                     \
 458         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 459     Simple_vFAIL(m);                                    \
 460 } STMT_END
 461
 462 /*
 463  * Like Simple_vFAIL(), but accepts two arguments.
 464  */
 465 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 466     const IV offset = RExC_parse - RExC_precomp;                        \
 467     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 468             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 469 } STMT_END
 470
 471 /*
 472  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 473  */
 474 #define vFAIL2(m,a1) STMT_START {                       \
 475     if (!SIZE_ONLY)                                     \
 476         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 477     Simple_vFAIL2(m, a1);                               \
 478 } STMT_END
 479
 480
 481 /*
 482  * Like Simple_vFAIL(), but accepts three arguments.
 483  */
 484 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 485     const IV offset = RExC_parse - RExC_precomp;                \
 486     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 487             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 488 } STMT_END
 489
 490 /*
 491  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 492  */
 493 #define vFAIL3(m,a1,a2) STMT_START {                    \
 494     if (!SIZE_ONLY)                                     \
 495         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 496     Simple_vFAIL3(m, a1, a2);                           \
 497 } STMT_END
 498
 499 /*
 500  * Like Simple_vFAIL(), but accepts four arguments.
 501  */
 502 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 503     const IV offset = RExC_parse - RExC_precomp;                \
 504     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 505             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 506 } STMT_END
 507
 508 #define ckWARNreg(loc,m) STMT_START {                                   \
 509     const IV offset = loc - RExC_precomp;                               \
 510     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 512 } STMT_END
 513
 514 #define ckWARNregdep(loc,m) STMT_START {                                \
 515     const IV offset = loc - RExC_precomp;                               \
 516     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 517             m REPORT_LOCATION,                                          \
 518             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 519 } STMT_END
 520
 521 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 524             m REPORT_LOCATION,                                          \
 525             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 526 } STMT_END
 527
 528 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 529     const IV offset = loc - RExC_precomp;                               \
 530     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 531             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 532 } STMT_END
 533
 534 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 535     const IV offset = loc - RExC_precomp;                               \
 536     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 537             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 538 } STMT_END
 539
 540 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 541     const IV offset = loc - RExC_precomp;                               \
 542     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 543             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 544 } STMT_END
 545
 546 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 547     const IV offset = loc - RExC_precomp;                               \
 548     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 549             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 550 } STMT_END
 551
 552 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 553     const IV offset = loc - RExC_precomp;                               \
 554     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 555             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 556 } STMT_END
 557
 558 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 559     const IV offset = loc - RExC_precomp;                               \
 560     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 561             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 562 } STMT_END
 563
 564
 565 /* Allow for side effects in s */
 566 #define REGC(c,s) STMT_START {                  \
 567     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 568 } STMT_END
 569
 570 /* Macros for recording node offsets.   20001227 mjd@plover.com
 571  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 572  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 573  * Element 0 holds the number n.
 574  * Position is 1 indexed.
 575  */
 576 #ifndef RE_TRACK_PATTERN_OFFSETS
 577 #define Set_Node_Offset_To_R(node,byte)
 578 #define Set_Node_Offset(node,byte)
 579 #define Set_Cur_Node_Offset
 580 #define Set_Node_Length_To_R(node,len)
 581 #define Set_Node_Length(node,len)
 582 #define Set_Node_Cur_Length(node)
 583 #define Node_Offset(n)
 584 #define Node_Length(n)
 585 #define Set_Node_Offset_Length(node,offset,len)
 586 #define ProgLen(ri) ri->u.proglen
 587 #define SetProgLen(ri,x) ri->u.proglen = x
 588 #else
 589 #define ProgLen(ri) ri->u.offsets[0]
 590 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 591 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 592     if (! SIZE_ONLY) {                                                  \
 593         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 594                     __LINE__, (int)(node), (int)(byte)));               \
 595         if((node) < 0) {                                                \
 596             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 597         } else {                                                        \
 598             RExC_offsets[2*(node)-1] = (byte);                          \
 599         }                                                               \
 600     }                                                                   \
 601 } STMT_END
 602
 603 #define Set_Node_Offset(node,byte) \
 604     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 605 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 606
 607 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 608     if (! SIZE_ONLY) {                                                  \
 609         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 610                 __LINE__, (int)(node), (int)(len)));                    \
 611         if((node) < 0) {                                                \
 612             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 613         } else {                                                        \
 614             RExC_offsets[2*(node)] = (len);                             \
 615         }                                                               \
 616     }                                                                   \
 617 } STMT_END
 618
 619 #define Set_Node_Length(node,len) \
 620     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 621 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 622 #define Set_Node_Cur_Length(node) \
 623     Set_Node_Length(node, RExC_parse - parse_start)
 624
 625 /* Get offsets and lengths */
 626 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 627 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 628
 629 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 630     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 631     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 632 } STMT_END
 633 #endif
 634
 635 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 636 #define EXPERIMENTAL_INPLACESCAN
 637 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 638
 639 #define DEBUG_STUDYDATA(str,data,depth)                              \
 640 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 641     PerlIO_printf(Perl_debug_log,                                    \
 642         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 643         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 644         (int)(depth)*2, "",                                          \
 645         (IV)((data)->pos_min),                                       \
 646         (IV)((data)->pos_delta),                                     \
 647         (UV)((data)->flags),                                         \
 648         (IV)((data)->whilem_c),                                      \
 649         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 650         is_inf ? "INF " : ""                                         \
 651     );                                                               \
 652     if ((data)->last_found)                                          \
 653         PerlIO_printf(Perl_debug_log,                                \
 654             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 655             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 656             SvPVX_const((data)->last_found),                         \
 657             (IV)((data)->last_end),                                  \
 658             (IV)((data)->last_start_min),                            \
 659             (IV)((data)->last_start_max),                            \
 660             ((data)->longest &&                                      \
 661              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 662             SvPVX_const((data)->longest_fixed),                      \
 663             (IV)((data)->offset_fixed),                              \
 664             ((data)->longest &&                                      \
 665              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 666             SvPVX_const((data)->longest_float),                      \
 667             (IV)((data)->offset_float_min),                          \
 668             (IV)((data)->offset_float_max)                           \
 669         );                                                           \
 670     PerlIO_printf(Perl_debug_log,"\n");                              \
 671 });
 672
 673 static void clear_re(pTHX_ void *r);
 674
 675 /* Mark that we cannot extend a found fixed substring at this point.
 676    Update the longest found anchored substring and the longest found
 677    floating substrings if needed. */
 678
 679 STATIC void
 680 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 681 {
 682     const STRLEN l = CHR_SVLEN(data->last_found);
 683     const STRLEN old_l = CHR_SVLEN(*data->longest);
 684     GET_RE_DEBUG_FLAGS_DECL;
 685
 686     PERL_ARGS_ASSERT_SCAN_COMMIT;
 687
 688     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 689         SvSetMagicSV(*data->longest, data->last_found);
 690         if (*data->longest == data->longest_fixed) {
 691             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 692             if (data->flags & SF_BEFORE_EOL)
 693                 data->flags
 694                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 695             else
 696                 data->flags &= ~SF_FIX_BEFORE_EOL;
 697             data->minlen_fixed=minlenp;
 698             data->lookbehind_fixed=0;
 699         }
 700         else { /* *data->longest == data->longest_float */
 701             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 702             data->offset_float_max = (l
 703                                       ? data->last_start_max
 704                                       : data->pos_min + data->pos_delta);
 705             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 706                 data->offset_float_max = I32_MAX;
 707             if (data->flags & SF_BEFORE_EOL)
 708                 data->flags
 709                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 710             else
 711                 data->flags &= ~SF_FL_BEFORE_EOL;
 712             data->minlen_float=minlenp;
 713             data->lookbehind_float=0;
 714         }
 715     }
 716     SvCUR_set(data->last_found, 0);
 717     {
 718         SV * const sv = data->last_found;
 719         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 720             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 721             if (mg)
 722                 mg->mg_len = 0;
 723         }
 724     }
 725     data->last_end = -1;
 726     data->flags &= ~SF_BEFORE_EOL;
 727     DEBUG_STUDYDATA("commit: ",data,0);
 728 }
 729
 730 /* Can match anything (initialization) */
 731 STATIC void
 732 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 733 {
 734     PERL_ARGS_ASSERT_CL_ANYTHING;
 735
 736     ANYOF_BITMAP_SETALL(cl);
 737     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 738                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 739
 740     /* If any portion of the regex is to operate under locale rules,
 741      * initialization includes it.  The reason this isn't done for all regexes
 742      * is that the optimizer was written under the assumption that locale was
 743      * all-or-nothing.  Given the complexity and lack of documentation in the
 744      * optimizer, and that there are inadequate test cases for locale, so many
 745      * parts of it may not work properly, it is safest to avoid locale unless
 746      * necessary. */
 747     if (RExC_contains_locale) {
 748         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 749         cl->flags |= ANYOF_LOCALE;
 750     }
 751     else {
 752         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 753     }
 754 }
 755
 756 /* Can match anything (initialization) */
 757 STATIC int
 758 S_cl_is_anything(const struct regnode_charclass_class *cl)
 759 {
 760     int value;
 761
 762     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 763
 764     for (value = 0; value <= ANYOF_MAX; value += 2)
 765         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 766             return 1;
 767     if (!(cl->flags & ANYOF_UNICODE_ALL))
 768         return 0;
 769     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 770         return 0;
 771     return 1;
 772 }
 773
 774 /* Can match anything (initialization) */
 775 STATIC void
 776 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 777 {
 778     PERL_ARGS_ASSERT_CL_INIT;
 779
 780     Zero(cl, 1, struct regnode_charclass_class);
 781     cl->type = ANYOF;
 782     cl_anything(pRExC_state, cl);
 783     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 784 }
 785
 786 /* These two functions currently do the exact same thing */
 787 #define cl_init_zero            S_cl_init
 788
 789 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 790  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 791  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 792 STATIC void
 793 S_cl_and(struct regnode_charclass_class *cl,
 794         const struct regnode_charclass_class *and_with)
 795 {
 796     PERL_ARGS_ASSERT_CL_AND;
 797
 798     assert(and_with->type == ANYOF);
 799
 800     /* I (khw) am not sure all these restrictions are necessary XXX */
 801     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 802         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 803         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 804         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 805         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 806         int i;
 807
 808         if (and_with->flags & ANYOF_INVERT)
 809             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 810                 cl->bitmap[i] &= ~and_with->bitmap[i];
 811         else
 812             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 813                 cl->bitmap[i] &= and_with->bitmap[i];
 814     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 815
 816     if (and_with->flags & ANYOF_INVERT) {
 817
 818         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 819          * aren't affected by the inversion.  Those that are affected are
 820          * handled individually below */
 821         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 822         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 823         cl->flags |= affected_flags;
 824
 825         /* We currently don't know how to deal with things that aren't in the
 826          * bitmap, but we know that the intersection is no greater than what
 827          * is already in cl, so let there be false positives that get sorted
 828          * out after the synthetic start class succeeds, and the node is
 829          * matched for real. */
 830
 831         /* The inversion of these two flags indicate that the resulting
 832          * intersection doesn't have them */
 833         if (and_with->flags & ANYOF_UNICODE_ALL) {
 834             cl->flags &= ~ANYOF_UNICODE_ALL;
 835         }
 836         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 837             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 838         }
 839     }
 840     else {   /* and'd node is not inverted */
 841         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 842
 843         if (! ANYOF_NONBITMAP(and_with)) {
 844
 845             /* Here 'and_with' doesn't match anything outside the bitmap
 846              * (except possibly ANYOF_UNICODE_ALL), which means the
 847              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 848              * which case we don't know what the intersection is, but it's no
 849              * greater than what cl already has, so can just leave it alone,
 850              * with possible false positives */
 851             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 852                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 853                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 854             }
 855         }
 856         else if (! ANYOF_NONBITMAP(cl)) {
 857
 858             /* Here, 'and_with' does match something outside the bitmap, and cl
 859              * doesn't have a list of things to match outside the bitmap.  If
 860              * cl can match all code points above 255, the intersection will
 861              * be those above-255 code points that 'and_with' matches.  If cl
 862              * can't match all Unicode code points, it means that it can't
 863              * match anything outside the bitmap (since the 'if' that got us
 864              * into this block tested for that), so we leave the bitmap empty.
 865              */
 866             if (cl->flags & ANYOF_UNICODE_ALL) {
 867                 ARG_SET(cl, ARG(and_with));
 868
 869                 /* and_with's ARG may match things that don't require UTF8.
 870                  * And now cl's will too, in spite of this being an 'and'.  See
 871                  * the comments below about the kludge */
 872                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 873             }
 874         }
 875         else {
 876             /* Here, both 'and_with' and cl match something outside the
 877              * bitmap.  Currently we do not do the intersection, so just match
 878              * whatever cl had at the beginning.  */
 879         }
 880
 881
 882         /* Take the intersection of the two sets of flags.  However, the
 883          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 884          * kludge around the fact that this flag is not treated like the others
 885          * which are initialized in cl_anything().  The way the optimizer works
 886          * is that the synthetic start class (SSC) is initialized to match
 887          * anything, and then the first time a real node is encountered, its
 888          * values are AND'd with the SSC's with the result being the values of
 889          * the real node.  However, there are paths through the optimizer where
 890          * the AND never gets called, so those initialized bits are set
 891          * inappropriately, which is not usually a big deal, as they just cause
 892          * false positives in the SSC, which will just mean a probably
 893          * imperceptible slow down in execution.  However this bit has a
 894          * higher false positive consequence in that it can cause utf8.pm,
 895          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 896          * bigger slowdown and also causes significant extra memory to be used.
 897          * In order to prevent this, the code now takes a different tack.  The
 898          * bit isn't set unless some part of the regular expression needs it,
 899          * but once set it won't get cleared.  This means that these extra
 900          * modules won't get loaded unless there was some path through the
 901          * pattern that would have required them anyway, and  so any false
 902          * positives that occur by not ANDing them out when they could be
 903          * aren't as severe as they would be if we treated this bit like all
 904          * the others */
 905         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 906                                       & ANYOF_NONBITMAP_NON_UTF8;
 907         cl->flags &= and_with->flags;
 908         cl->flags |= outside_bitmap_but_not_utf8;
 909     }
 910 }
 911
 912 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 913  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 914  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 915 STATIC void
 916 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 917 {
 918     PERL_ARGS_ASSERT_CL_OR;
 919
 920     if (or_with->flags & ANYOF_INVERT) {
 921
 922         /* Here, the or'd node is to be inverted.  This means we take the
 923          * complement of everything not in the bitmap, but currently we don't
 924          * know what that is, so give up and match anything */
 925         if (ANYOF_NONBITMAP(or_with)) {
 926             cl_anything(pRExC_state, cl);
 927         }
 928         /* We do not use
 929          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 930          *   <= (B1 | !B2) | (CL1 | !CL2)
 931          * which is wasteful if CL2 is small, but we ignore CL2:
 932          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 933          * XXXX Can we handle case-fold?  Unclear:
 934          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 935          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 936          */
 937         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 938              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 939              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 940             int i;
 941
 942             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 943                 cl->bitmap[i] |= ~or_with->bitmap[i];
 944         } /* XXXX: logic is complicated otherwise */
 945         else {
 946             cl_anything(pRExC_state, cl);
 947         }
 948
 949         /* And, we can just take the union of the flags that aren't affected
 950          * by the inversion */
 951         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 952
 953         /* For the remaining flags:
 954             ANYOF_UNICODE_ALL and inverted means to not match anything above
 955                     255, which means that the union with cl should just be
 956                     what cl has in it, so can ignore this flag
 957             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 958                     is 127-255 to match them, but then invert that, so the
 959                     union with cl should just be what cl has in it, so can
 960                     ignore this flag
 961          */
 962     } else {    /* 'or_with' is not inverted */
 963         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 964         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 965              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 966                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 967             int i;
 968
 969             /* OR char bitmap and class bitmap separately */
 970             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 971                 cl->bitmap[i] |= or_with->bitmap[i];
 972             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 973                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 974                     cl->classflags[i] |= or_with->classflags[i];
 975                 cl->flags |= ANYOF_CLASS;
 976             }
 977         }
 978         else { /* XXXX: logic is complicated, leave it along for a moment. */
 979             cl_anything(pRExC_state, cl);
 980         }
 981
 982         if (ANYOF_NONBITMAP(or_with)) {
 983
 984             /* Use the added node's outside-the-bit-map match if there isn't a
 985              * conflict.  If there is a conflict (both nodes match something
 986              * outside the bitmap, but what they match outside is not the same
 987              * pointer, and hence not easily compared until XXX we extend
 988              * inversion lists this far), give up and allow the start class to
 989              * match everything outside the bitmap.  If that stuff is all above
 990              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 991             if (! ANYOF_NONBITMAP(cl)) {
 992                 ARG_SET(cl, ARG(or_with));
 993             }
 994             else if (ARG(cl) != ARG(or_with)) {
 995
 996                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 997                     cl_anything(pRExC_state, cl);
 998                 }
 999                 else {
1000                     cl->flags |= ANYOF_UNICODE_ALL;
1001                 }
1002             }
1003         }
1004
1005         /* Take the union */
1006         cl->flags |= or_with->flags;
1007     }
1008 }
1009
1010 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1011 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1012 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1013 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1014
1015
1016 #ifdef DEBUGGING
1017 /*
1018    dump_trie(trie,widecharmap,revcharmap)
1019    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1020    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1021
1022    These routines dump out a trie in a somewhat readable format.
1023    The _interim_ variants are used for debugging the interim
1024    tables that are used to generate the final compressed
1025    representation which is what dump_trie expects.
1026
1027    Part of the reason for their existence is to provide a form
1028    of documentation as to how the different representations function.
1029
1030 */
1031
1032 /*
1033   Dumps the final compressed table form of the trie to Perl_debug_log.
1034   Used for debugging make_trie().
1035 */
1036
1037 STATIC void
1038 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1039             AV *revcharmap, U32 depth)
1040 {
1041     U32 state;
1042     SV *sv=sv_newmortal();
1043     int colwidth= widecharmap ? 6 : 4;
1044     U16 word;
1045     GET_RE_DEBUG_FLAGS_DECL;
1046
1047     PERL_ARGS_ASSERT_DUMP_TRIE;
1048
1049     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1050         (int)depth * 2 + 2,"",
1051         "Match","Base","Ofs" );
1052
1053     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1054         SV ** const tmp = av_fetch( revcharmap, state, 0);
1055         if ( tmp ) {
1056             PerlIO_printf( Perl_debug_log, "%*s",
1057                 colwidth,
1058                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1059                             PL_colors[0], PL_colors[1],
1060                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061                             PERL_PV_ESCAPE_FIRSTCHAR
1062                 )
1063             );
1064         }
1065     }
1066     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1067         (int)depth * 2 + 2,"");
1068
1069     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1070         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1071     PerlIO_printf( Perl_debug_log, "\n");
1072
1073     for( state = 1 ; state < trie->statecount ; state++ ) {
1074         const U32 base = trie->states[ state ].trans.base;
1075
1076         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1077
1078         if ( trie->states[ state ].wordnum ) {
1079             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1080         } else {
1081             PerlIO_printf( Perl_debug_log, "%6s", "" );
1082         }
1083
1084         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1085
1086         if ( base ) {
1087             U32 ofs = 0;
1088
1089             while( ( base + ofs  < trie->uniquecharcount ) ||
1090                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1091                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1092                     ofs++;
1093
1094             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1095
1096             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1097                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1098                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1099                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1100                 {
1101                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1102                     colwidth,
1103                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1104                 } else {
1105                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1106                 }
1107             }
1108
1109             PerlIO_printf( Perl_debug_log, "]");
1110
1111         }
1112         PerlIO_printf( Perl_debug_log, "\n" );
1113     }
1114     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1115     for (word=1; word <= trie->wordcount; word++) {
1116         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1117             (int)word, (int)(trie->wordinfo[word].prev),
1118             (int)(trie->wordinfo[word].len));
1119     }
1120     PerlIO_printf(Perl_debug_log, "\n" );
1121 }
1122 /*
1123   Dumps a fully constructed but uncompressed trie in list form.
1124   List tries normally only are used for construction when the number of
1125   possible chars (trie->uniquecharcount) is very high.
1126   Used for debugging make_trie().
1127 */
1128 STATIC void
1129 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1130                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1131                          U32 depth)
1132 {
1133     U32 state;
1134     SV *sv=sv_newmortal();
1135     int colwidth= widecharmap ? 6 : 4;
1136     GET_RE_DEBUG_FLAGS_DECL;
1137
1138     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1139
1140     /* print out the table precompression.  */
1141     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1142         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1143         "------:-----+-----------------\n" );
1144
1145     for( state=1 ; state < next_alloc ; state ++ ) {
1146         U16 charid;
1147
1148         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1149             (int)depth * 2 + 2,"", (UV)state  );
1150         if ( ! trie->states[ state ].wordnum ) {
1151             PerlIO_printf( Perl_debug_log, "%5s| ","");
1152         } else {
1153             PerlIO_printf( Perl_debug_log, "W%4x| ",
1154                 trie->states[ state ].wordnum
1155             );
1156         }
1157         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1158             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1159             if ( tmp ) {
1160                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1161                     colwidth,
1162                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1163                             PL_colors[0], PL_colors[1],
1164                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1165                             PERL_PV_ESCAPE_FIRSTCHAR
1166                     ) ,
1167                     TRIE_LIST_ITEM(state,charid).forid,
1168                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1169                 );
1170                 if (!(charid % 10))
1171                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1172                         (int)((depth * 2) + 14), "");
1173             }
1174         }
1175         PerlIO_printf( Perl_debug_log, "\n");
1176     }
1177 }
1178
1179 /*
1180   Dumps a fully constructed but uncompressed trie in table form.
1181   This is the normal DFA style state transition table, with a few
1182   twists to facilitate compression later.
1183   Used for debugging make_trie().
1184 */
1185 STATIC void
1186 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1187                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1188                           U32 depth)
1189 {
1190     U32 state;
1191     U16 charid;
1192     SV *sv=sv_newmortal();
1193     int colwidth= widecharmap ? 6 : 4;
1194     GET_RE_DEBUG_FLAGS_DECL;
1195
1196     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1197
1198     /*
1199        print out the table precompression so that we can do a visual check
1200        that they are identical.
1201      */
1202
1203     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1204
1205     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1206         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1207         if ( tmp ) {
1208             PerlIO_printf( Perl_debug_log, "%*s",
1209                 colwidth,
1210                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1211                             PL_colors[0], PL_colors[1],
1212                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1213                             PERL_PV_ESCAPE_FIRSTCHAR
1214                 )
1215             );
1216         }
1217     }
1218
1219     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1220
1221     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1222         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1223     }
1224
1225     PerlIO_printf( Perl_debug_log, "\n" );
1226
1227     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1228
1229         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1230             (int)depth * 2 + 2,"",
1231             (UV)TRIE_NODENUM( state ) );
1232
1233         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1234             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1235             if (v)
1236                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1237             else
1238                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1239         }
1240         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1241             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1242         } else {
1243             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1244             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1245         }
1246     }
1247 }
1248
1249 #endif
1250
1251
1252 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1253   startbranch: the first branch in the whole branch sequence
1254   first      : start branch of sequence of branch-exact nodes.
1255                May be the same as startbranch
1256   last       : Thing following the last branch.
1257                May be the same as tail.
1258   tail       : item following the branch sequence
1259   count      : words in the sequence
1260   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1261   depth      : indent depth
1262
1263 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1264
1265 A trie is an N'ary tree where the branches are determined by digital
1266 decomposition of the key. IE, at the root node you look up the 1st character and
1267 follow that branch repeat until you find the end of the branches. Nodes can be
1268 marked as "accepting" meaning they represent a complete word. Eg:
1269
1270   /he|she|his|hers/
1271
1272 would convert into the following structure. Numbers represent states, letters
1273 following numbers represent valid transitions on the letter from that state, if
1274 the number is in square brackets it represents an accepting state, otherwise it
1275 will be in parenthesis.
1276
1277       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1278       |    |
1279       |   (2)
1280       |    |
1281      (1)   +-i->(6)-+-s->[7]
1282       |
1283       +-s->(3)-+-h->(4)-+-e->[5]
1284
1285       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1286
1287 This shows that when matching against the string 'hers' we will begin at state 1
1288 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1289 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1290 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1291 single traverse. We store a mapping from accepting to state to which word was
1292 matched, and then when we have multiple possibilities we try to complete the
1293 rest of the regex in the order in which they occured in the alternation.
1294
1295 The only prior NFA like behaviour that would be changed by the TRIE support is
1296 the silent ignoring of duplicate alternations which are of the form:
1297
1298  / (DUPE|DUPE) X? (?{ ... }) Y /x
1299
1300 Thus EVAL blocks following a trie may be called a different number of times with
1301 and without the optimisation. With the optimisations dupes will be silently
1302 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1303 the following demonstrates:
1304
1305  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1306
1307 which prints out 'word' three times, but
1308
1309  'words'=~/(word|word|word)(?{ print $1 })S/
1310
1311 which doesnt print it out at all. This is due to other optimisations kicking in.
1312
1313 Example of what happens on a structural level:
1314
1315 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1316
1317    1: CURLYM[1] {1,32767}(18)
1318    5:   BRANCH(8)
1319    6:     EXACT <ac>(16)
1320    8:   BRANCH(11)
1321    9:     EXACT <ad>(16)
1322   11:   BRANCH(14)
1323   12:     EXACT <ab>(16)
1324   16:   SUCCEED(0)
1325   17:   NOTHING(18)
1326   18: END(0)
1327
1328 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1329 and should turn into:
1330
1331    1: CURLYM[1] {1,32767}(18)
1332    5:   TRIE(16)
1333         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1334           <ac>
1335           <ad>
1336           <ab>
1337   16:   SUCCEED(0)
1338   17:   NOTHING(18)
1339   18: END(0)
1340
1341 Cases where tail != last would be like /(?foo|bar)baz/:
1342
1343    1: BRANCH(4)
1344    2:   EXACT <foo>(8)
1345    4: BRANCH(7)
1346    5:   EXACT <bar>(8)
1347    7: TAIL(8)
1348    8: EXACT <baz>(10)
1349   10: END(0)
1350
1351 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1352 and would end up looking like:
1353
1354     1: TRIE(8)
1355       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1356         <foo>
1357         <bar>
1358    7: TAIL(8)
1359    8: EXACT <baz>(10)
1360   10: END(0)
1361
1362     d = uvuni_to_utf8_flags(d, uv, 0);
1363
1364 is the recommended Unicode-aware way of saying
1365
1366     *(d++) = uv;
1367 */
1368
1369 #define TRIE_STORE_REVCHAR(val)                                            \
1370     STMT_START {                                                           \
1371         if (UTF) {                                                         \
1372             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1373             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1374             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1375             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1376             SvPOK_on(zlopp);                                               \
1377             SvUTF8_on(zlopp);                                              \
1378             av_push(revcharmap, zlopp);                                    \
1379         } else {                                                           \
1380             char ooooff = (char)val;                                           \
1381             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1382         }                                                                  \
1383         } STMT_END
1384
1385 #define TRIE_READ_CHAR STMT_START {                                                     \
1386     wordlen++;                                                                          \
1387     if ( UTF ) {                                                                        \
1388         /* if it is UTF then it is either already folded, or does not need folding */   \
1389         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1390     }                                                                                   \
1391     else if (folder == PL_fold_latin1) {                                                \
1392         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1393         if ( foldlen > 0 ) {                                                            \
1394            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1395            foldlen -= len;                                                              \
1396            scan += len;                                                                 \
1397            len = 0;                                                                     \
1398         } else {                                                                        \
1399             len = 1;                                                                    \
1400             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1401             skiplen = UNISKIP(uvc);                                                     \
1402             foldlen -= skiplen;                                                         \
1403             scan = foldbuf + skiplen;                                                   \
1404         }                                                                               \
1405     } else {                                                                            \
1406         /* raw data, will be folded later if needed */                                  \
1407         uvc = (U32)*uc;                                                                 \
1408         len = 1;                                                                        \
1409     }                                                                                   \
1410 } STMT_END
1411
1412
1413
1414 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1415     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1416         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1417         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1418     }                                                           \
1419     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1420     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1421     TRIE_LIST_CUR( state )++;                                   \
1422 } STMT_END
1423
1424 #define TRIE_LIST_NEW(state) STMT_START {                       \
1425     Newxz( trie->states[ state ].trans.list,               \
1426         4, reg_trie_trans_le );                                 \
1427      TRIE_LIST_CUR( state ) = 1;                                \
1428      TRIE_LIST_LEN( state ) = 4;                                \
1429 } STMT_END
1430
1431 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1432     U16 dupe= trie->states[ state ].wordnum;                    \
1433     regnode * const noper_next = regnext( noper );              \
1434                                                                 \
1435     DEBUG_r({                                                   \
1436         /* store the word for dumping */                        \
1437         SV* tmp;                                                \
1438         if (OP(noper) != NOTHING)                               \
1439             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1440         else                                                    \
1441             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1442         av_push( trie_words, tmp );                             \
1443     });                                                         \
1444                                                                 \
1445     curword++;                                                  \
1446     trie->wordinfo[curword].prev   = 0;                         \
1447     trie->wordinfo[curword].len    = wordlen;                   \
1448     trie->wordinfo[curword].accept = state;                     \
1449                                                                 \
1450     if ( noper_next < tail ) {                                  \
1451         if (!trie->jump)                                        \
1452             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1453         trie->jump[curword] = (U16)(noper_next - convert);      \
1454         if (!jumper)                                            \
1455             jumper = noper_next;                                \
1456         if (!nextbranch)                                        \
1457             nextbranch= regnext(cur);                           \
1458     }                                                           \
1459                                                                 \
1460     if ( dupe ) {                                               \
1461         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1462         /* chain, so that when the bits of chain are later    */\
1463         /* linked together, the dups appear in the chain      */\
1464         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1465         trie->wordinfo[dupe].prev = curword;                    \
1466     } else {                                                    \
1467         /* we haven't inserted this word yet.                */ \
1468         trie->states[ state ].wordnum = curword;                \
1469     }                                                           \
1470 } STMT_END
1471
1472
1473 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1474      ( ( base + charid >=  ucharcount                                   \
1475          && base + charid < ubound                                      \
1476          && state == trie->trans[ base - ucharcount + charid ].check    \
1477          && trie->trans[ base - ucharcount + charid ].next )            \
1478            ? trie->trans[ base - ucharcount + charid ].next             \
1479            : ( state==1 ? special : 0 )                                 \
1480       )
1481
1482 #define MADE_TRIE       1
1483 #define MADE_JUMP_TRIE  2
1484 #define MADE_EXACT_TRIE 4
1485
1486 STATIC I32
1487 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1488 {
1489     dVAR;
1490     /* first pass, loop through and scan words */
1491     reg_trie_data *trie;
1492     HV *widecharmap = NULL;
1493     AV *revcharmap = newAV();
1494     regnode *cur;
1495     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1496     STRLEN len = 0;
1497     UV uvc = 0;
1498     U16 curword = 0;
1499     U32 next_alloc = 0;
1500     regnode *jumper = NULL;
1501     regnode *nextbranch = NULL;
1502     regnode *convert = NULL;
1503     U32 *prev_states; /* temp array mapping each state to previous one */
1504     /* we just use folder as a flag in utf8 */
1505     const U8 * folder = NULL;
1506
1507 #ifdef DEBUGGING
1508     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1509     AV *trie_words = NULL;
1510     /* along with revcharmap, this only used during construction but both are
1511      * useful during debugging so we store them in the struct when debugging.
1512      */
1513 #else
1514     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1515     STRLEN trie_charcount=0;
1516 #endif
1517     SV *re_trie_maxbuff;
1518     GET_RE_DEBUG_FLAGS_DECL;
1519
1520     PERL_ARGS_ASSERT_MAKE_TRIE;
1521 #ifndef DEBUGGING
1522     PERL_UNUSED_ARG(depth);
1523 #endif
1524
1525     switch (flags) {
1526         case EXACT: break;
1527         case EXACTFA:
1528         case EXACTFU_SS:
1529         case EXACTFU_TRICKYFOLD:
1530         case EXACTFU: folder = PL_fold_latin1; break;
1531         case EXACTF:  folder = PL_fold; break;
1532         case EXACTFL: folder = PL_fold_locale; break;
1533         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1534     }
1535
1536     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1537     trie->refcount = 1;
1538     trie->startstate = 1;
1539     trie->wordcount = word_count;
1540     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1541     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1542     if (flags == EXACT)
1543         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1544     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1545                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1546
1547     DEBUG_r({
1548         trie_words = newAV();
1549     });
1550
1551     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1552     if (!SvIOK(re_trie_maxbuff)) {
1553         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1554     }
1555     DEBUG_OPTIMISE_r({
1556                 PerlIO_printf( Perl_debug_log,
1557                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1558                   (int)depth * 2 + 2, "",
1559                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1560                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1561                   (int)depth);
1562     });
1563
1564    /* Find the node we are going to overwrite */
1565     if ( first == startbranch && OP( last ) != BRANCH ) {
1566         /* whole branch chain */
1567         convert = first;
1568     } else {
1569         /* branch sub-chain */
1570         convert = NEXTOPER( first );
1571     }
1572
1573     /*  -- First loop and Setup --
1574
1575        We first traverse the branches and scan each word to determine if it
1576        contains widechars, and how many unique chars there are, this is
1577        important as we have to build a table with at least as many columns as we
1578        have unique chars.
1579
1580        We use an array of integers to represent the character codes 0..255
1581        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1582        native representation of the character value as the key and IV's for the
1583        coded index.
1584
1585        *TODO* If we keep track of how many times each character is used we can
1586        remap the columns so that the table compression later on is more
1587        efficient in terms of memory by ensuring the most common value is in the
1588        middle and the least common are on the outside.  IMO this would be better
1589        than a most to least common mapping as theres a decent chance the most
1590        common letter will share a node with the least common, meaning the node
1591        will not be compressible. With a middle is most common approach the worst
1592        case is when we have the least common nodes twice.
1593
1594      */
1595
1596     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1597         regnode * const noper = NEXTOPER( cur );
1598         const U8 *uc = (U8*)STRING( noper );
1599         const U8 * const e  = uc + STR_LEN( noper );
1600         STRLEN foldlen = 0;
1601         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1602         STRLEN skiplen = 0;
1603         const U8 *scan = (U8*)NULL;
1604         U32 wordlen      = 0;         /* required init */
1605         STRLEN chars = 0;
1606         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1607
1608         if (OP(noper) == NOTHING) {
1609             trie->minlen= 0;
1610             continue;
1611         }
1612         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1613             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1614                                           regardless of encoding */
1615             if (OP( noper ) == EXACTFU_SS) {
1616                 /* false positives are ok, so just set this */
1617                 TRIE_BITMAP_SET(trie,0xDF);
1618             }
1619         }
1620         for ( ; uc < e ; uc += len ) {
1621             TRIE_CHARCOUNT(trie)++;
1622             TRIE_READ_CHAR;
1623             chars++;
1624             if ( uvc < 256 ) {
1625                 if ( folder ) {
1626                     U8 folded= folder[ (U8) uvc ];
1627                     if ( !trie->charmap[ folded ] ) {
1628                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1629                         TRIE_STORE_REVCHAR( folded );
1630                     }
1631                 }
1632                 if ( !trie->charmap[ uvc ] ) {
1633                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1634                     TRIE_STORE_REVCHAR( uvc );
1635                 }
1636                 if ( set_bit ) {
1637                     /* store the codepoint in the bitmap, and its folded
1638                      * equivalent. */
1639                     TRIE_BITMAP_SET(trie, uvc);
1640
1641                     /* store the folded codepoint */
1642                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1643
1644                     if ( !UTF ) {
1645                         /* store first byte of utf8 representation of
1646                            variant codepoints */
1647                         if (! UNI_IS_INVARIANT(uvc)) {
1648                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1649                         }
1650                     }
1651                     set_bit = 0; /* We've done our bit :-) */
1652                 }
1653             } else {
1654                 SV** svpp;
1655                 if ( !widecharmap )
1656                     widecharmap = newHV();
1657
1658                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1659
1660                 if ( !svpp )
1661                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1662
1663                 if ( !SvTRUE( *svpp ) ) {
1664                     sv_setiv( *svpp, ++trie->uniquecharcount );
1665                     TRIE_STORE_REVCHAR(uvc);
1666                 }
1667             }
1668         }
1669         if( cur == first ) {
1670             trie->minlen = chars;
1671             trie->maxlen = chars;
1672         } else if (chars < trie->minlen) {
1673             trie->minlen = chars;
1674         } else if (chars > trie->maxlen) {
1675             trie->maxlen = chars;
1676         }
1677         if (OP( noper ) == EXACTFU_SS) {
1678             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1679             if (trie->minlen > 1)
1680                 trie->minlen= 1;
1681         }
1682         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1683             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1684              *                - We assume that any such sequence might match a 2 byte string */
1685             if (trie->minlen > 2 )
1686                 trie->minlen= 2;
1687         }
1688
1689     } /* end first pass */
1690     DEBUG_TRIE_COMPILE_r(
1691         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1692                 (int)depth * 2 + 2,"",
1693                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1694                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1695                 (int)trie->minlen, (int)trie->maxlen )
1696     );
1697
1698     /*
1699         We now know what we are dealing with in terms of unique chars and
1700         string sizes so we can calculate how much memory a naive
1701         representation using a flat table  will take. If it's over a reasonable
1702         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1703         conservative but potentially much slower representation using an array
1704         of lists.
1705
1706         At the end we convert both representations into the same compressed
1707         form that will be used in regexec.c for matching with. The latter
1708         is a form that cannot be used to construct with but has memory
1709         properties similar to the list form and access properties similar
1710         to the table form making it both suitable for fast searches and
1711         small enough that its feasable to store for the duration of a program.
1712
1713         See the comment in the code where the compressed table is produced
1714         inplace from the flat tabe representation for an explanation of how
1715         the compression works.
1716
1717     */
1718
1719
1720     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1721     prev_states[1] = 0;
1722
1723     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1724         /*
1725             Second Pass -- Array Of Lists Representation
1726
1727             Each state will be represented by a list of charid:state records
1728             (reg_trie_trans_le) the first such element holds the CUR and LEN
1729             points of the allocated array. (See defines above).
1730
1731             We build the initial structure using the lists, and then convert
1732             it into the compressed table form which allows faster lookups
1733             (but cant be modified once converted).
1734         */
1735
1736         STRLEN transcount = 1;
1737
1738         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1739             "%*sCompiling trie using list compiler\n",
1740             (int)depth * 2 + 2, ""));
1741
1742         trie->states = (reg_trie_state *)
1743             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1744                                   sizeof(reg_trie_state) );
1745         TRIE_LIST_NEW(1);
1746         next_alloc = 2;
1747
1748         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1749
1750             regnode * const noper = NEXTOPER( cur );
1751             U8 *uc           = (U8*)STRING( noper );
1752             const U8 * const e = uc + STR_LEN( noper );
1753             U32 state        = 1;         /* required init */
1754             U16 charid       = 0;         /* sanity init */
1755             U8 *scan         = (U8*)NULL; /* sanity init */
1756             STRLEN foldlen   = 0;         /* required init */
1757             U32 wordlen      = 0;         /* required init */
1758             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1759             STRLEN skiplen   = 0;
1760
1761             if (OP(noper) != NOTHING) {
1762                 for ( ; uc < e ; uc += len ) {
1763
1764                     TRIE_READ_CHAR;
1765
1766                     if ( uvc < 256 ) {
1767                         charid = trie->charmap[ uvc ];
1768                     } else {
1769                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1770                         if ( !svpp ) {
1771                             charid = 0;
1772                         } else {
1773                             charid=(U16)SvIV( *svpp );
1774                         }
1775                     }
1776                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1777                     if ( charid ) {
1778
1779                         U16 check;
1780                         U32 newstate = 0;
1781
1782                         charid--;
1783                         if ( !trie->states[ state ].trans.list ) {
1784                             TRIE_LIST_NEW( state );
1785                         }
1786                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1787                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1788                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1789                                 break;
1790                             }
1791                         }
1792                         if ( ! newstate ) {
1793                             newstate = next_alloc++;
1794                             prev_states[newstate] = state;
1795                             TRIE_LIST_PUSH( state, charid, newstate );
1796                             transcount++;
1797                         }
1798                         state = newstate;
1799                     } else {
1800                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1801                     }
1802                 }
1803             }
1804             TRIE_HANDLE_WORD(state);
1805
1806         } /* end second pass */
1807
1808         /* next alloc is the NEXT state to be allocated */
1809         trie->statecount = next_alloc;
1810         trie->states = (reg_trie_state *)
1811             PerlMemShared_realloc( trie->states,
1812                                    next_alloc
1813                                    * sizeof(reg_trie_state) );
1814
1815         /* and now dump it out before we compress it */
1816         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1817                                                          revcharmap, next_alloc,
1818                                                          depth+1)
1819         );
1820
1821         trie->trans = (reg_trie_trans *)
1822             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1823         {
1824             U32 state;
1825             U32 tp = 0;
1826             U32 zp = 0;
1827
1828
1829             for( state=1 ; state < next_alloc ; state ++ ) {
1830                 U32 base=0;
1831
1832                 /*
1833                 DEBUG_TRIE_COMPILE_MORE_r(
1834                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1835                 );
1836                 */
1837
1838                 if (trie->states[state].trans.list) {
1839                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1840                     U16 maxid=minid;
1841                     U16 idx;
1842
1843                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1844                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1845                         if ( forid < minid ) {
1846                             minid=forid;
1847                         } else if ( forid > maxid ) {
1848                             maxid=forid;
1849                         }
1850                     }
1851                     if ( transcount < tp + maxid - minid + 1) {
1852                         transcount *= 2;
1853                         trie->trans = (reg_trie_trans *)
1854                             PerlMemShared_realloc( trie->trans,
1855                                                      transcount
1856                                                      * sizeof(reg_trie_trans) );
1857                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1858                     }
1859                     base = trie->uniquecharcount + tp - minid;
1860                     if ( maxid == minid ) {
1861                         U32 set = 0;
1862                         for ( ; zp < tp ; zp++ ) {
1863                             if ( ! trie->trans[ zp ].next ) {
1864                                 base = trie->uniquecharcount + zp - minid;
1865                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1866                                 trie->trans[ zp ].check = state;
1867                                 set = 1;
1868                                 break;
1869                             }
1870                         }
1871                         if ( !set ) {
1872                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1873                             trie->trans[ tp ].check = state;
1874                             tp++;
1875                             zp = tp;
1876                         }
1877                     } else {
1878                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1879                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1880                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1881                             trie->trans[ tid ].check = state;
1882                         }
1883                         tp += ( maxid - minid + 1 );
1884                     }
1885                     Safefree(trie->states[ state ].trans.list);
1886                 }
1887                 /*
1888                 DEBUG_TRIE_COMPILE_MORE_r(
1889                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1890                 );
1891                 */
1892                 trie->states[ state ].trans.base=base;
1893             }
1894             trie->lasttrans = tp + 1;
1895         }
1896     } else {
1897         /*
1898            Second Pass -- Flat Table Representation.
1899
1900            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1901            We know that we will need Charcount+1 trans at most to store the data
1902            (one row per char at worst case) So we preallocate both structures
1903            assuming worst case.
1904
1905            We then construct the trie using only the .next slots of the entry
1906            structs.
1907
1908            We use the .check field of the first entry of the node temporarily to
1909            make compression both faster and easier by keeping track of how many non
1910            zero fields are in the node.
1911
1912            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1913            transition.
1914
1915            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1916            number representing the first entry of the node, and state as a
1917            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1918            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1919            are 2 entrys per node. eg:
1920
1921              A B       A B
1922           1. 2 4    1. 3 7
1923           2. 0 3    3. 0 5
1924           3. 0 0    5. 0 0
1925           4. 0 0    7. 0 0
1926
1927            The table is internally in the right hand, idx form. However as we also
1928            have to deal with the states array which is indexed by nodenum we have to
1929            use TRIE_NODENUM() to convert.
1930
1931         */
1932         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1933             "%*sCompiling trie using table compiler\n",
1934             (int)depth * 2 + 2, ""));
1935
1936         trie->trans = (reg_trie_trans *)
1937             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1938                                   * trie->uniquecharcount + 1,
1939                                   sizeof(reg_trie_trans) );
1940         trie->states = (reg_trie_state *)
1941             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1942                                   sizeof(reg_trie_state) );
1943         next_alloc = trie->uniquecharcount + 1;
1944
1945
1946         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1947
1948             regnode * const noper   = NEXTOPER( cur );
1949             const U8 *uc     = (U8*)STRING( noper );
1950             const U8 * const e = uc + STR_LEN( noper );
1951
1952             U32 state        = 1;         /* required init */
1953
1954             U16 charid       = 0;         /* sanity init */
1955             U32 accept_state = 0;         /* sanity init */
1956             U8 *scan         = (U8*)NULL; /* sanity init */
1957
1958             STRLEN foldlen   = 0;         /* required init */
1959             U32 wordlen      = 0;         /* required init */
1960             STRLEN skiplen   = 0;
1961             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1962
1963
1964             if ( OP(noper) != NOTHING ) {
1965                 for ( ; uc < e ; uc += len ) {
1966
1967                     TRIE_READ_CHAR;
1968
1969                     if ( uvc < 256 ) {
1970                         charid = trie->charmap[ uvc ];
1971                     } else {
1972                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1973                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1974                     }
1975                     if ( charid ) {
1976                         charid--;
1977                         if ( !trie->trans[ state + charid ].next ) {
1978                             trie->trans[ state + charid ].next = next_alloc;
1979                             trie->trans[ state ].check++;
1980                             prev_states[TRIE_NODENUM(next_alloc)]
1981                                     = TRIE_NODENUM(state);
1982                             next_alloc += trie->uniquecharcount;
1983                         }
1984                         state = trie->trans[ state + charid ].next;
1985                     } else {
1986                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1987                     }
1988                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1989                 }
1990             }
1991             accept_state = TRIE_NODENUM( state );
1992             TRIE_HANDLE_WORD(accept_state);
1993
1994         } /* end second pass */
1995
1996         /* and now dump it out before we compress it */
1997         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1998                                                           revcharmap,
1999                                                           next_alloc, depth+1));
2000
2001         {
2002         /*
2003            * Inplace compress the table.*
2004
2005            For sparse data sets the table constructed by the trie algorithm will
2006            be mostly 0/FAIL transitions or to put it another way mostly empty.
2007            (Note that leaf nodes will not contain any transitions.)
2008
2009            This algorithm compresses the tables by eliminating most such
2010            transitions, at the cost of a modest bit of extra work during lookup:
2011
2012            - Each states[] entry contains a .base field which indicates the
2013            index in the state[] array wheres its transition data is stored.
2014
2015            - If .base is 0 there are no valid transitions from that node.
2016
2017            - If .base is nonzero then charid is added to it to find an entry in
2018            the trans array.
2019
2020            -If trans[states[state].base+charid].check!=state then the
2021            transition is taken to be a 0/Fail transition. Thus if there are fail
2022            transitions at the front of the node then the .base offset will point
2023            somewhere inside the previous nodes data (or maybe even into a node
2024            even earlier), but the .check field determines if the transition is
2025            valid.
2026
2027            XXX - wrong maybe?
2028            The following process inplace converts the table to the compressed
2029            table: We first do not compress the root node 1,and mark all its
2030            .check pointers as 1 and set its .base pointer as 1 as well. This
2031            allows us to do a DFA construction from the compressed table later,
2032            and ensures that any .base pointers we calculate later are greater
2033            than 0.
2034
2035            - We set 'pos' to indicate the first entry of the second node.
2036
2037            - We then iterate over the columns of the node, finding the first and
2038            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2039            and set the .check pointers accordingly, and advance pos
2040            appropriately and repreat for the next node. Note that when we copy
2041            the next pointers we have to convert them from the original
2042            NODEIDX form to NODENUM form as the former is not valid post
2043            compression.
2044
2045            - If a node has no transitions used we mark its base as 0 and do not
2046            advance the pos pointer.
2047
2048            - If a node only has one transition we use a second pointer into the
2049            structure to fill in allocated fail transitions from other states.
2050            This pointer is independent of the main pointer and scans forward
2051            looking for null transitions that are allocated to a state. When it
2052            finds one it writes the single transition into the "hole".  If the
2053            pointer doesnt find one the single transition is appended as normal.
2054
2055            - Once compressed we can Renew/realloc the structures to release the
2056            excess space.
2057
2058            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2059            specifically Fig 3.47 and the associated pseudocode.
2060
2061            demq
2062         */
2063         const U32 laststate = TRIE_NODENUM( next_alloc );
2064         U32 state, charid;
2065         U32 pos = 0, zp=0;
2066         trie->statecount = laststate;
2067
2068         for ( state = 1 ; state < laststate ; state++ ) {
2069             U8 flag = 0;
2070             const U32 stateidx = TRIE_NODEIDX( state );
2071             const U32 o_used = trie->trans[ stateidx ].check;
2072             U32 used = trie->trans[ stateidx ].check;
2073             trie->trans[ stateidx ].check = 0;
2074
2075             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2076                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2077                     if ( trie->trans[ stateidx + charid ].next ) {
2078                         if (o_used == 1) {
2079                             for ( ; zp < pos ; zp++ ) {
2080                                 if ( ! trie->trans[ zp ].next ) {
2081                                     break;
2082                                 }
2083                             }
2084                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2085                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2086                             trie->trans[ zp ].check = state;
2087                             if ( ++zp > pos ) pos = zp;
2088                             break;
2089                         }
2090                         used--;
2091                     }
2092                     if ( !flag ) {
2093                         flag = 1;
2094                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2095                     }
2096                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2097                     trie->trans[ pos ].check = state;
2098                     pos++;
2099                 }
2100             }
2101         }
2102         trie->lasttrans = pos + 1;
2103         trie->states = (reg_trie_state *)
2104             PerlMemShared_realloc( trie->states, laststate
2105                                    * sizeof(reg_trie_state) );
2106         DEBUG_TRIE_COMPILE_MORE_r(
2107                 PerlIO_printf( Perl_debug_log,
2108                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2109                     (int)depth * 2 + 2,"",
2110                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2111                     (IV)next_alloc,
2112                     (IV)pos,
2113                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2114             );
2115
2116         } /* end table compress */
2117     }
2118     DEBUG_TRIE_COMPILE_MORE_r(
2119             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2120                 (int)depth * 2 + 2, "",
2121                 (UV)trie->statecount,
2122                 (UV)trie->lasttrans)
2123     );
2124     /* resize the trans array to remove unused space */
2125     trie->trans = (reg_trie_trans *)
2126         PerlMemShared_realloc( trie->trans, trie->lasttrans
2127                                * sizeof(reg_trie_trans) );
2128
2129     {   /* Modify the program and insert the new TRIE node */
2130         U8 nodetype =(U8)(flags & 0xFF);
2131         char *str=NULL;
2132
2133 #ifdef DEBUGGING
2134         regnode *optimize = NULL;
2135 #ifdef RE_TRACK_PATTERN_OFFSETS
2136
2137         U32 mjd_offset = 0;
2138         U32 mjd_nodelen = 0;
2139 #endif /* RE_TRACK_PATTERN_OFFSETS */
2140 #endif /* DEBUGGING */
2141         /*
2142            This means we convert either the first branch or the first Exact,
2143            depending on whether the thing following (in 'last') is a branch
2144            or not and whther first is the startbranch (ie is it a sub part of
2145            the alternation or is it the whole thing.)
2146            Assuming its a sub part we convert the EXACT otherwise we convert
2147            the whole branch sequence, including the first.
2148          */
2149         /* Find the node we are going to overwrite */
2150         if ( first != startbranch || OP( last ) == BRANCH ) {
2151             /* branch sub-chain */
2152             NEXT_OFF( first ) = (U16)(last - first);
2153 #ifdef RE_TRACK_PATTERN_OFFSETS
2154             DEBUG_r({
2155                 mjd_offset= Node_Offset((convert));
2156                 mjd_nodelen= Node_Length((convert));
2157             });
2158 #endif
2159             /* whole branch chain */
2160         }
2161 #ifdef RE_TRACK_PATTERN_OFFSETS
2162         else {
2163             DEBUG_r({
2164                 const  regnode *nop = NEXTOPER( convert );
2165                 mjd_offset= Node_Offset((nop));
2166                 mjd_nodelen= Node_Length((nop));
2167             });
2168         }
2169         DEBUG_OPTIMISE_r(
2170             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2171                 (int)depth * 2 + 2, "",
2172                 (UV)mjd_offset, (UV)mjd_nodelen)
2173         );
2174 #endif
2175         /* But first we check to see if there is a common prefix we can
2176            split out as an EXACT and put in front of the TRIE node.  */
2177         trie->startstate= 1;
2178         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2179             U32 state;
2180             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2181                 U32 ofs = 0;
2182                 I32 idx = -1;
2183                 U32 count = 0;
2184                 const U32 base = trie->states[ state ].trans.base;
2185
2186                 if ( trie->states[state].wordnum )
2187                         count = 1;
2188
2189                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2190                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2191                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2192                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2193                     {
2194                         if ( ++count > 1 ) {
2195                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2196                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2197                             if ( state == 1 ) break;
2198                             if ( count == 2 ) {
2199                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2200                                 DEBUG_OPTIMISE_r(
2201                                     PerlIO_printf(Perl_debug_log,
2202                                         "%*sNew Start State=%"UVuf" Class: [",
2203                                         (int)depth * 2 + 2, "",
2204                                         (UV)state));
2205                                 if (idx >= 0) {
2206                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2207                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2208
2209                                     TRIE_BITMAP_SET(trie,*ch);
2210                                     if ( folder )
2211                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2212                                     DEBUG_OPTIMISE_r(
2213                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2214                                     );
2215                                 }
2216                             }
2217                             TRIE_BITMAP_SET(trie,*ch);
2218                             if ( folder )
2219                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2220                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2221                         }
2222                         idx = ofs;
2223                     }
2224                 }
2225                 if ( count == 1 ) {
2226                     SV **tmp = av_fetch( revcharmap, idx, 0);
2227                     STRLEN len;
2228                     char *ch = SvPV( *tmp, len );
2229                     DEBUG_OPTIMISE_r({
2230                         SV *sv=sv_newmortal();
2231                         PerlIO_printf( Perl_debug_log,
2232                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2233                             (int)depth * 2 + 2, "",
2234                             (UV)state, (UV)idx,
2235                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2236                                 PL_colors[0], PL_colors[1],
2237                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2238                                 PERL_PV_ESCAPE_FIRSTCHAR
2239                             )
2240                         );
2241                     });
2242                     if ( state==1 ) {
2243                         OP( convert ) = nodetype;
2244                         str=STRING(convert);
2245                         STR_LEN(convert)=0;
2246                     }
2247                     STR_LEN(convert) += len;
2248                     while (len--)
2249                         *str++ = *ch++;
2250                 } else {
2251 #ifdef DEBUGGING
2252                     if (state>1)
2253                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2254 #endif
2255                     break;
2256                 }
2257             }
2258             trie->prefixlen = (state-1);
2259             if (str) {
2260                 regnode *n = convert+NODE_SZ_STR(convert);
2261                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2262                 trie->startstate = state;
2263                 trie->minlen -= (state - 1);
2264                 trie->maxlen -= (state - 1);
2265 #ifdef DEBUGGING
2266                /* At least the UNICOS C compiler choked on this
2267                 * being argument to DEBUG_r(), so let's just have
2268                 * it right here. */
2269                if (
2270 #ifdef PERL_EXT_RE_BUILD
2271                    1
2272 #else
2273                    DEBUG_r_TEST
2274 #endif
2275                    ) {
2276                    regnode *fix = convert;
2277                    U32 word = trie->wordcount;
2278                    mjd_nodelen++;
2279                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2280                    while( ++fix < n ) {
2281                        Set_Node_Offset_Length(fix, 0, 0);
2282                    }
2283                    while (word--) {
2284                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2285                        if (tmp) {
2286                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2287                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2288                            else
2289                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2290                        }
2291                    }
2292                }
2293 #endif
2294                 if (trie->maxlen) {
2295                     convert = n;
2296                 } else {
2297                     NEXT_OFF(convert) = (U16)(tail - convert);
2298                     DEBUG_r(optimize= n);
2299                 }
2300             }
2301         }
2302         if (!jumper)
2303             jumper = last;
2304         if ( trie->maxlen ) {
2305             NEXT_OFF( convert ) = (U16)(tail - convert);
2306             ARG_SET( convert, data_slot );
2307             /* Store the offset to the first unabsorbed branch in
2308                jump[0], which is otherwise unused by the jump logic.
2309                We use this when dumping a trie and during optimisation. */
2310             if (trie->jump)
2311                 trie->jump[0] = (U16)(nextbranch - convert);
2312
2313             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2314              *   and there is a bitmap
2315              *   and the first "jump target" node we found leaves enough room
2316              * then convert the TRIE node into a TRIEC node, with the bitmap
2317              * embedded inline in the opcode - this is hypothetically faster.
2318              */
2319             if ( !trie->states[trie->startstate].wordnum
2320                  && trie->bitmap
2321                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2322             {
2323                 OP( convert ) = TRIEC;
2324                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2325                 PerlMemShared_free(trie->bitmap);
2326                 trie->bitmap= NULL;
2327             } else
2328                 OP( convert ) = TRIE;
2329
2330             /* store the type in the flags */
2331             convert->flags = nodetype;
2332             DEBUG_r({
2333             optimize = convert
2334                       + NODE_STEP_REGNODE
2335                       + regarglen[ OP( convert ) ];
2336             });
2337             /* XXX We really should free up the resource in trie now,
2338                    as we won't use them - (which resources?) dmq */
2339         }
2340         /* needed for dumping*/
2341         DEBUG_r(if (optimize) {
2342             regnode *opt = convert;
2343
2344             while ( ++opt < optimize) {
2345                 Set_Node_Offset_Length(opt,0,0);
2346             }
2347             /*
2348                 Try to clean up some of the debris left after the
2349                 optimisation.
2350              */
2351             while( optimize < jumper ) {
2352                 mjd_nodelen += Node_Length((optimize));
2353                 OP( optimize ) = OPTIMIZED;
2354                 Set_Node_Offset_Length(optimize,0,0);
2355                 optimize++;
2356             }
2357             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2358         });
2359     } /* end node insert */
2360
2361     /*  Finish populating the prev field of the wordinfo array.  Walk back
2362      *  from each accept state until we find another accept state, and if
2363      *  so, point the first word's .prev field at the second word. If the
2364      *  second already has a .prev field set, stop now. This will be the
2365      *  case either if we've already processed that word's accept state,
2366      *  or that state had multiple words, and the overspill words were
2367      *  already linked up earlier.
2368      */
2369     {
2370         U16 word;
2371         U32 state;
2372         U16 prev;
2373
2374         for (word=1; word <= trie->wordcount; word++) {
2375             prev = 0;
2376             if (trie->wordinfo[word].prev)
2377                 continue;
2378             state = trie->wordinfo[word].accept;
2379             while (state) {
2380                 state = prev_states[state];
2381                 if (!state)
2382                     break;
2383                 prev = trie->states[state].wordnum;
2384                 if (prev)
2385                     break;
2386             }
2387             trie->wordinfo[word].prev = prev;
2388         }
2389         Safefree(prev_states);
2390     }
2391
2392
2393     /* and now dump out the compressed format */
2394     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2395
2396     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2397 #ifdef DEBUGGING
2398     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2399     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2400 #else
2401     SvREFCNT_dec(revcharmap);
2402 #endif
2403     return trie->jump
2404            ? MADE_JUMP_TRIE
2405            : trie->startstate>1
2406              ? MADE_EXACT_TRIE
2407              : MADE_TRIE;
2408 }
2409
2410 STATIC void
2411 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2412 {
2413 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2414
2415    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2416    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2417    ISBN 0-201-10088-6
2418
2419    We find the fail state for each state in the trie, this state is the longest proper
2420    suffix of the current state's 'word' that is also a proper prefix of another word in our
2421    trie. State 1 represents the word '' and is thus the default fail state. This allows
2422    the DFA not to have to restart after its tried and failed a word at a given point, it
2423    simply continues as though it had been matching the other word in the first place.
2424    Consider
2425       'abcdgu'=~/abcdefg|cdgu/
2426    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2427    fail, which would bring us to the state representing 'd' in the second word where we would
2428    try 'g' and succeed, proceeding to match 'cdgu'.
2429  */
2430  /* add a fail transition */
2431     const U32 trie_offset = ARG(source);
2432     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2433     U32 *q;
2434     const U32 ucharcount = trie->uniquecharcount;
2435     const U32 numstates = trie->statecount;
2436     const U32 ubound = trie->lasttrans + ucharcount;
2437     U32 q_read = 0;
2438     U32 q_write = 0;
2439     U32 charid;
2440     U32 base = trie->states[ 1 ].trans.base;
2441     U32 *fail;
2442     reg_ac_data *aho;
2443     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2444     GET_RE_DEBUG_FLAGS_DECL;
2445
2446     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2447 #ifndef DEBUGGING
2448     PERL_UNUSED_ARG(depth);
2449 #endif
2450
2451
2452     ARG_SET( stclass, data_slot );
2453     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2454     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2455     aho->trie=trie_offset;
2456     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2457     Copy( trie->states, aho->states, numstates, reg_trie_state );
2458     Newxz( q, numstates, U32);
2459     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2460     aho->refcount = 1;
2461     fail = aho->fail;
2462     /* initialize fail[0..1] to be 1 so that we always have
2463        a valid final fail state */
2464     fail[ 0 ] = fail[ 1 ] = 1;
2465
2466     for ( charid = 0; charid < ucharcount ; charid++ ) {
2467         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2468         if ( newstate ) {
2469             q[ q_write ] = newstate;
2470             /* set to point at the root */
2471             fail[ q[ q_write++ ] ]=1;
2472         }
2473     }
2474     while ( q_read < q_write) {
2475         const U32 cur = q[ q_read++ % numstates ];
2476         base = trie->states[ cur ].trans.base;
2477
2478         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2479             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2480             if (ch_state) {
2481                 U32 fail_state = cur;
2482                 U32 fail_base;
2483                 do {
2484                     fail_state = fail[ fail_state ];
2485                     fail_base = aho->states[ fail_state ].trans.base;
2486                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2487
2488                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2489                 fail[ ch_state ] = fail_state;
2490                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2491                 {
2492                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2493                 }
2494                 q[ q_write++ % numstates] = ch_state;
2495             }
2496         }
2497     }
2498     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2499        when we fail in state 1, this allows us to use the
2500        charclass scan to find a valid start char. This is based on the principle
2501        that theres a good chance the string being searched contains lots of stuff
2502        that cant be a start char.
2503      */
2504     fail[ 0 ] = fail[ 1 ] = 0;
2505     DEBUG_TRIE_COMPILE_r({
2506         PerlIO_printf(Perl_debug_log,
2507                       "%*sStclass Failtable (%"UVuf" states): 0",
2508                       (int)(depth * 2), "", (UV)numstates
2509         );
2510         for( q_read=1; q_read<numstates; q_read++ ) {
2511             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2512         }
2513         PerlIO_printf(Perl_debug_log, "\n");
2514     });
2515     Safefree(q);
2516     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2517 }
2518
2519
2520 /*
2521  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2522  * These need to be revisited when a newer toolchain becomes available.
2523  */
2524 #if defined(__sparc64__) && defined(__GNUC__)
2525 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2526 #       undef  SPARC64_GCC_WORKAROUND
2527 #       define SPARC64_GCC_WORKAROUND 1
2528 #   endif
2529 #endif
2530
2531 #define DEBUG_PEEP(str,scan,depth) \
2532     DEBUG_OPTIMISE_r({if (scan){ \
2533        SV * const mysv=sv_newmortal(); \
2534        regnode *Next = regnext(scan); \
2535        regprop(RExC_rx, mysv, scan); \
2536        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2537        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2538        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2539    }});
2540
2541
2542 /* The below joins as many adjacent EXACTish nodes as possible into a single
2543  * one, and looks for problematic sequences of characters whose folds vs.
2544  * non-folds have sufficiently different lengths, that the optimizer would be
2545  * fooled into rejecting legitimate matches of them, and the trie construction
2546  * code can't cope with them.  The joining is only done if:
2547  * 1) there is room in the current conglomerated node to entirely contain the
2548  *    next one.
2549  * 2) they are the exact same node type
2550  *
2551  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2552  * these get optimized out
2553  *
2554  * If there are problematic code sequences, *min_subtract is set to the delta
2555  * that the minimum size of the node can be less than its actual size.  And,
2556  * the node type of the result is changed to reflect that it contains these
2557  * sequences.
2558  *
2559  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2560  * and contains LATIN SMALL LETTER SHARP S
2561  *
2562  * This is as good a place as any to discuss the design of handling these
2563  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2564  * are three code points in Unicode whose folded lengths differ so much from
2565  * the un-folded lengths that it causes problems for the optimizer and trie
2566  * construction.  Why only these are problematic, and not others where lengths
2567  * also differ is something I (khw) do not understand.  New versions of Unicode
2568  * might add more such code points.  Hopefully the logic in fold_grind.t that
2569  * figures out what to test (in part by verifying that each size-combination
2570  * gets tested) will catch any that do come along, so they can be added to the
2571  * special handling below.  The chances of new ones are actually rather small,
2572  * as most, if not all, of the world's scripts that have casefolding have
2573  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2574  * made to allow compatibility with pre-existing standards, and almost all of
2575  * those have already been dealt with.  These would otherwise be the most
2576  * likely candidates for generating further tricky sequences.  In other words,
2577  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2578  * with pre-existing standards, and there aren't many of those left.
2579  *
2580  * The previous designs for dealing with these involved assigning a special
2581  * node for them.  This approach doesn't work, as evidenced by this example:
2582  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2583  * Both these fold to "sss", but if the pattern is parsed to create a node of
2584  * that would match just the \xDF, it won't be able to handle the case where a
2585  * successful match would have to cross the node's boundary.  The new approach
2586  * that hopefully generally solves the problem generates an EXACTFU_SS node
2587  * that is "sss".
2588  *
2589  * There are a number of components to the approach (a lot of work for just
2590  * three code points!):
2591  * 1)   This routine examines each EXACTFish node that could contain the
2592  *      problematic sequences.  It returns in *min_subtract how much to
2593  *      subtract from the the actual length of the string to get a real minimum
2594  *      for one that could match it.  This number is usually 0 except for the
2595  *      problematic sequences.  This delta is used by the caller to adjust the
2596  *      min length of the match, and the delta between min and max, so that the
2597  *      optimizer doesn't reject these possibilities based on size constraints.
2598  * 2)   These sequences are not currently correctly handled by the trie code
2599  *      either, so it changes the joined node type to ops that are not handled
2600  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2601  * 3)   This is sufficient for the two Greek sequences (described below), but
2602  *      the one involving the Sharp s (\xDF) needs more.  The node type
2603  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2604  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2605  *      case where there is a possible fold length change.  That means that a
2606  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2607  *      itself with length changes, and so can be processed faster.  regexec.c
2608  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2609  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2610  *      However, probably mostly for historical reasons, the pre-folding isn't
2611  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2612  *      nodes, as what they fold to isn't known until runtime.)  The fold
2613  *      possibilities for the non-UTF8 patterns are quite simple, except for
2614  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2615  *      are members of a fold-pair, and arrays are set up for all of them
2616  *      that quickly find the other member of the pair.  It might actually
2617  *      be faster to pre-fold these, but it isn't currently done, except for
2618  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2619  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2620  *      issues described in the next item.
2621  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2622  *      'ss' or not is not knowable at compile time.  It will match iff the
2623  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2624  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2625  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2626  *      described in item 3).  An assumption that the optimizer part of
2627  *      regexec.c (probably unwittingly) makes is that a character in the
2628  *      pattern corresponds to at most a single character in the target string.
2629  *      (And I do mean character, and not byte here, unlike other parts of the
2630  *      documentation that have never been updated to account for multibyte
2631  *      Unicode.)  This assumption is wrong only in this case, as all other
2632  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2633  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2634  *      reluctant to try to change this assumption, so instead the code punts.
2635  *      This routine examines EXACTF nodes for the sharp s, and returns a
2636  *      boolean indicating whether or not the node is an EXACTF node that
2637  *      contains a sharp s.  When it is true, the caller sets a flag that later
2638  *      causes the optimizer in this file to not set values for the floating
2639  *      and fixed string lengths, and thus avoids the optimizer code in
2640  *      regexec.c that makes the invalid assumption.  Thus, there is no
2641  *      optimization based on string lengths for EXACTF nodes that contain the
2642  *      sharp s.  This only happens for /id rules (which means the pattern
2643  *      isn't in UTF-8).
2644  */
2645
2646 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2647     if (PL_regkind[OP(scan)] == EXACT) \
2648         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2649
2650 STATIC U32
2651 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2652     /* Merge several consecutive EXACTish nodes into one. */
2653     regnode *n = regnext(scan);
2654     U32 stringok = 1;
2655     regnode *next = scan + NODE_SZ_STR(scan);
2656     U32 merged = 0;
2657     U32 stopnow = 0;
2658 #ifdef DEBUGGING
2659     regnode *stop = scan;
2660     GET_RE_DEBUG_FLAGS_DECL;
2661 #else
2662     PERL_UNUSED_ARG(depth);
2663 #endif
2664
2665     PERL_ARGS_ASSERT_JOIN_EXACT;
2666 #ifndef EXPERIMENTAL_INPLACESCAN
2667     PERL_UNUSED_ARG(flags);
2668     PERL_UNUSED_ARG(val);
2669 #endif
2670     DEBUG_PEEP("join",scan,depth);
2671
2672     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2673      * EXACT ones that are mergeable to the current one. */
2674     while (n
2675            && (PL_regkind[OP(n)] == NOTHING
2676                || (stringok && OP(n) == OP(scan)))
2677            && NEXT_OFF(n)
2678            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2679     {
2680
2681         if (OP(n) == TAIL || n > next)
2682             stringok = 0;
2683         if (PL_regkind[OP(n)] == NOTHING) {
2684             DEBUG_PEEP("skip:",n,depth);
2685             NEXT_OFF(scan) += NEXT_OFF(n);
2686             next = n + NODE_STEP_REGNODE;
2687 #ifdef DEBUGGING
2688             if (stringok)
2689                 stop = n;
2690 #endif
2691             n = regnext(n);
2692         }
2693         else if (stringok) {
2694             const unsigned int oldl = STR_LEN(scan);
2695             regnode * const nnext = regnext(n);
2696
2697             if (oldl + STR_LEN(n) > U8_MAX)
2698                 break;
2699
2700             DEBUG_PEEP("merg",n,depth);
2701             merged++;
2702
2703             NEXT_OFF(scan) += NEXT_OFF(n);
2704             STR_LEN(scan) += STR_LEN(n);
2705             next = n + NODE_SZ_STR(n);
2706             /* Now we can overwrite *n : */
2707             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2708 #ifdef DEBUGGING
2709             stop = next - 1;
2710 #endif
2711             n = nnext;
2712             if (stopnow) break;
2713         }
2714
2715 #ifdef EXPERIMENTAL_INPLACESCAN
2716         if (flags && !NEXT_OFF(n)) {
2717             DEBUG_PEEP("atch", val, depth);
2718             if (reg_off_by_arg[OP(n)]) {
2719                 ARG_SET(n, val - n);
2720             }
2721             else {
2722                 NEXT_OFF(n) = val - n;
2723             }
2724             stopnow = 1;
2725         }
2726 #endif
2727     }
2728
2729     *min_subtract = 0;
2730     *has_exactf_sharp_s = FALSE;
2731
2732     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2733      * can now analyze for sequences of problematic code points.  (Prior to
2734      * this final joining, sequences could have been split over boundaries, and
2735      * hence missed).  The sequences only happen in folding, hence for any
2736      * non-EXACT EXACTish node */
2737     if (OP(scan) != EXACT) {
2738         U8 *s;
2739         U8 * s0 = (U8*) STRING(scan);
2740         U8 * const s_end = s0 + STR_LEN(scan);
2741
2742         /* The below is perhaps overboard, but this allows us to save a test
2743          * each time through the loop at the expense of a mask.  This is
2744          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2745          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2746          * This uses an exclusive 'or' to find that bit and then inverts it to
2747          * form a mask, with just a single 0, in the bit position where 'S' and
2748          * 's' differ. */
2749         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2750         const U8 s_masked = 's' & S_or_s_mask;
2751
2752         /* One pass is made over the node's string looking for all the
2753          * possibilities.  to avoid some tests in the loop, there are two main
2754          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2755          * non-UTF-8 */
2756         if (UTF) {
2757
2758             /* There are two problematic Greek code points in Unicode
2759              * casefolding
2760              *
2761              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2762              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2763              *
2764              * which casefold to
2765              *
2766              * Unicode                      UTF-8
2767              *
2768              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2769              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2770              *
2771              * This means that in case-insensitive matching (or "loose
2772              * matching", as Unicode calls it), an EXACTF of length six (the
2773              * UTF-8 encoded byte length of the above casefolded versions) can
2774              * match a target string of length two (the byte length of UTF-8
2775              * encoded U+0390 or U+03B0).  This would rather mess up the
2776              * minimum length computation.  (there are other code points that
2777              * also fold to these two sequences, but the delta is smaller)
2778              *
2779              * If these sequences are found, the minimum length is decreased by
2780              * four (six minus two).
2781              *
2782              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2783              * LETTER SHARP S.  We decrease the min length by 1 for each
2784              * occurrence of 'ss' found */
2785
2786 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2787 #           define U390_first_byte 0xb4
2788             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2789 #           define U3B0_first_byte 0xb5
2790             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2791 #else
2792 #           define U390_first_byte 0xce
2793             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2794 #           define U3B0_first_byte 0xcf
2795             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2796 #endif
2797             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2798                                                  yields a net of 0 */
2799             /* Examine the string for one of the problematic sequences */
2800             for (s = s0;
2801                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2802                                  * sequence we are looking for is 2 */
2803                  s += UTF8SKIP(s))
2804             {
2805
2806                 /* Look for the first byte in each problematic sequence */
2807                 switch (*s) {
2808                     /* We don't have to worry about other things that fold to
2809                      * 's' (such as the long s, U+017F), as all above-latin1
2810                      * code points have been pre-folded */
2811                     case 's':
2812                     case 'S':
2813
2814                         /* Current character is an 's' or 'S'.  If next one is
2815                          * as well, we have the dreaded sequence */
2816                         if (((*(s+1) & S_or_s_mask) == s_masked)
2817                             /* These two node types don't have special handling
2818                              * for 'ss' */
2819                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2820                         {
2821                             *min_subtract += 1;
2822                             OP(scan) = EXACTFU_SS;
2823                             s++;    /* No need to look at this character again */
2824                         }
2825                         break;
2826
2827                     case U390_first_byte:
2828                         if (s_end - s >= len
2829
2830                             /* The 1's are because are skipping comparing the
2831                              * first byte */
2832                             && memEQ(s + 1, U390_tail, len - 1))
2833                         {
2834                             goto greek_sequence;
2835                         }
2836                         break;
2837
2838                     case U3B0_first_byte:
2839                         if (! (s_end - s >= len
2840                                && memEQ(s + 1, U3B0_tail, len - 1)))
2841                         {
2842                             break;
2843                         }
2844                       greek_sequence:
2845                         *min_subtract += 4;
2846
2847                         /* This can't currently be handled by trie's, so change
2848                          * the node type to indicate this.  If EXACTFA and
2849                          * EXACTFL were ever to be handled by trie's, this
2850                          * would have to be changed.  If this node has already
2851                          * been changed to EXACTFU_SS in this loop, leave it as
2852                          * is.  (I (khw) think it doesn't matter in regexec.c
2853                          * for UTF patterns, but no need to change it */
2854                         if (OP(scan) == EXACTFU) {
2855                             OP(scan) = EXACTFU_TRICKYFOLD;
2856                         }
2857                         s += 6; /* We already know what this sequence is.  Skip
2858                                    the rest of it */
2859                         break;
2860                 }
2861             }
2862         }
2863         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2864
2865             /* Here, the pattern is not UTF-8.  We need to look only for the
2866              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2867              * in the final position.  Otherwise we can stop looking 1 byte
2868              * earlier because have to find both the first and second 's' */
2869             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2870
2871             for (s = s0; s < upper; s++) {
2872                 switch (*s) {
2873                     case 'S':
2874                     case 's':
2875                         if (s_end - s > 1
2876                             && ((*(s+1) & S_or_s_mask) == s_masked))
2877                         {
2878                             *min_subtract += 1;
2879
2880                             /* EXACTF nodes need to know that the minimum
2881                              * length changed so that a sharp s in the string
2882                              * can match this ss in the pattern, but they
2883                              * remain EXACTF nodes, as they are not trie'able,
2884                              * so don't have to invent a new node type to
2885                              * exclude them from the trie code */
2886                             if (OP(scan) != EXACTF) {
2887                                 OP(scan) = EXACTFU_SS;
2888                             }
2889                             s++;
2890                         }
2891                         break;
2892                     case LATIN_SMALL_LETTER_SHARP_S:
2893                         if (OP(scan) == EXACTF) {
2894                             *has_exactf_sharp_s = TRUE;
2895                         }
2896                         break;
2897                 }
2898             }
2899         }
2900     }
2901
2902 #ifdef DEBUGGING
2903     /* Allow dumping but overwriting the collection of skipped
2904      * ops and/or strings with fake optimized ops */
2905     n = scan + NODE_SZ_STR(scan);
2906     while (n <= stop) {
2907         OP(n) = OPTIMIZED;
2908         FLAGS(n) = 0;
2909         NEXT_OFF(n) = 0;
2910         n++;
2911     }
2912 #endif
2913     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2914     return stopnow;
2915 }
2916
2917 /* REx optimizer.  Converts nodes into quicker variants "in place".
2918    Finds fixed substrings.  */
2919
2920 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2921    to the position after last scanned or to NULL. */
2922
2923 #define INIT_AND_WITHP \
2924     assert(!and_withp); \
2925     Newx(and_withp,1,struct regnode_charclass_class); \
2926     SAVEFREEPV(and_withp)
2927
2928 /* this is a chain of data about sub patterns we are processing that
2929    need to be handled separately/specially in study_chunk. Its so
2930    we can simulate recursion without losing state.  */
2931 struct scan_frame;
2932 typedef struct scan_frame {
2933     regnode *last;  /* last node to process in this frame */
2934     regnode *next;  /* next node to process when last is reached */
2935     struct scan_frame *prev; /*previous frame*/
2936     I32 stop; /* what stopparen do we use */
2937 } scan_frame;
2938
2939
2940 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2941
2942 #define CASE_SYNST_FNC(nAmE)                                       \
2943 case nAmE:                                                         \
2944     if (flags & SCF_DO_STCLASS_AND) {                              \
2945             for (value = 0; value < 256; value++)                  \
2946                 if (!is_ ## nAmE ## _cp(value))                       \
2947                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2948     }                                                              \
2949     else {                                                         \
2950             for (value = 0; value < 256; value++)                  \
2951                 if (is_ ## nAmE ## _cp(value))                        \
2952                     ANYOF_BITMAP_SET(data->start_class, value);    \
2953     }                                                              \
2954     break;                                                         \
2955 case N ## nAmE:                                                    \
2956     if (flags & SCF_DO_STCLASS_AND) {                              \
2957             for (value = 0; value < 256; value++)                   \
2958                 if (is_ ## nAmE ## _cp(value))                         \
2959                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2960     }                                                               \
2961     else {                                                          \
2962             for (value = 0; value < 256; value++)                   \
2963                 if (!is_ ## nAmE ## _cp(value))                        \
2964                     ANYOF_BITMAP_SET(data->start_class, value);     \
2965     }                                                               \
2966     break
2967
2968
2969
2970 STATIC I32
2971 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2972                         I32 *minlenp, I32 *deltap,
2973                         regnode *last,
2974                         scan_data_t *data,
2975                         I32 stopparen,
2976                         U8* recursed,
2977                         struct regnode_charclass_class *and_withp,
2978                         U32 flags, U32 depth)
2979                         /* scanp: Start here (read-write). */
2980                         /* deltap: Write maxlen-minlen here. */
2981                         /* last: Stop before this one. */
2982                         /* data: string data about the pattern */
2983                         /* stopparen: treat close N as END */
2984                         /* recursed: which subroutines have we recursed into */
2985                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2986 {
2987     dVAR;
2988     I32 min = 0, pars = 0, code;
2989     regnode *scan = *scanp, *next;
2990     I32 delta = 0;
2991     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2992     int is_inf_internal = 0;            /* The studied chunk is infinite */
2993     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2994     scan_data_t data_fake;
2995     SV *re_trie_maxbuff = NULL;
2996     regnode *first_non_open = scan;
2997     I32 stopmin = I32_MAX;
2998     scan_frame *frame = NULL;
2999     GET_RE_DEBUG_FLAGS_DECL;
3000
3001     PERL_ARGS_ASSERT_STUDY_CHUNK;
3002
3003 #ifdef DEBUGGING
3004     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3005 #endif
3006
3007     if ( depth == 0 ) {
3008         while (first_non_open && OP(first_non_open) == OPEN)
3009             first_non_open=regnext(first_non_open);
3010     }
3011
3012
3013   fake_study_recurse:
3014     while ( scan && OP(scan) != END && scan < last ){
3015         UV min_subtract = 0;    /* How much to subtract from the minimum node
3016                                    length to get a real minimum (because the
3017                                    folded version may be shorter) */
3018         bool has_exactf_sharp_s = FALSE;
3019         /* Peephole optimizer: */
3020         DEBUG_STUDYDATA("Peep:", data,depth);
3021         DEBUG_PEEP("Peep",scan,depth);
3022
3023         /* Its not clear to khw or hv why this is done here, and not in the
3024          * clauses that deal with EXACT nodes.  khw's guess is that it's
3025          * because of a previous design */
3026         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3027
3028         /* Follow the next-chain of the current node and optimize
3029            away all the NOTHINGs from it.  */
3030         if (OP(scan) != CURLYX) {
3031             const int max = (reg_off_by_arg[OP(scan)]
3032                        ? I32_MAX
3033                        /* I32 may be smaller than U16 on CRAYs! */
3034                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3035             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3036             int noff;
3037             regnode *n = scan;
3038
3039             /* Skip NOTHING and LONGJMP. */
3040             while ((n = regnext(n))
3041                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3042                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3043                    && off + noff < max)
3044                 off += noff;
3045             if (reg_off_by_arg[OP(scan)])
3046                 ARG(scan) = off;
3047             else
3048                 NEXT_OFF(scan) = off;
3049         }
3050
3051
3052
3053         /* The principal pseudo-switch.  Cannot be a switch, since we
3054            look into several different things.  */
3055         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3056                    || OP(scan) == IFTHEN) {
3057             next = regnext(scan);
3058             code = OP(scan);
3059             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3060
3061             if (OP(next) == code || code == IFTHEN) {
3062                 /* NOTE - There is similar code to this block below for handling
3063                    TRIE nodes on a re-study.  If you change stuff here check there
3064                    too. */
3065                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3066                 struct regnode_charclass_class accum;
3067                 regnode * const startbranch=scan;
3068
3069                 if (flags & SCF_DO_SUBSTR)
3070                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3071                 if (flags & SCF_DO_STCLASS)
3072                     cl_init_zero(pRExC_state, &accum);
3073
3074                 while (OP(scan) == code) {
3075                     I32 deltanext, minnext, f = 0, fake;
3076                     struct regnode_charclass_class this_class;
3077
3078                     num++;
3079                     data_fake.flags = 0;
3080                     if (data) {
3081                         data_fake.whilem_c = data->whilem_c;
3082                         data_fake.last_closep = data->last_closep;
3083                     }
3084                     else
3085                         data_fake.last_closep = &fake;
3086
3087                     data_fake.pos_delta = delta;
3088                     next = regnext(scan);
3089                     scan = NEXTOPER(scan);
3090                     if (code != BRANCH)
3091                         scan = NEXTOPER(scan);
3092                     if (flags & SCF_DO_STCLASS) {
3093                         cl_init(pRExC_state, &this_class);
3094                         data_fake.start_class = &this_class;
3095                         f = SCF_DO_STCLASS_AND;
3096                     }
3097                     if (flags & SCF_WHILEM_VISITED_POS)
3098                         f |= SCF_WHILEM_VISITED_POS;
3099
3100                     /* we suppose the run is continuous, last=next...*/
3101                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3102                                           next, &data_fake,
3103                                           stopparen, recursed, NULL, f,depth+1);
3104                     if (min1 > minnext)
3105                         min1 = minnext;
3106                     if (max1 < minnext + deltanext)
3107                         max1 = minnext + deltanext;
3108                     if (deltanext == I32_MAX)
3109                         is_inf = is_inf_internal = 1;
3110                     scan = next;
3111                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3112                         pars++;
3113                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3114                         if ( stopmin > minnext)
3115                             stopmin = min + min1;
3116                         flags &= ~SCF_DO_SUBSTR;
3117                         if (data)
3118                             data->flags |= SCF_SEEN_ACCEPT;
3119                     }
3120                     if (data) {
3121                         if (data_fake.flags & SF_HAS_EVAL)
3122                             data->flags |= SF_HAS_EVAL;
3123                         data->whilem_c = data_fake.whilem_c;
3124                     }
3125                     if (flags & SCF_DO_STCLASS)
3126                         cl_or(pRExC_state, &accum, &this_class);
3127                 }
3128                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3129                     min1 = 0;
3130                 if (flags & SCF_DO_SUBSTR) {
3131                     data->pos_min += min1;
3132                     data->pos_delta += max1 - min1;
3133                     if (max1 != min1 || is_inf)
3134                         data->longest = &(data->longest_float);
3135                 }
3136                 min += min1;
3137                 delta += max1 - min1;
3138                 if (flags & SCF_DO_STCLASS_OR) {
3139                     cl_or(pRExC_state, data->start_class, &accum);
3140                     if (min1) {
3141                         cl_and(data->start_class, and_withp);
3142                         flags &= ~SCF_DO_STCLASS;
3143                     }
3144                 }
3145                 else if (flags & SCF_DO_STCLASS_AND) {
3146                     if (min1) {
3147                         cl_and(data->start_class, &accum);
3148                         flags &= ~SCF_DO_STCLASS;
3149                     }
3150                     else {
3151                         /* Switch to OR mode: cache the old value of
3152                          * data->start_class */
3153                         INIT_AND_WITHP;
3154                         StructCopy(data->start_class, and_withp,
3155                                    struct regnode_charclass_class);
3156                         flags &= ~SCF_DO_STCLASS_AND;
3157                         StructCopy(&accum, data->start_class,
3158                                    struct regnode_charclass_class);
3159                         flags |= SCF_DO_STCLASS_OR;
3160                         data->start_class->flags |= ANYOF_EOS;
3161                     }
3162                 }
3163
3164                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3165                 /* demq.
3166
3167                    Assuming this was/is a branch we are dealing with: 'scan' now
3168                    points at the item that follows the branch sequence, whatever
3169                    it is. We now start at the beginning of the sequence and look
3170                    for subsequences of
3171
3172                    BRANCH->EXACT=>x1
3173                    BRANCH->EXACT=>x2
3174                    tail
3175
3176                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3177
3178                    If we can find such a subsequence we need to turn the first
3179                    element into a trie and then add the subsequent branch exact
3180                    strings to the trie.
3181
3182                    We have two cases
3183
3184                      1. patterns where the whole set of branches can be converted.
3185
3186                      2. patterns where only a subset can be converted.
3187
3188                    In case 1 we can replace the whole set with a single regop
3189                    for the trie. In case 2 we need to keep the start and end
3190                    branches so
3191
3192                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3193                      becomes BRANCH TRIE; BRANCH X;
3194
3195                   There is an additional case, that being where there is a
3196                   common prefix, which gets split out into an EXACT like node
3197                   preceding the TRIE node.
3198
3199                   If x(1..n)==tail then we can do a simple trie, if not we make
3200                   a "jump" trie, such that when we match the appropriate word
3201                   we "jump" to the appropriate tail node. Essentially we turn
3202                   a nested if into a case structure of sorts.
3203
3204                 */
3205
3206                     int made=0;
3207                     if (!re_trie_maxbuff) {
3208                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3209                         if (!SvIOK(re_trie_maxbuff))
3210                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3211                     }
3212                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3213                         regnode *cur;
3214                         regnode *first = (regnode *)NULL;
3215                         regnode *last = (regnode *)NULL;
3216                         regnode *tail = scan;
3217                         U8 trietype = 0;
3218                         U32 count=0;
3219
3220 #ifdef DEBUGGING
3221                         SV * const mysv = sv_newmortal();       /* for dumping */
3222 #endif
3223                         /* var tail is used because there may be a TAIL
3224                            regop in the way. Ie, the exacts will point to the
3225                            thing following the TAIL, but the last branch will
3226                            point at the TAIL. So we advance tail. If we
3227                            have nested (?:) we may have to move through several
3228                            tails.
3229                          */
3230
3231                         while ( OP( tail ) == TAIL ) {
3232                             /* this is the TAIL generated by (?:) */
3233                             tail = regnext( tail );
3234                         }
3235
3236
3237                         DEBUG_OPTIMISE_r({
3238                             regprop(RExC_rx, mysv, tail );
3239                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3240                                 (int)depth * 2 + 2, "",
3241                                 "Looking for TRIE'able sequences. Tail node is: ",
3242                                 SvPV_nolen_const( mysv )
3243                             );
3244                         });
3245
3246                         /*
3247
3248                             Step through the branches
3249                                 cur represents each branch,
3250                                 noper is the first thing to be matched as part of that branch
3251                                 noper_next is the regnext() of that node.
3252
3253                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3254                             via a "jump trie" but we also support building with NOJUMPTRIE,
3255                             which restricts the trie logic to structures like /FOO|BAR/.
3256
3257                             If noper is a trieable nodetype then the branch is a possible optimization
3258                             target. If we are building under NOJUMPTRIE then we require that noper_next
3259                             is the same as scan (our current position in the regex program).
3260
3261                             Once we have two or more consecutive such branches we can create a
3262                             trie of the EXACT's contents and stitch it in place into the program.
3263
3264                             If the sequence represents all of the branches in the alternation we
3265                             replace the entire thing with a single TRIE node.
3266
3267                             Otherwise when it is a subsequence we need to stitch it in place and
3268                             replace only the relevant branches. This means the first branch has
3269                             to remain as it is used by the alternation logic, and its next pointer,
3270                             and needs to be repointed at the item on the branch chain following
3271                             the last branch we have optimized away.
3272
3273                             This could be either a BRANCH, in which case the subsequence is internal,
3274                             or it could be the item following the branch sequence in which case the
3275                             subsequence is at the end (which does not necessarily mean the first node
3276                             is the start of the alternation).
3277
3278                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3279
3280                                 optype          |  trietype
3281                                 ----------------+-----------
3282                                 NOTHING         | NOTHING
3283                                 EXACT           | EXACT
3284                                 EXACTFU         | EXACTFU
3285                                 EXACTFU_SS      | EXACTFU
3286                                 EXACTFU_TRICKYFOLD | EXACTFU
3287                                 EXACTFA         | 0
3288
3289
3290                         */
3291 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3292                        ( EXACT == (X) )   ? EXACT :        \
3293                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3294                        0 )
3295
3296                         /* dont use tail as the end marker for this traverse */
3297                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3298                             regnode * const noper = NEXTOPER( cur );
3299                             U8 noper_type = OP( noper );
3300                             U8 noper_trietype = TRIE_TYPE( noper_type );
3301 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3302                             regnode * const noper_next = regnext( noper );
3303 #endif
3304
3305                             DEBUG_OPTIMISE_r({
3306                                 regprop(RExC_rx, mysv, cur);
3307                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3308                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3309
3310                                 regprop(RExC_rx, mysv, noper);
3311                                 PerlIO_printf( Perl_debug_log, " -> %s",
3312                                     SvPV_nolen_const(mysv));
3313
3314                                 if ( noper_next ) {
3315                                   regprop(RExC_rx, mysv, noper_next );
3316                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3317                                     SvPV_nolen_const(mysv));
3318                                 }
3319                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3320                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3321                             });
3322
3323                             /* Is noper a trieable nodetype that can be merged with the
3324                              * current trie (if there is one)? */
3325                             if ( noper_trietype
3326                                   &&
3327                                   (
3328                                         ( noper_trietype == NOTHING )
3329                                         ||
3330                                         ( trietype == NOTHING )
3331                                         ||
3332                                         ( trietype == noper_trietype )
3333                                   )
3334 #ifdef NOJUMPTRIE
3335                                   && noper_next == tail
3336 #endif
3337                                   && count < U16_MAX)
3338                             {
3339                                 /* Handle mergable triable node
3340                                  * Either we are the first node in a new trieable sequence,
3341                                  * in which case we do some bookkeeping, otherwise we update
3342                                  * the end pointer. */
3343                                 count++;
3344                                 if ( !first ) {
3345                                     first = cur;
3346                                     trietype = noper_trietype;
3347                                 } else {
3348                                     if ( trietype == NOTHING )
3349                                         trietype = noper_trietype;
3350                                     last = cur;
3351                                 }
3352                             } /* end handle mergable triable node */
3353                             else {
3354                                 /* handle unmergable node -
3355                                  * noper may either be a triable node which can not be tried
3356                                  * together with the current trie, or a non triable node */
3357                                 if ( last && trietype != NOTHING ) {
3358                                     /* if last is set then we have found at least two triable branch
3359                                      * sequences in a row of a similar trietype so we can turn them
3360                                      * into a trie */
3361                                     make_trie( pRExC_state,
3362                                             startbranch, first, cur, tail, count,
3363                                             trietype, depth+1 );
3364                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3365                                 }
3366                                 if ( noper_trietype
3367 #ifdef NOJUMPTRIE
3368                                      && noper_next == tail
3369 #endif
3370                                 ){
3371                                     /* noper is triable, so we can start a new trie sequence */
3372                                     count = 1;
3373                                     first = cur;
3374                                     trietype = noper_trietype;
3375                                 } else if (first) {
3376                                     /* if we already saw a first but the current node is not triable then we have
3377                                      * to reset the first information. */
3378                                     count = 0;
3379                                     first = NULL;
3380                                     trietype = 0;
3381                                 }
3382                             } /* end handle unmergable node */
3383                         } /* loop over branches */
3384                         DEBUG_OPTIMISE_r({
3385                             regprop(RExC_rx, mysv, cur);
3386                             PerlIO_printf( Perl_debug_log,
3387                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3388                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3389
3390                         });
3391                         if ( last && trietype != NOTHING ) {
3392                             /* the last branch of the sequence was part of a trie,
3393                              * so we have to construct it here outside of the loop
3394                              */
3395                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3396 #ifdef TRIE_STUDY_OPT
3397                             if ( ((made == MADE_EXACT_TRIE &&
3398                                  startbranch == first)
3399                                  || ( first_non_open == first )) &&
3400                                  depth==0 ) {
3401                                 flags |= SCF_TRIE_RESTUDY;
3402                                 if ( startbranch == first
3403                                      && scan == tail )
3404                                 {
3405                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3406                                 }
3407                             }
3408 #endif
3409                         } /* end if ( last) */
3410                     } /* TRIE_MAXBUF is non zero */
3411
3412                 } /* do trie */
3413
3414             }
3415             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3416                 scan = NEXTOPER(NEXTOPER(scan));
3417             } else                      /* single branch is optimized. */
3418                 scan = NEXTOPER(scan);
3419             continue;
3420         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3421             scan_frame *newframe = NULL;
3422             I32 paren;
3423             regnode *start;
3424             regnode *end;
3425
3426             if (OP(scan) != SUSPEND) {
3427             /* set the pointer */
3428                 if (OP(scan) == GOSUB) {
3429                     paren = ARG(scan);
3430                     RExC_recurse[ARG2L(scan)] = scan;
3431                     start = RExC_open_parens[paren-1];
3432                     end   = RExC_close_parens[paren-1];
3433                 } else {
3434                     paren = 0;
3435                     start = RExC_rxi->program + 1;
3436                     end   = RExC_opend;
3437                 }
3438                 if (!recursed) {
3439                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3440                     SAVEFREEPV(recursed);
3441                 }
3442                 if (!PAREN_TEST(recursed,paren+1)) {
3443                     PAREN_SET(recursed,paren+1);
3444                     Newx(newframe,1,scan_frame);
3445                 } else {
3446                     if (flags & SCF_DO_SUBSTR) {
3447                         SCAN_COMMIT(pRExC_state,data,minlenp);
3448                         data->longest = &(data->longest_float);
3449                     }
3450                     is_inf = is_inf_internal = 1;
3451                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3452                         cl_anything(pRExC_state, data->start_class);
3453                     flags &= ~SCF_DO_STCLASS;
3454                 }
3455             } else {
3456                 Newx(newframe,1,scan_frame);
3457                 paren = stopparen;
3458                 start = scan+2;
3459                 end = regnext(scan);
3460             }
3461             if (newframe) {
3462                 assert(start);
3463                 assert(end);
3464                 SAVEFREEPV(newframe);
3465                 newframe->next = regnext(scan);
3466                 newframe->last = last;
3467                 newframe->stop = stopparen;
3468                 newframe->prev = frame;
3469
3470                 frame = newframe;
3471                 scan =  start;
3472                 stopparen = paren;
3473                 last = end;
3474
3475                 continue;
3476             }
3477         }
3478         else if (OP(scan) == EXACT) {
3479             I32 l = STR_LEN(scan);
3480             UV uc;
3481             if (UTF) {
3482                 const U8 * const s = (U8*)STRING(scan);
3483                 l = utf8_length(s, s + l);
3484                 uc = utf8_to_uvchr(s, NULL);
3485             } else {
3486                 uc = *((U8*)STRING(scan));
3487             }
3488             min += l;
3489             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3490                 /* The code below prefers earlier match for fixed
3491                    offset, later match for variable offset.  */
3492                 if (data->last_end == -1) { /* Update the start info. */
3493                     data->last_start_min = data->pos_min;
3494                     data->last_start_max = is_inf
3495                         ? I32_MAX : data->pos_min + data->pos_delta;
3496                 }
3497                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3498                 if (UTF)
3499                     SvUTF8_on(data->last_found);
3500                 {
3501                     SV * const sv = data->last_found;
3502                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3503                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3504                     if (mg && mg->mg_len >= 0)
3505                         mg->mg_len += utf8_length((U8*)STRING(scan),
3506                                                   (U8*)STRING(scan)+STR_LEN(scan));
3507                 }
3508                 data->last_end = data->pos_min + l;
3509                 data->pos_min += l; /* As in the first entry. */
3510                 data->flags &= ~SF_BEFORE_EOL;
3511             }
3512             if (flags & SCF_DO_STCLASS_AND) {
3513                 /* Check whether it is compatible with what we know already! */
3514                 int compat = 1;
3515
3516
3517                 /* If compatible, we or it in below.  It is compatible if is
3518                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3519                  * it's for a locale.  Even if there isn't unicode semantics
3520                  * here, at runtime there may be because of matching against a
3521                  * utf8 string, so accept a possible false positive for
3522                  * latin1-range folds */
3523                 if (uc >= 0x100 ||
3524                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3525                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3526                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3527                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3528                     )
3529                 {
3530                     compat = 0;
3531                 }
3532                 ANYOF_CLASS_ZERO(data->start_class);
3533                 ANYOF_BITMAP_ZERO(data->start_class);
3534                 if (compat)
3535                     ANYOF_BITMAP_SET(data->start_class, uc);
3536                 else if (uc >= 0x100) {
3537                     int i;
3538
3539                     /* Some Unicode code points fold to the Latin1 range; as
3540                      * XXX temporary code, instead of figuring out if this is
3541                      * one, just assume it is and set all the start class bits
3542                      * that could be some such above 255 code point's fold
3543                      * which will generate fals positives.  As the code
3544                      * elsewhere that does compute the fold settles down, it
3545                      * can be extracted out and re-used here */
3546                     for (i = 0; i < 256; i++){
3547                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3548                             ANYOF_BITMAP_SET(data->start_class, i);
3549                         }
3550                     }
3551                 }
3552                 data->start_class->flags &= ~ANYOF_EOS;
3553                 if (uc < 0x100)
3554                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3555             }
3556             else if (flags & SCF_DO_STCLASS_OR) {
3557                 /* false positive possible if the class is case-folded */
3558                 if (uc < 0x100)
3559                     ANYOF_BITMAP_SET(data->start_class, uc);
3560                 else
3561                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3562                 data->start_class->flags &= ~ANYOF_EOS;
3563                 cl_and(data->start_class, and_withp);
3564             }
3565             flags &= ~SCF_DO_STCLASS;
3566         }
3567         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3568             I32 l = STR_LEN(scan);
3569             UV uc = *((U8*)STRING(scan));
3570
3571             /* Search for fixed substrings supports EXACT only. */
3572             if (flags & SCF_DO_SUBSTR) {
3573                 assert(data);
3574                 SCAN_COMMIT(pRExC_state, data, minlenp);
3575             }
3576             if (UTF) {
3577                 const U8 * const s = (U8 *)STRING(scan);
3578                 l = utf8_length(s, s + l);
3579                 uc = utf8_to_uvchr(s, NULL);
3580             }
3581             else if (has_exactf_sharp_s) {
3582                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3583             }
3584             min += l - min_subtract;
3585             if (min < 0) {
3586                 min = 0;
3587             }
3588             delta += min_subtract;
3589             if (flags & SCF_DO_SUBSTR) {
3590                 data->pos_min += l - min_subtract;
3591                 if (data->pos_min < 0) {
3592                     data->pos_min = 0;
3593                 }
3594                 data->pos_delta += min_subtract;
3595                 if (min_subtract) {
3596                     data->longest = &(data->longest_float);
3597                 }
3598             }
3599             if (flags & SCF_DO_STCLASS_AND) {
3600                 /* Check whether it is compatible with what we know already! */
3601                 int compat = 1;
3602                 if (uc >= 0x100 ||
3603                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3604                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3605                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3606                 {
3607                     compat = 0;
3608                 }
3609                 ANYOF_CLASS_ZERO(data->start_class);
3610                 ANYOF_BITMAP_ZERO(data->start_class);
3611                 if (compat) {
3612                     ANYOF_BITMAP_SET(data->start_class, uc);
3613                     data->start_class->flags &= ~ANYOF_EOS;
3614                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3615                     if (OP(scan) == EXACTFL) {
3616                         /* XXX This set is probably no longer necessary, and
3617                          * probably wrong as LOCALE now is on in the initial
3618                          * state */
3619                         data->start_class->flags |= ANYOF_LOCALE;
3620                     }
3621                     else {
3622
3623                         /* Also set the other member of the fold pair.  In case
3624                          * that unicode semantics is called for at runtime, use
3625                          * the full latin1 fold.  (Can't do this for locale,
3626                          * because not known until runtime) */
3627                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3628
3629                         /* All other (EXACTFL handled above) folds except under
3630                          * /iaa that include s, S, and sharp_s also may include
3631                          * the others */
3632                         if (OP(scan) != EXACTFA) {
3633                             if (uc == 's' || uc == 'S') {
3634                                 ANYOF_BITMAP_SET(data->start_class,
3635                                                  LATIN_SMALL_LETTER_SHARP_S);
3636                             }
3637                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3638                                 ANYOF_BITMAP_SET(data->start_class, 's');
3639                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3640                             }
3641                         }
3642                     }
3643                 }
3644                 else if (uc >= 0x100) {
3645                     int i;
3646                     for (i = 0; i < 256; i++){
3647                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3648                             ANYOF_BITMAP_SET(data->start_class, i);
3649                         }
3650                     }
3651                 }
3652             }
3653             else if (flags & SCF_DO_STCLASS_OR) {
3654                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3655                     /* false positive possible if the class is case-folded.
3656                        Assume that the locale settings are the same... */
3657                     if (uc < 0x100) {
3658                         ANYOF_BITMAP_SET(data->start_class, uc);
3659                         if (OP(scan) != EXACTFL) {
3660
3661                             /* And set the other member of the fold pair, but
3662                              * can't do that in locale because not known until
3663                              * run-time */
3664                             ANYOF_BITMAP_SET(data->start_class,
3665                                              PL_fold_latin1[uc]);
3666
3667                             /* All folds except under /iaa that include s, S,
3668                              * and sharp_s also may include the others */
3669                             if (OP(scan) != EXACTFA) {
3670                                 if (uc == 's' || uc == 'S') {
3671                                     ANYOF_BITMAP_SET(data->start_class,
3672                                                    LATIN_SMALL_LETTER_SHARP_S);
3673                                 }
3674                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3675                                     ANYOF_BITMAP_SET(data->start_class, 's');
3676                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3677                                 }
3678                             }
3679                         }
3680                     }
3681                     data->start_class->flags &= ~ANYOF_EOS;
3682                 }
3683                 cl_and(data->start_class, and_withp);
3684             }
3685             flags &= ~SCF_DO_STCLASS;
3686         }
3687         else if (REGNODE_VARIES(OP(scan))) {
3688             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3689             I32 f = flags, pos_before = 0;
3690             regnode * const oscan = scan;
3691             struct regnode_charclass_class this_class;
3692             struct regnode_charclass_class *oclass = NULL;
3693             I32 next_is_eval = 0;
3694
3695             switch (PL_regkind[OP(scan)]) {
3696             case WHILEM:                /* End of (?:...)* . */
3697                 scan = NEXTOPER(scan);
3698                 goto finish;
3699             case PLUS:
3700                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3701                     next = NEXTOPER(scan);
3702                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3703                         mincount = 1;
3704                         maxcount = REG_INFTY;
3705                         next = regnext(scan);
3706                         scan = NEXTOPER(scan);
3707                         goto do_curly;
3708                     }
3709                 }
3710                 if (flags & SCF_DO_SUBSTR)
3711                     data->pos_min++;
3712                 min++;
3713                 /* Fall through. */
3714             case STAR:
3715                 if (flags & SCF_DO_STCLASS) {
3716                     mincount = 0;
3717                     maxcount = REG_INFTY;
3718                     next = regnext(scan);
3719                     scan = NEXTOPER(scan);
3720                     goto do_curly;
3721                 }
3722                 is_inf = is_inf_internal = 1;
3723                 scan = regnext(scan);
3724                 if (flags & SCF_DO_SUBSTR) {
3725                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3726                     data->longest = &(data->longest_float);
3727                 }
3728                 goto optimize_curly_tail;
3729             case CURLY:
3730                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3731                     && (scan->flags == stopparen))
3732                 {
3733                     mincount = 1;
3734                     maxcount = 1;
3735                 } else {
3736                     mincount = ARG1(scan);
3737                     maxcount = ARG2(scan);
3738                 }
3739                 next = regnext(scan);
3740                 if (OP(scan) == CURLYX) {
3741                     I32 lp = (data ? *(data->last_closep) : 0);
3742                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3743                 }
3744                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3745                 next_is_eval = (OP(scan) == EVAL);
3746               do_curly:
3747                 if (flags & SCF_DO_SUBSTR) {
3748                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3749                     pos_before = data->pos_min;
3750                 }
3751                 if (data) {
3752                     fl = data->flags;
3753                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3754                     if (is_inf)
3755                         data->flags |= SF_IS_INF;
3756                 }
3757                 if (flags & SCF_DO_STCLASS) {
3758                     cl_init(pRExC_state, &this_class);
3759                     oclass = data->start_class;
3760                     data->start_class = &this_class;
3761                     f |= SCF_DO_STCLASS_AND;
3762                     f &= ~SCF_DO_STCLASS_OR;
3763                 }
3764                 /* Exclude from super-linear cache processing any {n,m}
3765                    regops for which the combination of input pos and regex
3766                    pos is not enough information to determine if a match
3767                    will be possible.
3768
3769                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3770                    regex pos at the \s*, the prospects for a match depend not
3771                    only on the input position but also on how many (bar\s*)
3772                    repeats into the {4,8} we are. */
3773                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3774                     f &= ~SCF_WHILEM_VISITED_POS;
3775
3776                 /* This will finish on WHILEM, setting scan, or on NULL: */
3777                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3778                                       last, data, stopparen, recursed, NULL,
3779                                       (mincount == 0
3780                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3781
3782                 if (flags & SCF_DO_STCLASS)
3783                     data->start_class = oclass;
3784                 if (mincount == 0 || minnext == 0) {
3785                     if (flags & SCF_DO_STCLASS_OR) {
3786                         cl_or(pRExC_state, data->start_class, &this_class);
3787                     }
3788                     else if (flags & SCF_DO_STCLASS_AND) {
3789                         /* Switch to OR mode: cache the old value of
3790                          * data->start_class */
3791                         INIT_AND_WITHP;
3792                         StructCopy(data->start_class, and_withp,
3793                                    struct regnode_charclass_class);
3794                         flags &= ~SCF_DO_STCLASS_AND;
3795                         StructCopy(&this_class, data->start_class,
3796                                    struct regnode_charclass_class);
3797                         flags |= SCF_DO_STCLASS_OR;
3798                         data->start_class->flags |= ANYOF_EOS;
3799                     }
3800                 } else {                /* Non-zero len */
3801                     if (flags & SCF_DO_STCLASS_OR) {
3802                         cl_or(pRExC_state, data->start_class, &this_class);
3803                         cl_and(data->start_class, and_withp);
3804                     }
3805                     else if (flags & SCF_DO_STCLASS_AND)
3806                         cl_and(data->start_class, &this_class);
3807                     flags &= ~SCF_DO_STCLASS;
3808                 }
3809                 if (!scan)              /* It was not CURLYX, but CURLY. */
3810                     scan = next;
3811                 if ( /* ? quantifier ok, except for (?{ ... }) */
3812                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3813                     && (minnext == 0) && (deltanext == 0)
3814                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3815                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3816                 {
3817                     ckWARNreg(RExC_parse,
3818                               "Quantifier unexpected on zero-length expression");
3819                 }
3820
3821                 min += minnext * mincount;
3822                 is_inf_internal |= ((maxcount == REG_INFTY
3823                                      && (minnext + deltanext) > 0)
3824                                     || deltanext == I32_MAX);
3825                 is_inf |= is_inf_internal;
3826                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3827
3828                 /* Try powerful optimization CURLYX => CURLYN. */
3829                 if (  OP(oscan) == CURLYX && data
3830                       && data->flags & SF_IN_PAR
3831                       && !(data->flags & SF_HAS_EVAL)
3832                       && !deltanext && minnext == 1 ) {
3833                     /* Try to optimize to CURLYN.  */
3834                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3835                     regnode * const nxt1 = nxt;
3836 #ifdef DEBUGGING
3837                     regnode *nxt2;
3838 #endif
3839
3840                     /* Skip open. */
3841                     nxt = regnext(nxt);
3842                     if (!REGNODE_SIMPLE(OP(nxt))
3843                         && !(PL_regkind[OP(nxt)] == EXACT
3844                              && STR_LEN(nxt) == 1))
3845                         goto nogo;
3846 #ifdef DEBUGGING
3847                     nxt2 = nxt;
3848 #endif
3849                     nxt = regnext(nxt);
3850                     if (OP(nxt) != CLOSE)
3851                         goto nogo;
3852                     if (RExC_open_parens) {
3853                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3854                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3855                     }
3856                     /* Now we know that nxt2 is the only contents: */
3857                     oscan->flags = (U8)ARG(nxt);
3858                     OP(oscan) = CURLYN;
3859                     OP(nxt1) = NOTHING; /* was OPEN. */
3860
3861 #ifdef DEBUGGING
3862                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3863                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3864                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3865                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3866                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3867                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3868 #endif
3869                 }
3870               nogo:
3871
3872                 /* Try optimization CURLYX => CURLYM. */
3873                 if (  OP(oscan) == CURLYX && data
3874                       && !(data->flags & SF_HAS_PAR)
3875                       && !(data->flags & SF_HAS_EVAL)
3876                       && !deltanext     /* atom is fixed width */
3877                       && minnext != 0   /* CURLYM can't handle zero width */
3878                 ) {
3879                     /* XXXX How to optimize if data == 0? */
3880                     /* Optimize to a simpler form.  */
3881                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3882                     regnode *nxt2;
3883
3884                     OP(oscan) = CURLYM;
3885                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3886                             && (OP(nxt2) != WHILEM))
3887                         nxt = nxt2;
3888                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3889                     /* Need to optimize away parenths. */
3890                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3891                         /* Set the parenth number.  */
3892                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3893
3894                         oscan->flags = (U8)ARG(nxt);
3895                         if (RExC_open_parens) {
3896                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3897                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3898                         }
3899                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3900                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3901
3902 #ifdef DEBUGGING
3903                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3904                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3905                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3906                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3907 #endif
3908 #if 0
3909                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3910                             regnode *nnxt = regnext(nxt1);
3911                             if (nnxt == nxt) {
3912                                 if (reg_off_by_arg[OP(nxt1)])
3913                                     ARG_SET(nxt1, nxt2 - nxt1);
3914                                 else if (nxt2 - nxt1 < U16_MAX)
3915                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3916                                 else
3917                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3918                             }
3919                             nxt1 = nnxt;
3920                         }
3921 #endif
3922                         /* Optimize again: */
3923                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3924                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3925                     }
3926                     else
3927                         oscan->flags = 0;
3928                 }
3929                 else if ((OP(oscan) == CURLYX)
3930                          && (flags & SCF_WHILEM_VISITED_POS)
3931                          /* See the comment on a similar expression above.
3932                             However, this time it's not a subexpression
3933                             we care about, but the expression itself. */
3934                          && (maxcount == REG_INFTY)
3935                          && data && ++data->whilem_c < 16) {
3936                     /* This stays as CURLYX, we can put the count/of pair. */
3937                     /* Find WHILEM (as in regexec.c) */
3938                     regnode *nxt = oscan + NEXT_OFF(oscan);
3939
3940                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3941                         nxt += ARG(nxt);
3942                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3943                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3944                 }
3945                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3946                     pars++;
3947                 if (flags & SCF_DO_SUBSTR) {
3948                     SV *last_str = NULL;
3949                     int counted = mincount != 0;
3950
3951                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3952 #if defined(SPARC64_GCC_WORKAROUND)
3953                         I32 b = 0;
3954                         STRLEN l = 0;
3955                         const char *s = NULL;
3956                         I32 old = 0;
3957
3958                         if (pos_before >= data->last_start_min)
3959                             b = pos_before;
3960                         else
3961                             b = data->last_start_min;
3962
3963                         l = 0;
3964                         s = SvPV_const(data->last_found, l);
3965                         old = b - data->last_start_min;
3966
3967 #else
3968                         I32 b = pos_before >= data->last_start_min
3969                             ? pos_before : data->last_start_min;
3970                         STRLEN l;
3971                         const char * const s = SvPV_const(data->last_found, l);
3972                         I32 old = b - data->last_start_min;
3973 #endif
3974
3975                         if (UTF)
3976                             old = utf8_hop((U8*)s, old) - (U8*)s;
3977                         l -= old;
3978                         /* Get the added string: */
3979                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3980                         if (deltanext == 0 && pos_before == b) {
3981                             /* What was added is a constant string */
3982                             if (mincount > 1) {
3983                                 SvGROW(last_str, (mincount * l) + 1);
3984                                 repeatcpy(SvPVX(last_str) + l,
3985                                           SvPVX_const(last_str), l, mincount - 1);
3986                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3987                                 /* Add additional parts. */
3988                                 SvCUR_set(data->last_found,
3989                                           SvCUR(data->last_found) - l);
3990                                 sv_catsv(data->last_found, last_str);
3991                                 {
3992                                     SV * sv = data->last_found;
3993                                     MAGIC *mg =
3994                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3995                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3996                                     if (mg && mg->mg_len >= 0)
3997                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3998                                 }
3999                                 data->last_end += l * (mincount - 1);
4000                             }
4001                         } else {
4002                             /* start offset must point into the last copy */
4003                             data->last_start_min += minnext * (mincount - 1);
4004                             data->last_start_max += is_inf ? I32_MAX
4005                                 : (maxcount - 1) * (minnext + data->pos_delta);
4006                         }
4007                     }
4008                     /* It is counted once already... */
4009                     data->pos_min += minnext * (mincount - counted);
4010                     data->pos_delta += - counted * deltanext +
4011                         (minnext + deltanext) * maxcount - minnext * mincount;
4012                     if (mincount != maxcount) {
4013                          /* Cannot extend fixed substrings found inside
4014                             the group.  */
4015                         SCAN_COMMIT(pRExC_state,data,minlenp);
4016                         if (mincount && last_str) {
4017                             SV * const sv = data->last_found;
4018                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4019                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4020
4021                             if (mg)
4022                                 mg->mg_len = -1;
4023                             sv_setsv(sv, last_str);
4024                             data->last_end = data->pos_min;
4025                             data->last_start_min =
4026                                 data->pos_min - CHR_SVLEN(last_str);
4027                             data->last_start_max = is_inf
4028                                 ? I32_MAX
4029                                 : data->pos_min + data->pos_delta
4030                                 - CHR_SVLEN(last_str);
4031                         }
4032                         data->longest = &(data->longest_float);
4033                     }
4034                     SvREFCNT_dec(last_str);
4035                 }
4036                 if (data && (fl & SF_HAS_EVAL))
4037                     data->flags |= SF_HAS_EVAL;
4038               optimize_curly_tail:
4039                 if (OP(oscan) != CURLYX) {
4040                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4041                            && NEXT_OFF(next))
4042                         NEXT_OFF(oscan) += NEXT_OFF(next);
4043                 }
4044                 continue;
4045             default:                    /* REF, ANYOFV, and CLUMP only? */
4046                 if (flags & SCF_DO_SUBSTR) {
4047                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4048                     data->longest = &(data->longest_float);
4049                 }
4050                 is_inf = is_inf_internal = 1;
4051                 if (flags & SCF_DO_STCLASS_OR)
4052                     cl_anything(pRExC_state, data->start_class);
4053                 flags &= ~SCF_DO_STCLASS;
4054                 break;
4055             }
4056         }
4057         else if (OP(scan) == LNBREAK) {
4058             if (flags & SCF_DO_STCLASS) {
4059                 int value = 0;
4060                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4061                 if (flags & SCF_DO_STCLASS_AND) {
4062                     for (value = 0; value < 256; value++)
4063                         if (!is_VERTWS_cp(value))
4064                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4065                 }
4066                 else {
4067                     for (value = 0; value < 256; value++)
4068                         if (is_VERTWS_cp(value))
4069                             ANYOF_BITMAP_SET(data->start_class, value);
4070                 }
4071                 if (flags & SCF_DO_STCLASS_OR)
4072                     cl_and(data->start_class, and_withp);
4073                 flags &= ~SCF_DO_STCLASS;
4074             }
4075             min += 1;
4076             delta += 1;
4077             if (flags & SCF_DO_SUBSTR) {
4078                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4079                 data->pos_min += 1;
4080                 data->pos_delta += 1;
4081                 data->longest = &(data->longest_float);
4082             }
4083         }
4084         else if (REGNODE_SIMPLE(OP(scan))) {
4085             int value = 0;
4086
4087             if (flags & SCF_DO_SUBSTR) {
4088                 SCAN_COMMIT(pRExC_state,data,minlenp);
4089                 data->pos_min++;
4090             }
4091             min++;
4092             if (flags & SCF_DO_STCLASS) {
4093                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4094
4095                 /* Some of the logic below assumes that switching
4096                    locale on will only add false positives. */
4097                 switch (PL_regkind[OP(scan)]) {
4098                 case SANY:
4099                 default:
4100                   do_default:
4101                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4102                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4103                         cl_anything(pRExC_state, data->start_class);
4104                     break;
4105                 case REG_ANY:
4106                     if (OP(scan) == SANY)
4107                         goto do_default;
4108                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4109                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4110                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4111                         cl_anything(pRExC_state, data->start_class);
4112                     }
4113                     if (flags & SCF_DO_STCLASS_AND || !value)
4114                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4115                     break;
4116                 case ANYOF:
4117                     if (flags & SCF_DO_STCLASS_AND)
4118                         cl_and(data->start_class,
4119                                (struct regnode_charclass_class*)scan);
4120                     else
4121                         cl_or(pRExC_state, data->start_class,
4122                               (struct regnode_charclass_class*)scan);
4123                     break;
4124                 case ALNUM:
4125                     if (flags & SCF_DO_STCLASS_AND) {
4126                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4127                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4128                             if (OP(scan) == ALNUMU) {
4129                                 for (value = 0; value < 256; value++) {
4130                                     if (!isWORDCHAR_L1(value)) {
4131                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4132                                     }
4133                                 }
4134                             } else {
4135                                 for (value = 0; value < 256; value++) {
4136                                     if (!isALNUM(value)) {
4137                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4138                                     }
4139                                 }
4140                             }
4141                         }
4142                     }
4143                     else {
4144                         if (data->start_class->flags & ANYOF_LOCALE)
4145                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4146
4147                         /* Even if under locale, set the bits for non-locale
4148                          * in case it isn't a true locale-node.  This will
4149                          * create false positives if it truly is locale */
4150                         if (OP(scan) == ALNUMU) {
4151                             for (value = 0; value < 256; value++) {
4152                                 if (isWORDCHAR_L1(value)) {
4153                                     ANYOF_BITMAP_SET(data->start_class, value);
4154                                 }
4155                             }
4156                         } else {
4157                             for (value = 0; value < 256; value++) {
4158                                 if (isALNUM(value)) {
4159                                     ANYOF_BITMAP_SET(data->start_class, value);
4160                                 }
4161                             }
4162                         }
4163                     }
4164                     break;
4165                 case NALNUM:
4166                     if (flags & SCF_DO_STCLASS_AND) {
4167                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4168                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4169                             if (OP(scan) == NALNUMU) {
4170                                 for (value = 0; value < 256; value++) {
4171                                     if (isWORDCHAR_L1(value)) {
4172                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4173                                     }
4174                                 }
4175                             } else {
4176                                 for (value = 0; value < 256; value++) {
4177                                     if (isALNUM(value)) {
4178                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4179                                     }
4180                                 }
4181                             }
4182                         }
4183                     }
4184                     else {
4185                         if (data->start_class->flags & ANYOF_LOCALE)
4186                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4187
4188                         /* Even if under locale, set the bits for non-locale in
4189                          * case it isn't a true locale-node.  This will create
4190                          * false positives if it truly is locale */
4191                         if (OP(scan) == NALNUMU) {
4192                             for (value = 0; value < 256; value++) {
4193                                 if (! isWORDCHAR_L1(value)) {
4194                                     ANYOF_BITMAP_SET(data->start_class, value);
4195                                 }
4196                             }
4197                         } else {
4198                             for (value = 0; value < 256; value++) {
4199                                 if (! isALNUM(value)) {
4200                                     ANYOF_BITMAP_SET(data->start_class, value);
4201                                 }
4202                             }
4203                         }
4204                     }
4205                     break;
4206                 case SPACE:
4207                     if (flags & SCF_DO_STCLASS_AND) {
4208                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4209                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4210                             if (OP(scan) == SPACEU) {
4211                                 for (value = 0; value < 256; value++) {
4212                                     if (!isSPACE_L1(value)) {
4213                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4214                                     }
4215                                 }
4216                             } else {
4217                                 for (value = 0; value < 256; value++) {
4218                                     if (!isSPACE(value)) {
4219                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4220                                     }
4221                                 }
4222                             }
4223                         }
4224                     }
4225                     else {
4226                         if (data->start_class->flags & ANYOF_LOCALE) {
4227                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4228                         }
4229                         if (OP(scan) == SPACEU) {
4230                             for (value = 0; value < 256; value++) {
4231                                 if (isSPACE_L1(value)) {
4232                                     ANYOF_BITMAP_SET(data->start_class, value);
4233                                 }
4234                             }
4235                         } else {
4236                             for (value = 0; value < 256; value++) {
4237                                 if (isSPACE(value)) {
4238                                     ANYOF_BITMAP_SET(data->start_class, value);
4239                                 }
4240                             }
4241                         }
4242                     }
4243                     break;
4244                 case NSPACE:
4245                     if (flags & SCF_DO_STCLASS_AND) {
4246                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4247                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4248                             if (OP(scan) == NSPACEU) {
4249                                 for (value = 0; value < 256; value++) {
4250                                     if (isSPACE_L1(value)) {
4251                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4252                                     }
4253                                 }
4254                             } else {
4255                                 for (value = 0; value < 256; value++) {
4256                                     if (isSPACE(value)) {
4257                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4258                                     }
4259                                 }
4260                             }
4261                         }
4262                     }
4263                     else {
4264                         if (data->start_class->flags & ANYOF_LOCALE)
4265                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4266                         if (OP(scan) == NSPACEU) {
4267                             for (value = 0; value < 256; value++) {
4268                                 if (!isSPACE_L1(value)) {
4269                                     ANYOF_BITMAP_SET(data->start_class, value);
4270                                 }
4271                             }
4272                         }
4273                         else {
4274                             for (value = 0; value < 256; value++) {
4275                                 if (!isSPACE(value)) {
4276                                     ANYOF_BITMAP_SET(data->start_class, value);
4277                                 }
4278                             }
4279                         }
4280                     }
4281                     break;
4282                 case DIGIT:
4283                     if (flags & SCF_DO_STCLASS_AND) {
4284                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4285                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4286                             for (value = 0; value < 256; value++)
4287                                 if (!isDIGIT(value))
4288                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4289                         }
4290                     }
4291                     else {
4292                         if (data->start_class->flags & ANYOF_LOCALE)
4293                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4294                         for (value = 0; value < 256; value++)
4295                             if (isDIGIT(value))
4296                                 ANYOF_BITMAP_SET(data->start_class, value);
4297                     }
4298                     break;
4299                 case NDIGIT:
4300                     if (flags & SCF_DO_STCLASS_AND) {
4301                         if (!(data->start_class->flags & ANYOF_LOCALE))
4302                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4303                         for (value = 0; value < 256; value++)
4304                             if (isDIGIT(value))
4305                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4306                     }
4307                     else {
4308                         if (data->start_class->flags & ANYOF_LOCALE)
4309                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4310                         for (value = 0; value < 256; value++)
4311                             if (!isDIGIT(value))
4312                                 ANYOF_BITMAP_SET(data->start_class, value);
4313                     }
4314                     break;
4315                 CASE_SYNST_FNC(VERTWS);
4316                 CASE_SYNST_FNC(HORIZWS);
4317
4318                 }
4319                 if (flags & SCF_DO_STCLASS_OR)
4320                     cl_and(data->start_class, and_withp);
4321                 flags &= ~SCF_DO_STCLASS;
4322             }
4323         }
4324         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4325             data->flags |= (OP(scan) == MEOL
4326                             ? SF_BEFORE_MEOL
4327                             : SF_BEFORE_SEOL);
4328         }
4329         else if (  PL_regkind[OP(scan)] == BRANCHJ
4330                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4331                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4332                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4333             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4334                 || OP(scan) == UNLESSM )
4335             {
4336                 /* Negative Lookahead/lookbehind
4337                    In this case we can't do fixed string optimisation.
4338                 */
4339
4340                 I32 deltanext, minnext, fake = 0;
4341                 regnode *nscan;
4342                 struct regnode_charclass_class intrnl;
4343                 int f = 0;
4344
4345                 data_fake.flags = 0;
4346                 if (data) {
4347                     data_fake.whilem_c = data->whilem_c;
4348                     data_fake.last_closep = data->last_closep;
4349                 }
4350                 else
4351                     data_fake.last_closep = &fake;
4352                 data_fake.pos_delta = delta;
4353                 if ( flags & SCF_DO_STCLASS && !scan->flags
4354                      && OP(scan) == IFMATCH ) { /* Lookahead */
4355                     cl_init(pRExC_state, &intrnl);
4356                     data_fake.start_class = &intrnl;
4357                     f |= SCF_DO_STCLASS_AND;
4358                 }
4359                 if (flags & SCF_WHILEM_VISITED_POS)
4360                     f |= SCF_WHILEM_VISITED_POS;
4361                 next = regnext(scan);
4362                 nscan = NEXTOPER(NEXTOPER(scan));
4363                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4364                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4365                 if (scan->flags) {
4366                     if (deltanext) {
4367                         FAIL("Variable length lookbehind not implemented");
4368                     }
4369                     else if (minnext > (I32)U8_MAX) {
4370                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4371                     }
4372                     scan->flags = (U8)minnext;
4373                 }
4374                 if (data) {
4375                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4376                         pars++;
4377                     if (data_fake.flags & SF_HAS_EVAL)
4378                         data->flags |= SF_HAS_EVAL;
4379                     data->whilem_c = data_fake.whilem_c;
4380                 }
4381                 if (f & SCF_DO_STCLASS_AND) {
4382                     if (flags & SCF_DO_STCLASS_OR) {
4383                         /* OR before, AND after: ideally we would recurse with
4384                          * data_fake to get the AND applied by study of the
4385                          * remainder of the pattern, and then derecurse;
4386                          * *** HACK *** for now just treat as "no information".
4387                          * See [perl #56690].
4388                          */
4389                         cl_init(pRExC_state, data->start_class);
4390                     }  else {
4391                         /* AND before and after: combine and continue */
4392                         const int was = (data->start_class->flags & ANYOF_EOS);
4393
4394                         cl_and(data->start_class, &intrnl);
4395                         if (was)
4396                             data->start_class->flags |= ANYOF_EOS;
4397                     }
4398                 }
4399             }
4400 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4401             else {
4402                 /* Positive Lookahead/lookbehind
4403                    In this case we can do fixed string optimisation,
4404                    but we must be careful about it. Note in the case of
4405                    lookbehind the positions will be offset by the minimum
4406                    length of the pattern, something we won't know about
4407                    until after the recurse.
4408                 */
4409                 I32 deltanext, fake = 0;
4410                 regnode *nscan;
4411                 struct regnode_charclass_class intrnl;
4412                 int f = 0;
4413                 /* We use SAVEFREEPV so that when the full compile
4414                     is finished perl will clean up the allocated
4415                     minlens when it's all done. This way we don't
4416                     have to worry about freeing them when we know
4417                     they wont be used, which would be a pain.
4418                  */
4419                 I32 *minnextp;
4420                 Newx( minnextp, 1, I32 );
4421                 SAVEFREEPV(minnextp);
4422
4423                 if (data) {
4424                     StructCopy(data, &data_fake, scan_data_t);
4425                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4426                         f |= SCF_DO_SUBSTR;
4427                         if (scan->flags)
4428                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4429                         data_fake.last_found=newSVsv(data->last_found);
4430                     }
4431                 }
4432                 else
4433                     data_fake.last_closep = &fake;
4434                 data_fake.flags = 0;
4435                 data_fake.pos_delta = delta;
4436                 if (is_inf)
4437                     data_fake.flags |= SF_IS_INF;
4438                 if ( flags & SCF_DO_STCLASS && !scan->flags
4439                      && OP(scan) == IFMATCH ) { /* Lookahead */
4440                     cl_init(pRExC_state, &intrnl);
4441                     data_fake.start_class = &intrnl;
4442                     f |= SCF_DO_STCLASS_AND;
4443                 }
4444                 if (flags & SCF_WHILEM_VISITED_POS)
4445                     f |= SCF_WHILEM_VISITED_POS;
4446                 next = regnext(scan);
4447                 nscan = NEXTOPER(NEXTOPER(scan));
4448
4449                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4450                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4451                 if (scan->flags) {
4452                     if (deltanext) {
4453                         FAIL("Variable length lookbehind not implemented");
4454                     }
4455                     else if (*minnextp > (I32)U8_MAX) {
4456                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4457                     }
4458                     scan->flags = (U8)*minnextp;
4459                 }
4460
4461                 *minnextp += min;
4462
4463                 if (f & SCF_DO_STCLASS_AND) {
4464                     const int was = (data->start_class->flags & ANYOF_EOS);
4465
4466                     cl_and(data->start_class, &intrnl);
4467                     if (was)
4468                         data->start_class->flags |= ANYOF_EOS;
4469                 }
4470                 if (data) {
4471                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4472                         pars++;
4473                     if (data_fake.flags & SF_HAS_EVAL)
4474                         data->flags |= SF_HAS_EVAL;
4475                     data->whilem_c = data_fake.whilem_c;
4476                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4477                         if (RExC_rx->minlen<*minnextp)
4478                             RExC_rx->minlen=*minnextp;
4479                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4480                         SvREFCNT_dec(data_fake.last_found);
4481
4482                         if ( data_fake.minlen_fixed != minlenp )
4483                         {
4484                             data->offset_fixed= data_fake.offset_fixed;
4485                             data->minlen_fixed= data_fake.minlen_fixed;
4486                             data->lookbehind_fixed+= scan->flags;
4487                         }
4488                         if ( data_fake.minlen_float != minlenp )
4489                         {
4490                             data->minlen_float= data_fake.minlen_float;
4491                             data->offset_float_min=data_fake.offset_float_min;
4492                             data->offset_float_max=data_fake.offset_float_max;
4493                             data->lookbehind_float+= scan->flags;
4494                         }
4495                     }
4496                 }
4497
4498
4499             }
4500 #endif
4501         }
4502         else if (OP(scan) == OPEN) {
4503             if (stopparen != (I32)ARG(scan))
4504                 pars++;
4505         }
4506         else if (OP(scan) == CLOSE) {
4507             if (stopparen == (I32)ARG(scan)) {
4508                 break;
4509             }
4510             if ((I32)ARG(scan) == is_par) {
4511                 next = regnext(scan);
4512
4513                 if ( next && (OP(next) != WHILEM) && next < last)
4514                     is_par = 0;         /* Disable optimization */
4515             }
4516             if (data)
4517                 *(data->last_closep) = ARG(scan);
4518         }
4519         else if (OP(scan) == EVAL) {
4520                 if (data)
4521                     data->flags |= SF_HAS_EVAL;
4522         }
4523         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4524             if (flags & SCF_DO_SUBSTR) {
4525                 SCAN_COMMIT(pRExC_state,data,minlenp);
4526                 flags &= ~SCF_DO_SUBSTR;
4527             }
4528             if (data && OP(scan)==ACCEPT) {
4529                 data->flags |= SCF_SEEN_ACCEPT;
4530                 if (stopmin > min)
4531                     stopmin = min;
4532             }
4533         }
4534         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4535         {
4536                 if (flags & SCF_DO_SUBSTR) {
4537                     SCAN_COMMIT(pRExC_state,data,minlenp);
4538                     data->longest = &(data->longest_float);
4539                 }
4540                 is_inf = is_inf_internal = 1;
4541                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4542                     cl_anything(pRExC_state, data->start_class);
4543                 flags &= ~SCF_DO_STCLASS;
4544         }
4545         else if (OP(scan) == GPOS) {
4546             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4547                 !(delta || is_inf || (data && data->pos_delta)))
4548             {
4549                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4550                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4551                 if (RExC_rx->gofs < (U32)min)
4552                     RExC_rx->gofs = min;
4553             } else {
4554                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4555                 RExC_rx->gofs = 0;
4556             }
4557         }
4558 #ifdef TRIE_STUDY_OPT
4559 #ifdef FULL_TRIE_STUDY
4560         else if (PL_regkind[OP(scan)] == TRIE) {
4561             /* NOTE - There is similar code to this block above for handling
4562                BRANCH nodes on the initial study.  If you change stuff here
4563                check there too. */
4564             regnode *trie_node= scan;
4565             regnode *tail= regnext(scan);
4566             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4567             I32 max1 = 0, min1 = I32_MAX;
4568             struct regnode_charclass_class accum;
4569
4570             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4571                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4572             if (flags & SCF_DO_STCLASS)
4573                 cl_init_zero(pRExC_state, &accum);
4574
4575             if (!trie->jump) {
4576                 min1= trie->minlen;
4577                 max1= trie->maxlen;
4578             } else {
4579                 const regnode *nextbranch= NULL;
4580                 U32 word;
4581
4582                 for ( word=1 ; word <= trie->wordcount ; word++)
4583                 {
4584                     I32 deltanext=0, minnext=0, f = 0, fake;
4585                     struct regnode_charclass_class this_class;
4586
4587                     data_fake.flags = 0;
4588                     if (data) {
4589                         data_fake.whilem_c = data->whilem_c;
4590                         data_fake.last_closep = data->last_closep;
4591                     }
4592                     else
4593                         data_fake.last_closep = &fake;
4594                     data_fake.pos_delta = delta;
4595                     if (flags & SCF_DO_STCLASS) {
4596                         cl_init(pRExC_state, &this_class);
4597                         data_fake.start_class = &this_class;
4598                         f = SCF_DO_STCLASS_AND;
4599                     }
4600                     if (flags & SCF_WHILEM_VISITED_POS)
4601                         f |= SCF_WHILEM_VISITED_POS;
4602
4603                     if (trie->jump[word]) {
4604                         if (!nextbranch)
4605                             nextbranch = trie_node + trie->jump[0];
4606                         scan= trie_node + trie->jump[word];
4607                         /* We go from the jump point to the branch that follows
4608                            it. Note this means we need the vestigal unused branches
4609                            even though they arent otherwise used.
4610                          */
4611                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4612                             &deltanext, (regnode *)nextbranch, &data_fake,
4613                             stopparen, recursed, NULL, f,depth+1);
4614                     }
4615                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4616                         nextbranch= regnext((regnode*)nextbranch);
4617
4618                     if (min1 > (I32)(minnext + trie->minlen))
4619                         min1 = minnext + trie->minlen;
4620                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4621                         max1 = minnext + deltanext + trie->maxlen;
4622                     if (deltanext == I32_MAX)
4623                         is_inf = is_inf_internal = 1;
4624
4625                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4626                         pars++;
4627                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4628                         if ( stopmin > min + min1)
4629                             stopmin = min + min1;
4630                         flags &= ~SCF_DO_SUBSTR;
4631                         if (data)
4632                             data->flags |= SCF_SEEN_ACCEPT;
4633                     }
4634                     if (data) {
4635                         if (data_fake.flags & SF_HAS_EVAL)
4636                             data->flags |= SF_HAS_EVAL;
4637                         data->whilem_c = data_fake.whilem_c;
4638                     }
4639                     if (flags & SCF_DO_STCLASS)
4640                         cl_or(pRExC_state, &accum, &this_class);
4641                 }
4642             }
4643             if (flags & SCF_DO_SUBSTR) {
4644                 data->pos_min += min1;
4645                 data->pos_delta += max1 - min1;
4646                 if (max1 != min1 || is_inf)
4647                     data->longest = &(data->longest_float);
4648             }
4649             min += min1;
4650             delta += max1 - min1;
4651             if (flags & SCF_DO_STCLASS_OR) {
4652                 cl_or(pRExC_state, data->start_class, &accum);
4653                 if (min1) {
4654                     cl_and(data->start_class, and_withp);
4655                     flags &= ~SCF_DO_STCLASS;
4656                 }
4657             }
4658             else if (flags & SCF_DO_STCLASS_AND) {
4659                 if (min1) {
4660                     cl_and(data->start_class, &accum);
4661                     flags &= ~SCF_DO_STCLASS;
4662                 }
4663                 else {
4664                     /* Switch to OR mode: cache the old value of
4665                      * data->start_class */
4666                     INIT_AND_WITHP;
4667                     StructCopy(data->start_class, and_withp,
4668                                struct regnode_charclass_class);
4669                     flags &= ~SCF_DO_STCLASS_AND;
4670                     StructCopy(&accum, data->start_class,
4671                                struct regnode_charclass_class);
4672                     flags |= SCF_DO_STCLASS_OR;
4673                     data->start_class->flags |= ANYOF_EOS;
4674                 }
4675             }
4676             scan= tail;
4677             continue;
4678         }
4679 #else
4680         else if (PL_regkind[OP(scan)] == TRIE) {
4681             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4682             U8*bang=NULL;
4683
4684             min += trie->minlen;
4685             delta += (trie->maxlen - trie->minlen);
4686             flags &= ~SCF_DO_STCLASS; /* xxx */
4687             if (flags & SCF_DO_SUBSTR) {
4688                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4689                 data->pos_min += trie->minlen;
4690                 data->pos_delta += (trie->maxlen - trie->minlen);
4691                 if (trie->maxlen != trie->minlen)
4692                     data->longest = &(data->longest_float);
4693             }
4694             if (trie->jump) /* no more substrings -- for now /grr*/
4695                 flags &= ~SCF_DO_SUBSTR;
4696         }
4697 #endif /* old or new */
4698 #endif /* TRIE_STUDY_OPT */
4699
4700         /* Else: zero-length, ignore. */
4701         scan = regnext(scan);
4702     }
4703     if (frame) {
4704         last = frame->last;
4705         scan = frame->next;
4706         stopparen = frame->stop;
4707         frame = frame->prev;
4708         goto fake_study_recurse;
4709     }
4710
4711   finish:
4712     assert(!frame);
4713     DEBUG_STUDYDATA("pre-fin:",data,depth);
4714
4715     *scanp = scan;
4716     *deltap = is_inf_internal ? I32_MAX : delta;
4717     if (flags & SCF_DO_SUBSTR && is_inf)
4718         data->pos_delta = I32_MAX - data->pos_min;
4719     if (is_par > (I32)U8_MAX)
4720         is_par = 0;
4721     if (is_par && pars==1 && data) {
4722         data->flags |= SF_IN_PAR;
4723         data->flags &= ~SF_HAS_PAR;
4724     }
4725     else if (pars && data) {
4726         data->flags |= SF_HAS_PAR;
4727         data->flags &= ~SF_IN_PAR;
4728     }
4729     if (flags & SCF_DO_STCLASS_OR)
4730         cl_and(data->start_class, and_withp);
4731     if (flags & SCF_TRIE_RESTUDY)
4732         data->flags |=  SCF_TRIE_RESTUDY;
4733
4734     DEBUG_STUDYDATA("post-fin:",data,depth);
4735
4736     return min < stopmin ? min : stopmin;
4737 }
4738
4739 STATIC U32
4740 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4741 {
4742     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4743
4744     PERL_ARGS_ASSERT_ADD_DATA;
4745
4746     Renewc(RExC_rxi->data,
4747            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4748            char, struct reg_data);
4749     if(count)
4750         Renew(RExC_rxi->data->what, count + n, U8);
4751     else
4752         Newx(RExC_rxi->data->what, n, U8);
4753     RExC_rxi->data->count = count + n;
4754     Copy(s, RExC_rxi->data->what + count, n, U8);
4755     return count;
4756 }
4757
4758 /*XXX: todo make this not included in a non debugging perl */
4759 #ifndef PERL_IN_XSUB_RE
4760 void
4761 Perl_reginitcolors(pTHX)
4762 {
4763     dVAR;
4764     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4765     if (s) {
4766         char *t = savepv(s);
4767         int i = 0;
4768         PL_colors[0] = t;
4769         while (++i < 6) {
4770             t = strchr(t, '\t');
4771             if (t) {
4772                 *t = '\0';
4773                 PL_colors[i] = ++t;
4774             }
4775             else
4776                 PL_colors[i] = t = (char *)"";
4777         }
4778     } else {
4779         int i = 0;
4780         while (i < 6)
4781             PL_colors[i++] = (char *)"";
4782     }
4783     PL_colorset = 1;
4784 }
4785 #endif
4786
4787
4788 #ifdef TRIE_STUDY_OPT
4789 #define CHECK_RESTUDY_GOTO                                  \
4790         if (                                                \
4791               (data.flags & SCF_TRIE_RESTUDY)               \
4792               && ! restudied++                              \
4793         )     goto reStudy
4794 #else
4795 #define CHECK_RESTUDY_GOTO
4796 #endif
4797
4798 /*
4799  - pregcomp - compile a regular expression into internal code
4800  *
4801  * We can't allocate space until we know how big the compiled form will be,
4802  * but we can't compile it (and thus know how big it is) until we've got a
4803  * place to put the code.  So we cheat:  we compile it twice, once with code
4804  * generation turned off and size counting turned on, and once "for real".
4805  * This also means that we don't allocate space until we are sure that the
4806  * thing really will compile successfully, and we never have to move the
4807  * code and thus invalidate pointers into it.  (Note that it has to be in
4808  * one piece because free() must be able to free it all.) [NB: not true in perl]
4809  *
4810  * Beware that the optimization-preparation code in here knows about some
4811  * of the structure of the compiled regexp.  [I'll say.]
4812  */
4813
4814
4815
4816 #ifndef PERL_IN_XSUB_RE
4817 #define RE_ENGINE_PTR &PL_core_reg_engine
4818 #else
4819 extern const struct regexp_engine my_reg_engine;
4820 #define RE_ENGINE_PTR &my_reg_engine
4821 #endif
4822
4823 #ifndef PERL_IN_XSUB_RE
4824 REGEXP *
4825 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4826 {
4827     dVAR;
4828     HV * const table = GvHV(PL_hintgv);
4829
4830     PERL_ARGS_ASSERT_PREGCOMP;
4831
4832     /* Dispatch a request to compile a regexp to correct
4833        regexp engine. */
4834     if (table) {
4835         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4836         GET_RE_DEBUG_FLAGS_DECL;
4837         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4838             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4839             DEBUG_COMPILE_r({
4840                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4841                     SvIV(*ptr));
4842             });
4843             return CALLREGCOMP_ENG(eng, pattern, flags);
4844         }
4845     }
4846     return Perl_re_compile(aTHX_ pattern, flags);
4847 }
4848 #endif
4849
4850 REGEXP *
4851 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4852 {
4853     dVAR;
4854     REGEXP *rx;
4855     struct regexp *r;
4856     register regexp_internal *ri;
4857     STRLEN plen;
4858     char* VOL exp;
4859     char* xend;
4860     regnode *scan;
4861     I32 flags;
4862     I32 minlen = 0;
4863     U32 pm_flags;
4864
4865     /* these are all flags - maybe they should be turned
4866      * into a single int with different bit masks */
4867     I32 sawlookahead = 0;
4868     I32 sawplus = 0;
4869     I32 sawopen = 0;
4870     bool used_setjump = FALSE;
4871     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4872
4873     U8 jump_ret = 0;
4874     dJMPENV;
4875     scan_data_t data;
4876     RExC_state_t RExC_state;
4877     RExC_state_t * const pRExC_state = &RExC_state;
4878 #ifdef TRIE_STUDY_OPT
4879     int restudied;
4880     RExC_state_t copyRExC_state;
4881 #endif
4882     GET_RE_DEBUG_FLAGS_DECL;
4883
4884     PERL_ARGS_ASSERT_RE_COMPILE;
4885
4886     DEBUG_r(if (!PL_colorset) reginitcolors());
4887
4888 #ifndef PERL_IN_XSUB_RE
4889     /* Initialize these here instead of as-needed, as is quick and avoids
4890      * having to test them each time otherwise */
4891     if (! PL_AboveLatin1) {
4892         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4893         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4894         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4895
4896         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4897         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4898
4899         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4900         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4901
4902         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4903         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4904
4905         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4906
4907         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4908         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4909
4910         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4911
4912         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4913         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4914
4915         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4916         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4917
4918         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4919         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4920
4921         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4922         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4923
4924         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4925         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4926
4927         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4928         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4929
4930         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4931         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4932
4933         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4934         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4935
4936         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4937
4938         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4939         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4940
4941         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4942         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4943     }
4944 #endif
4945
4946     exp = SvPV(pattern, plen);
4947
4948     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4949         RExC_utf8 = RExC_orig_utf8 = 0;
4950     }
4951     else {
4952         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4953     }
4954     RExC_uni_semantics = 0;
4955     RExC_contains_locale = 0;
4956
4957     /****************** LONG JUMP TARGET HERE***********************/
4958     /* Longjmp back to here if have to switch in midstream to utf8 */
4959     if (! RExC_orig_utf8) {
4960         JMPENV_PUSH(jump_ret);
4961         used_setjump = TRUE;
4962     }
4963
4964     if (jump_ret == 0) {    /* First time through */
4965         xend = exp + plen;
4966
4967         DEBUG_COMPILE_r({
4968             SV *dsv= sv_newmortal();
4969             RE_PV_QUOTED_DECL(s, RExC_utf8,
4970                 dsv, exp, plen, 60);
4971             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4972                            PL_colors[4],PL_colors[5],s);
4973         });
4974     }
4975     else {  /* longjumped back */
4976         STRLEN len = plen;
4977
4978         /* If the cause for the longjmp was other than changing to utf8, pop
4979          * our own setjmp, and longjmp to the correct handler */
4980         if (jump_ret != UTF8_LONGJMP) {
4981             JMPENV_POP;
4982             JMPENV_JUMP(jump_ret);
4983         }
4984
4985         GET_RE_DEBUG_FLAGS;
4986
4987         /* It's possible to write a regexp in ascii that represents Unicode
4988         codepoints outside of the byte range, such as via \x{100}. If we
4989         detect such a sequence we have to convert the entire pattern to utf8
4990         and then recompile, as our sizing calculation will have been based
4991         on 1 byte == 1 character, but we will need to use utf8 to encode
4992         at least some part of the pattern, and therefore must convert the whole
4993         thing.
4994         -- dmq */
4995         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4996             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4997         exp = (char*)Perl_bytes_to_utf8(aTHX_
4998                                         (U8*)SvPV_nomg(pattern, plen),
4999                                         &len);
5000         xend = exp + len;
5001         RExC_orig_utf8 = RExC_utf8 = 1;
5002         SAVEFREEPV(exp);
5003     }
5004
5005 #ifdef TRIE_STUDY_OPT
5006     restudied = 0;
5007 #endif
5008
5009     pm_flags = orig_pm_flags;
5010
5011     if (initial_charset == REGEX_LOCALE_CHARSET) {
5012         RExC_contains_locale = 1;
5013     }
5014     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5015
5016         /* Set to use unicode semantics if the pattern is in utf8 and has the
5017          * 'depends' charset specified, as it means unicode when utf8  */
5018         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5019     }
5020
5021     RExC_precomp = exp;
5022     RExC_flags = pm_flags;
5023     RExC_sawback = 0;
5024
5025     RExC_seen = 0;
5026     RExC_in_lookbehind = 0;
5027     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5028     RExC_seen_evals = 0;
5029     RExC_extralen = 0;
5030     RExC_override_recoding = 0;
5031
5032     /* First pass: determine size, legality. */
5033     RExC_parse = exp;
5034     RExC_start = exp;
5035     RExC_end = xend;
5036     RExC_naughty = 0;
5037     RExC_npar = 1;
5038     RExC_nestroot = 0;
5039     RExC_size = 0L;
5040     RExC_emit = &PL_regdummy;
5041     RExC_whilem_seen = 0;
5042     RExC_open_parens = NULL;
5043     RExC_close_parens = NULL;
5044     RExC_opend = NULL;
5045     RExC_paren_names = NULL;
5046 #ifdef DEBUGGING
5047     RExC_paren_name_list = NULL;
5048 #endif
5049     RExC_recurse = NULL;
5050     RExC_recurse_count = 0;
5051
5052 #if 0 /* REGC() is (currently) a NOP at the first pass.
5053        * Clever compilers notice this and complain. --jhi */
5054     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5055 #endif
5056     DEBUG_PARSE_r(
5057         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5058         RExC_lastnum=0;
5059         RExC_lastparse=NULL;
5060     );
5061     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5062         RExC_precomp = NULL;
5063         return(NULL);
5064     }
5065
5066     /* Here, finished first pass.  Get rid of any added setjmp */
5067     if (used_setjump) {
5068         JMPENV_POP;
5069     }
5070
5071     DEBUG_PARSE_r({
5072         PerlIO_printf(Perl_debug_log,
5073             "Required size %"IVdf" nodes\n"
5074             "Starting second pass (creation)\n",
5075             (IV)RExC_size);
5076         RExC_lastnum=0;
5077         RExC_lastparse=NULL;
5078     });
5079
5080     /* The first pass could have found things that force Unicode semantics */
5081     if ((RExC_utf8 || RExC_uni_semantics)
5082          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5083     {
5084         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5085     }
5086
5087     /* Small enough for pointer-storage convention?
5088        If extralen==0, this means that we will not need long jumps. */
5089     if (RExC_size >= 0x10000L && RExC_extralen)
5090         RExC_size += RExC_extralen;
5091     else
5092         RExC_extralen = 0;
5093     if (RExC_whilem_seen > 15)
5094         RExC_whilem_seen = 15;
5095
5096     /* Allocate space and zero-initialize. Note, the two step process
5097        of zeroing when in debug mode, thus anything assigned has to
5098        happen after that */
5099     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5100     r = (struct regexp*)SvANY(rx);
5101     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5102          char, regexp_internal);
5103     if ( r == NULL || ri == NULL )
5104         FAIL("Regexp out of space");
5105 #ifdef DEBUGGING
5106     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5107     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5108 #else
5109     /* bulk initialize base fields with 0. */
5110     Zero(ri, sizeof(regexp_internal), char);
5111 #endif
5112
5113     /* non-zero initialization begins here */
5114     RXi_SET( r, ri );
5115     r->engine= RE_ENGINE_PTR;
5116     r->extflags = pm_flags;
5117     {
5118         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5119         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5120
5121         /* The caret is output if there are any defaults: if not all the STD
5122          * flags are set, or if no character set specifier is needed */
5123         bool has_default =
5124                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5125                     || ! has_charset);
5126         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5127         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5128                             >> RXf_PMf_STD_PMMOD_SHIFT);
5129         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5130         char *p;
5131         /* Allocate for the worst case, which is all the std flags are turned
5132          * on.  If more precision is desired, we could do a population count of
5133          * the flags set.  This could be done with a small lookup table, or by
5134          * shifting, masking and adding, or even, when available, assembly
5135          * language for a machine-language population count.
5136          * We never output a minus, as all those are defaults, so are
5137          * covered by the caret */
5138         const STRLEN wraplen = plen + has_p + has_runon
5139             + has_default       /* If needs a caret */
5140
5141                 /* If needs a character set specifier */
5142             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5143             + (sizeof(STD_PAT_MODS) - 1)
5144             + (sizeof("(?:)") - 1);
5145
5146         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5147         SvPOK_on(rx);
5148         SvFLAGS(rx) |= SvUTF8(pattern);
5149         *p++='('; *p++='?';
5150
5151         /* If a default, cover it using the caret */
5152         if (has_default) {
5153             *p++= DEFAULT_PAT_MOD;
5154         }
5155         if (has_charset) {
5156             STRLEN len;
5157             const char* const name = get_regex_charset_name(r->extflags, &len);
5158             Copy(name, p, len, char);
5159             p += len;
5160         }
5161         if (has_p)
5162             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5163         {
5164             char ch;
5165             while((ch = *fptr++)) {
5166                 if(reganch & 1)
5167                     *p++ = ch;
5168                 reganch >>= 1;
5169             }
5170         }
5171
5172         *p++ = ':';
5173         Copy(RExC_precomp, p, plen, char);
5174         assert ((RX_WRAPPED(rx) - p) < 16);
5175         r->pre_prefix = p - RX_WRAPPED(rx);
5176         p += plen;
5177         if (has_runon)
5178             *p++ = '\n';
5179         *p++ = ')';
5180         *p = 0;
5181         SvCUR_set(rx, p - SvPVX_const(rx));
5182     }
5183
5184     r->intflags = 0;
5185     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5186
5187     if (RExC_seen & REG_SEEN_RECURSE) {
5188         Newxz(RExC_open_parens, RExC_npar,regnode *);
5189         SAVEFREEPV(RExC_open_parens);
5190         Newxz(RExC_close_parens,RExC_npar,regnode *);
5191         SAVEFREEPV(RExC_close_parens);
5192     }
5193
5194     /* Useful during FAIL. */
5195 #ifdef RE_TRACK_PATTERN_OFFSETS
5196     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5197     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5198                           "%s %"UVuf" bytes for offset annotations.\n",
5199                           ri->u.offsets ? "Got" : "Couldn't get",
5200                           (UV)((2*RExC_size+1) * sizeof(U32))));
5201 #endif
5202     SetProgLen(ri,RExC_size);
5203     RExC_rx_sv = rx;
5204     RExC_rx = r;
5205     RExC_rxi = ri;
5206
5207     /* Second pass: emit code. */
5208     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5209     RExC_parse = exp;
5210     RExC_end = xend;
5211     RExC_naughty = 0;
5212     RExC_npar = 1;
5213     RExC_emit_start = ri->program;
5214     RExC_emit = ri->program;
5215     RExC_emit_bound = ri->program + RExC_size + 1;
5216
5217     /* Store the count of eval-groups for security checks: */
5218     RExC_rx->seen_evals = RExC_seen_evals;
5219     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5220     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5221         ReREFCNT_dec(rx);
5222         return(NULL);
5223     }
5224     /* XXXX To minimize changes to RE engine we always allocate
5225        3-units-long substrs field. */
5226     Newx(r->substrs, 1, struct reg_substr_data);
5227     if (RExC_recurse_count) {
5228         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5229         SAVEFREEPV(RExC_recurse);
5230     }
5231
5232 reStudy:
5233     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5234     Zero(r->substrs, 1, struct reg_substr_data);
5235
5236 #ifdef TRIE_STUDY_OPT
5237     if (!restudied) {
5238         StructCopy(&zero_scan_data, &data, scan_data_t);
5239         copyRExC_state = RExC_state;
5240     } else {
5241         U32 seen=RExC_seen;
5242         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5243
5244         RExC_state = copyRExC_state;
5245         if (seen & REG_TOP_LEVEL_BRANCHES)
5246             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5247         else
5248             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5249         if (data.last_found) {
5250             SvREFCNT_dec(data.longest_fixed);
5251             SvREFCNT_dec(data.longest_float);
5252             SvREFCNT_dec(data.last_found);
5253         }
5254         StructCopy(&zero_scan_data, &data, scan_data_t);
5255     }
5256 #else
5257     StructCopy(&zero_scan_data, &data, scan_data_t);
5258 #endif
5259
5260     /* Dig out information for optimizations. */
5261     r->extflags = RExC_flags; /* was pm_op */
5262     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5263
5264     if (UTF)
5265         SvUTF8_on(rx);  /* Unicode in it? */
5266     ri->regstclass = NULL;
5267     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5268         r->intflags |= PREGf_NAUGHTY;
5269     scan = ri->program + 1;             /* First BRANCH. */
5270
5271     /* testing for BRANCH here tells us whether there is "must appear"
5272        data in the pattern. If there is then we can use it for optimisations */
5273     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5274         I32 fake;
5275         STRLEN longest_float_length, longest_fixed_length;
5276         struct regnode_charclass_class ch_class; /* pointed to by data */
5277         int stclass_flag;
5278         I32 last_close = 0; /* pointed to by data */
5279         regnode *first= scan;
5280         regnode *first_next= regnext(first);
5281         /*
5282          * Skip introductions and multiplicators >= 1
5283          * so that we can extract the 'meat' of the pattern that must
5284          * match in the large if() sequence following.
5285          * NOTE that EXACT is NOT covered here, as it is normally
5286          * picked up by the optimiser separately.
5287          *
5288          * This is unfortunate as the optimiser isnt handling lookahead
5289          * properly currently.
5290          *
5291          */
5292         while ((OP(first) == OPEN && (sawopen = 1)) ||
5293                /* An OR of *one* alternative - should not happen now. */
5294             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5295             /* for now we can't handle lookbehind IFMATCH*/
5296             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5297             (OP(first) == PLUS) ||
5298             (OP(first) == MINMOD) ||
5299                /* An {n,m} with n>0 */
5300             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5301             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5302         {
5303                 /*
5304                  * the only op that could be a regnode is PLUS, all the rest
5305                  * will be regnode_1 or regnode_2.
5306                  *
5307                  */
5308                 if (OP(first) == PLUS)
5309                     sawplus = 1;
5310                 else
5311                     first += regarglen[OP(first)];
5312
5313                 first = NEXTOPER(first);
5314                 first_next= regnext(first);
5315         }
5316
5317         /* Starting-point info. */
5318       again:
5319         DEBUG_PEEP("first:",first,0);
5320         /* Ignore EXACT as we deal with it later. */
5321         if (PL_regkind[OP(first)] == EXACT) {
5322             if (OP(first) == EXACT)
5323                 NOOP;   /* Empty, get anchored substr later. */
5324             else
5325                 ri->regstclass = first;
5326         }
5327 #ifdef TRIE_STCLASS
5328         else if (PL_regkind[OP(first)] == TRIE &&
5329                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5330         {
5331             regnode *trie_op;
5332             /* this can happen only on restudy */
5333             if ( OP(first) == TRIE ) {
5334                 struct regnode_1 *trieop = (struct regnode_1 *)
5335                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5336                 StructCopy(first,trieop,struct regnode_1);
5337                 trie_op=(regnode *)trieop;
5338             } else {
5339                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5340                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5341                 StructCopy(first,trieop,struct regnode_charclass);
5342                 trie_op=(regnode *)trieop;
5343             }
5344             OP(trie_op)+=2;
5345             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5346             ri->regstclass = trie_op;
5347         }
5348 #endif
5349         else if (REGNODE_SIMPLE(OP(first)))
5350             ri->regstclass = first;
5351         else if (PL_regkind[OP(first)] == BOUND ||
5352                  PL_regkind[OP(first)] == NBOUND)
5353             ri->regstclass = first;
5354         else if (PL_regkind[OP(first)] == BOL) {
5355             r->extflags |= (OP(first) == MBOL
5356                            ? RXf_ANCH_MBOL
5357                            : (OP(first) == SBOL
5358                               ? RXf_ANCH_SBOL
5359                               : RXf_ANCH_BOL));
5360             first = NEXTOPER(first);
5361             goto again;
5362         }
5363         else if (OP(first) == GPOS) {
5364             r->extflags |= RXf_ANCH_GPOS;
5365             first = NEXTOPER(first);
5366             goto again;
5367         }
5368         else if ((!sawopen || !RExC_sawback) &&
5369             (OP(first) == STAR &&
5370             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5371             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5372         {
5373             /* turn .* into ^.* with an implied $*=1 */
5374             const int type =
5375                 (OP(NEXTOPER(first)) == REG_ANY)
5376                     ? RXf_ANCH_MBOL
5377                     : RXf_ANCH_SBOL;
5378             r->extflags |= type;
5379             r->intflags |= PREGf_IMPLICIT;
5380             first = NEXTOPER(first);
5381             goto again;
5382         }
5383         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5384             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5385             /* x+ must match at the 1st pos of run of x's */
5386             r->intflags |= PREGf_SKIP;
5387
5388         /* Scan is after the zeroth branch, first is atomic matcher. */
5389 #ifdef TRIE_STUDY_OPT
5390         DEBUG_PARSE_r(
5391             if (!restudied)
5392                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5393                               (IV)(first - scan + 1))
5394         );
5395 #else
5396         DEBUG_PARSE_r(
5397             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5398                 (IV)(first - scan + 1))
5399         );
5400 #endif
5401
5402
5403         /*
5404         * If there's something expensive in the r.e., find the
5405         * longest literal string that must appear and make it the
5406         * regmust.  Resolve ties in favor of later strings, since
5407         * the regstart check works with the beginning of the r.e.
5408         * and avoiding duplication strengthens checking.  Not a
5409         * strong reason, but sufficient in the absence of others.
5410         * [Now we resolve ties in favor of the earlier string if
5411         * it happens that c_offset_min has been invalidated, since the
5412         * earlier string may buy us something the later one won't.]
5413         */
5414
5415         data.longest_fixed = newSVpvs("");
5416         data.longest_float = newSVpvs("");
5417         data.last_found = newSVpvs("");
5418         data.longest = &(data.longest_fixed);
5419         first = scan;
5420         if (!ri->regstclass) {
5421             cl_init(pRExC_state, &ch_class);
5422             data.start_class = &ch_class;
5423             stclass_flag = SCF_DO_STCLASS_AND;
5424         } else                          /* XXXX Check for BOUND? */
5425             stclass_flag = 0;
5426         data.last_closep = &last_close;
5427
5428         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5429             &data, -1, NULL, NULL,
5430             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5431
5432
5433         CHECK_RESTUDY_GOTO;
5434
5435
5436         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5437              && data.last_start_min == 0 && data.last_end > 0
5438              && !RExC_seen_zerolen
5439              && !(RExC_seen & REG_SEEN_VERBARG)
5440              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5441             r->extflags |= RXf_CHECK_ALL;
5442         scan_commit(pRExC_state, &data,&minlen,0);
5443         SvREFCNT_dec(data.last_found);
5444
5445         /* Note that code very similar to this but for anchored string
5446            follows immediately below, changes may need to be made to both.
5447            Be careful.
5448          */
5449         longest_float_length = CHR_SVLEN(data.longest_float);
5450         if (longest_float_length
5451             || (data.flags & SF_FL_BEFORE_EOL
5452                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5453                     || (RExC_flags & RXf_PMf_MULTILINE))))
5454         {
5455             I32 t,ml;
5456
5457             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5458             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5459                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5460                     && data.offset_fixed == data.offset_float_min
5461                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5462                     goto remove_float;          /* As in (a)+. */
5463
5464             /* copy the information about the longest float from the reg_scan_data
5465                over to the program. */
5466             if (SvUTF8(data.longest_float)) {
5467                 r->float_utf8 = data.longest_float;
5468                 r->float_substr = NULL;
5469             } else {
5470                 r->float_substr = data.longest_float;
5471                 r->float_utf8 = NULL;
5472             }
5473             /* float_end_shift is how many chars that must be matched that
5474                follow this item. We calculate it ahead of time as once the
5475                lookbehind offset is added in we lose the ability to correctly
5476                calculate it.*/
5477             ml = data.minlen_float ? *(data.minlen_float)
5478                                    : (I32)longest_float_length;
5479             r->float_end_shift = ml - data.offset_float_min
5480                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5481                 + data.lookbehind_float;
5482             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5483             r->float_max_offset = data.offset_float_max;
5484             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5485                 r->float_max_offset -= data.lookbehind_float;
5486
5487             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5488                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5489                            || (RExC_flags & RXf_PMf_MULTILINE)));
5490             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5491         }
5492         else {
5493           remove_float:
5494             r->float_substr = r->float_utf8 = NULL;
5495             SvREFCNT_dec(data.longest_float);
5496             longest_float_length = 0;
5497         }
5498
5499         /* Note that code very similar to this but for floating string
5500            is immediately above, changes may need to be made to both.
5501            Be careful.
5502          */
5503         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5504
5505         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5506         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5507             && (longest_fixed_length
5508                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5509                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5510                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5511         {
5512             I32 t,ml;
5513
5514             /* copy the information about the longest fixed
5515                from the reg_scan_data over to the program. */
5516             if (SvUTF8(data.longest_fixed)) {
5517                 r->anchored_utf8 = data.longest_fixed;
5518                 r->anchored_substr = NULL;
5519             } else {
5520                 r->anchored_substr = data.longest_fixed;
5521                 r->anchored_utf8 = NULL;
5522             }
5523             /* fixed_end_shift is how many chars that must be matched that
5524                follow this item. We calculate it ahead of time as once the
5525                lookbehind offset is added in we lose the ability to correctly
5526                calculate it.*/
5527             ml = data.minlen_fixed ? *(data.minlen_fixed)
5528                                    : (I32)longest_fixed_length;
5529             r->anchored_end_shift = ml - data.offset_fixed
5530                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5531                 + data.lookbehind_fixed;
5532             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5533
5534             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5535                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5536                      || (RExC_flags & RXf_PMf_MULTILINE)));
5537             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5538         }
5539         else {
5540             r->anchored_substr = r->anchored_utf8 = NULL;
5541             SvREFCNT_dec(data.longest_fixed);
5542             longest_fixed_length = 0;
5543         }
5544         if (ri->regstclass
5545             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5546             ri->regstclass = NULL;
5547
5548         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5549             && stclass_flag
5550             && !(data.start_class->flags & ANYOF_EOS)
5551             && !cl_is_anything(data.start_class))
5552         {
5553             const U32 n = add_data(pRExC_state, 1, "f");
5554             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5555
5556             Newx(RExC_rxi->data->data[n], 1,
5557                 struct regnode_charclass_class);
5558             StructCopy(data.start_class,
5559                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5560                        struct regnode_charclass_class);
5561             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5562             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5563             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5564                       regprop(r, sv, (regnode*)data.start_class);
5565                       PerlIO_printf(Perl_debug_log,
5566                                     "synthetic stclass \"%s\".\n",
5567                                     SvPVX_const(sv));});
5568         }
5569
5570         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5571         if (longest_fixed_length > longest_float_length) {
5572             r->check_end_shift = r->anchored_end_shift;
5573             r->check_substr = r->anchored_substr;
5574             r->check_utf8 = r->anchored_utf8;
5575             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5576             if (r->extflags & RXf_ANCH_SINGLE)
5577                 r->extflags |= RXf_NOSCAN;
5578         }
5579         else {
5580             r->check_end_shift = r->float_end_shift;
5581             r->check_substr = r->float_substr;
5582             r->check_utf8 = r->float_utf8;
5583             r->check_offset_min = r->float_min_offset;
5584             r->check_offset_max = r->float_max_offset;
5585         }
5586         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5587            This should be changed ASAP!  */
5588         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5589             r->extflags |= RXf_USE_INTUIT;
5590             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5591                 r->extflags |= RXf_INTUIT_TAIL;
5592         }
5593         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5594         if ( (STRLEN)minlen < longest_float_length )
5595             minlen= longest_float_length;
5596         if ( (STRLEN)minlen < longest_fixed_length )
5597             minlen= longest_fixed_length;
5598         */
5599     }
5600     else {
5601         /* Several toplevels. Best we can is to set minlen. */
5602         I32 fake;
5603         struct regnode_charclass_class ch_class;
5604         I32 last_close = 0;
5605
5606         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5607
5608         scan = ri->program + 1;
5609         cl_init(pRExC_state, &ch_class);
5610         data.start_class = &ch_class;
5611         data.last_closep = &last_close;
5612
5613
5614         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5615             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5616
5617         CHECK_RESTUDY_GOTO;
5618
5619         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5620                 = r->float_substr = r->float_utf8 = NULL;
5621
5622         if (!(data.start_class->flags & ANYOF_EOS)
5623             && !cl_is_anything(data.start_class))
5624         {
5625             const U32 n = add_data(pRExC_state, 1, "f");
5626             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5627
5628             Newx(RExC_rxi->data->data[n], 1,
5629                 struct regnode_charclass_class);
5630             StructCopy(data.start_class,
5631                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5632                        struct regnode_charclass_class);
5633             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5634             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5635             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5636                       regprop(r, sv, (regnode*)data.start_class);
5637                       PerlIO_printf(Perl_debug_log,
5638                                     "synthetic stclass \"%s\".\n",
5639                                     SvPVX_const(sv));});
5640         }
5641     }
5642
5643     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5644        the "real" pattern. */
5645     DEBUG_OPTIMISE_r({
5646         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5647                       (IV)minlen, (IV)r->minlen);
5648     });
5649     r->minlenret = minlen;
5650     if (r->minlen < minlen)
5651         r->minlen = minlen;
5652
5653     if (RExC_seen & REG_SEEN_GPOS)
5654         r->extflags |= RXf_GPOS_SEEN;
5655     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5656         r->extflags |= RXf_LOOKBEHIND_SEEN;
5657     if (RExC_seen & REG_SEEN_EVAL)
5658         r->extflags |= RXf_EVAL_SEEN;
5659     if (RExC_seen & REG_SEEN_CANY)
5660         r->extflags |= RXf_CANY_SEEN;
5661     if (RExC_seen & REG_SEEN_VERBARG)
5662         r->intflags |= PREGf_VERBARG_SEEN;
5663     if (RExC_seen & REG_SEEN_CUTGROUP)
5664         r->intflags |= PREGf_CUTGROUP_SEEN;
5665     if (RExC_paren_names)
5666         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5667     else
5668         RXp_PAREN_NAMES(r) = NULL;
5669
5670 #ifdef STUPID_PATTERN_CHECKS
5671     if (RX_PRELEN(rx) == 0)
5672         r->extflags |= RXf_NULL;
5673     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5674         /* XXX: this should happen BEFORE we compile */
5675         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5676     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5677         r->extflags |= RXf_WHITE;
5678     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5679         r->extflags |= RXf_START_ONLY;
5680 #else
5681     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5682             /* XXX: this should happen BEFORE we compile */
5683             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5684     else {
5685         regnode *first = ri->program + 1;
5686         U8 fop = OP(first);
5687
5688         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5689             r->extflags |= RXf_NULL;
5690         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5691             r->extflags |= RXf_START_ONLY;
5692         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5693                              && OP(regnext(first)) == END)
5694             r->extflags |= RXf_WHITE;
5695     }
5696 #endif
5697 #ifdef DEBUGGING
5698     if (RExC_paren_names) {
5699         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5700         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5701     } else
5702 #endif
5703         ri->name_list_idx = 0;
5704
5705     if (RExC_recurse_count) {
5706         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5707             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5708             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5709         }
5710     }
5711     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5712     /* assume we don't need to swap parens around before we match */
5713
5714     DEBUG_DUMP_r({
5715         PerlIO_printf(Perl_debug_log,"Final program:\n");
5716         regdump(r);
5717     });
5718 #ifdef RE_TRACK_PATTERN_OFFSETS
5719     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5720         const U32 len = ri->u.offsets[0];
5721         U32 i;
5722         GET_RE_DEBUG_FLAGS_DECL;
5723         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5724         for (i = 1; i <= len; i++) {
5725             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5726                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5727                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5728             }
5729         PerlIO_printf(Perl_debug_log, "\n");
5730     });
5731 #endif
5732     return rx;
5733 }
5734
5735 #undef RE_ENGINE_PTR
5736
5737
5738 SV*
5739 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5740                     const U32 flags)
5741 {
5742     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5743
5744     PERL_UNUSED_ARG(value);
5745
5746     if (flags & RXapif_FETCH) {
5747         return reg_named_buff_fetch(rx, key, flags);
5748     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5749         Perl_croak_no_modify(aTHX);
5750         return NULL;
5751     } else if (flags & RXapif_EXISTS) {
5752         return reg_named_buff_exists(rx, key, flags)
5753             ? &PL_sv_yes
5754             : &PL_sv_no;
5755     } else if (flags & RXapif_REGNAMES) {
5756         return reg_named_buff_all(rx, flags);
5757     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5758         return reg_named_buff_scalar(rx, flags);
5759     } else {
5760         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5761         return NULL;
5762     }
5763 }
5764
5765 SV*
5766 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5767                          const U32 flags)
5768 {
5769     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5770     PERL_UNUSED_ARG(lastkey);
5771
5772     if (flags & RXapif_FIRSTKEY)
5773         return reg_named_buff_firstkey(rx, flags);
5774     else if (flags & RXapif_NEXTKEY)
5775         return reg_named_buff_nextkey(rx, flags);
5776     else {
5777         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5778         return NULL;
5779     }
5780 }
5781
5782 SV*
5783 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5784                           const U32 flags)
5785 {
5786     AV *retarray = NULL;
5787     SV *ret;
5788     struct regexp *const rx = (struct regexp *)SvANY(r);
5789
5790     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5791
5792     if (flags & RXapif_ALL)
5793         retarray=newAV();
5794
5795     if (rx && RXp_PAREN_NAMES(rx)) {
5796         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5797         if (he_str) {
5798             IV i;
5799             SV* sv_dat=HeVAL(he_str);
5800             I32 *nums=(I32*)SvPVX(sv_dat);
5801             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5802                 if ((I32)(rx->nparens) >= nums[i]
5803                     && rx->offs[nums[i]].start != -1
5804                     && rx->offs[nums[i]].end != -1)
5805                 {
5806                     ret = newSVpvs("");
5807                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5808                     if (!retarray)
5809                         return ret;
5810                 } else {
5811                     if (retarray)
5812                         ret = newSVsv(&PL_sv_undef);
5813                 }
5814                 if (retarray)
5815                     av_push(retarray, ret);
5816             }
5817             if (retarray)
5818                 return newRV_noinc(MUTABLE_SV(retarray));
5819         }
5820     }
5821     return NULL;
5822 }
5823
5824 bool
5825 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5826                            const U32 flags)
5827 {
5828     struct regexp *const rx = (struct regexp *)SvANY(r);
5829
5830     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5831
5832     if (rx && RXp_PAREN_NAMES(rx)) {
5833         if (flags & RXapif_ALL) {
5834             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5835         } else {
5836             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5837             if (sv) {
5838                 SvREFCNT_dec(sv);
5839                 return TRUE;
5840             } else {
5841                 return FALSE;
5842             }
5843         }
5844     } else {
5845         return FALSE;
5846     }
5847 }
5848
5849 SV*
5850 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5851 {
5852     struct regexp *const rx = (struct regexp *)SvANY(r);
5853
5854     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5855
5856     if ( rx && RXp_PAREN_NAMES(rx) ) {
5857         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5858
5859         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5860     } else {
5861         return FALSE;
5862     }
5863 }
5864
5865 SV*
5866 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5867 {
5868     struct regexp *const rx = (struct regexp *)SvANY(r);
5869     GET_RE_DEBUG_FLAGS_DECL;
5870
5871     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5872
5873     if (rx && RXp_PAREN_NAMES(rx)) {
5874         HV *hv = RXp_PAREN_NAMES(rx);
5875         HE *temphe;
5876         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5877             IV i;
5878             IV parno = 0;
5879             SV* sv_dat = HeVAL(temphe);
5880             I32 *nums = (I32*)SvPVX(sv_dat);
5881             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5882                 if ((I32)(rx->lastparen) >= nums[i] &&
5883                     rx->offs[nums[i]].start != -1 &&
5884                     rx->offs[nums[i]].end != -1)
5885                 {
5886                     parno = nums[i];
5887                     break;
5888                 }
5889             }
5890             if (parno || flags & RXapif_ALL) {
5891                 return newSVhek(HeKEY_hek(temphe));
5892             }
5893         }
5894     }
5895     return NULL;
5896 }
5897
5898 SV*
5899 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5900 {
5901     SV *ret;
5902     AV *av;
5903     I32 length;
5904     struct regexp *const rx = (struct regexp *)SvANY(r);
5905
5906     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5907
5908     if (rx && RXp_PAREN_NAMES(rx)) {
5909         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5910             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5911         } else if (flags & RXapif_ONE) {
5912             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5913             av = MUTABLE_AV(SvRV(ret));
5914             length = av_len(av);
5915             SvREFCNT_dec(ret);
5916             return newSViv(length + 1);
5917         } else {
5918             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5919             return NULL;
5920         }
5921     }
5922     return &PL_sv_undef;
5923 }
5924
5925 SV*
5926 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5927 {
5928     struct regexp *const rx = (struct regexp *)SvANY(r);
5929     AV *av = newAV();
5930
5931     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5932
5933     if (rx && RXp_PAREN_NAMES(rx)) {
5934         HV *hv= RXp_PAREN_NAMES(rx);
5935         HE *temphe;
5936         (void)hv_iterinit(hv);
5937         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5938             IV i;
5939             IV parno = 0;
5940             SV* sv_dat = HeVAL(temphe);
5941             I32 *nums = (I32*)SvPVX(sv_dat);
5942             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5943                 if ((I32)(rx->lastparen) >= nums[i] &&
5944                     rx->offs[nums[i]].start != -1 &&
5945                     rx->offs[nums[i]].end != -1)
5946                 {
5947                     parno = nums[i];
5948                     break;
5949                 }
5950             }
5951             if (parno || flags & RXapif_ALL) {
5952                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5953             }
5954         }
5955     }
5956
5957     return newRV_noinc(MUTABLE_SV(av));
5958 }
5959
5960 void
5961 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5962                              SV * const sv)
5963 {
5964     struct regexp *const rx = (struct regexp *)SvANY(r);
5965     char *s = NULL;
5966     I32 i = 0;
5967     I32 s1, t1;
5968
5969     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5970
5971     if (!rx->subbeg) {
5972         sv_setsv(sv,&PL_sv_undef);
5973         return;
5974     }
5975     else
5976     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5977         /* $` */
5978         i = rx->offs[0].start;
5979         s = rx->subbeg;
5980     }
5981     else
5982     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5983         /* $' */
5984         s = rx->subbeg + rx->offs[0].end;
5985         i = rx->sublen - rx->offs[0].end;
5986     }
5987     else
5988     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5989         (s1 = rx->offs[paren].start) != -1 &&
5990         (t1 = rx->offs[paren].end) != -1)
5991     {
5992         /* $& $1 ... */
5993         i = t1 - s1;
5994         s = rx->subbeg + s1;
5995     } else {
5996         sv_setsv(sv,&PL_sv_undef);
5997         return;
5998     }
5999     assert(rx->sublen >= (s - rx->subbeg) + i );
6000     if (i >= 0) {
6001         const int oldtainted = PL_tainted;
6002         TAINT_NOT;
6003         sv_setpvn(sv, s, i);
6004         PL_tainted = oldtainted;
6005         if ( (rx->extflags & RXf_CANY_SEEN)
6006             ? (RXp_MATCH_UTF8(rx)
6007                         && (!i || is_utf8_string((U8*)s, i)))
6008             : (RXp_MATCH_UTF8(rx)) )
6009         {
6010             SvUTF8_on(sv);
6011         }
6012         else
6013             SvUTF8_off(sv);
6014         if (PL_tainting) {
6015             if (RXp_MATCH_TAINTED(rx)) {
6016                 if (SvTYPE(sv) >= SVt_PVMG) {
6017                     MAGIC* const mg = SvMAGIC(sv);
6018                     MAGIC* mgt;
6019                     PL_tainted = 1;
6020                     SvMAGIC_set(sv, mg->mg_moremagic);
6021                     SvTAINT(sv);
6022                     if ((mgt = SvMAGIC(sv))) {
6023                         mg->mg_moremagic = mgt;
6024                         SvMAGIC_set(sv, mg);
6025                     }
6026                 } else {
6027                     PL_tainted = 1;
6028                     SvTAINT(sv);
6029                 }
6030             } else
6031                 SvTAINTED_off(sv);
6032         }
6033     } else {
6034         sv_setsv(sv,&PL_sv_undef);
6035         return;
6036     }
6037 }
6038
6039 void
6040 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6041                                                          SV const * const value)
6042 {
6043     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6044
6045     PERL_UNUSED_ARG(rx);
6046     PERL_UNUSED_ARG(paren);
6047     PERL_UNUSED_ARG(value);
6048
6049     if (!PL_localizing)
6050         Perl_croak_no_modify(aTHX);
6051 }
6052
6053 I32
6054 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6055                               const I32 paren)
6056 {
6057     struct regexp *const rx = (struct regexp *)SvANY(r);
6058     I32 i;
6059     I32 s1, t1;
6060
6061     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6062
6063     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6064         switch (paren) {
6065       /* $` / ${^PREMATCH} */
6066       case RX_BUFF_IDX_PREMATCH:
6067         if (rx->offs[0].start != -1) {
6068                         i = rx->offs[0].start;
6069                         if (i > 0) {
6070                                 s1 = 0;
6071                                 t1 = i;
6072                                 goto getlen;
6073                         }
6074             }
6075         return 0;
6076       /* $' / ${^POSTMATCH} */
6077       case RX_BUFF_IDX_POSTMATCH:
6078             if (rx->offs[0].end != -1) {
6079                         i = rx->sublen - rx->offs[0].end;
6080                         if (i > 0) {
6081                                 s1 = rx->offs[0].end;
6082                                 t1 = rx->sublen;
6083                                 goto getlen;
6084                         }
6085             }
6086         return 0;
6087       /* $& / ${^MATCH}, $1, $2, ... */
6088       default:
6089             if (paren <= (I32)rx->nparens &&
6090             (s1 = rx->offs[paren].start) != -1 &&
6091             (t1 = rx->offs[paren].end) != -1)
6092             {
6093             i = t1 - s1;
6094             goto getlen;
6095         } else {
6096             if (ckWARN(WARN_UNINITIALIZED))
6097                 report_uninit((const SV *)sv);
6098             return 0;
6099         }
6100     }
6101   getlen:
6102     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6103         const char * const s = rx->subbeg + s1;
6104         const U8 *ep;
6105         STRLEN el;
6106
6107         i = t1 - s1;
6108         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6109                         i = el;
6110     }
6111     return i;
6112 }
6113
6114 SV*
6115 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6116 {
6117     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6118         PERL_UNUSED_ARG(rx);
6119         if (0)
6120             return NULL;
6121         else
6122             return newSVpvs("Regexp");
6123 }
6124
6125 /* Scans the name of a named buffer from the pattern.
6126  * If flags is REG_RSN_RETURN_NULL returns null.
6127  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6128  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6129  * to the parsed name as looked up in the RExC_paren_names hash.
6130  * If there is an error throws a vFAIL().. type exception.
6131  */
6132
6133 #define REG_RSN_RETURN_NULL    0
6134 #define REG_RSN_RETURN_NAME    1
6135 #define REG_RSN_RETURN_DATA    2
6136
6137 STATIC SV*
6138 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6139 {
6140     char *name_start = RExC_parse;
6141
6142     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6143
6144     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6145          /* skip IDFIRST by using do...while */
6146         if (UTF)
6147             do {
6148                 RExC_parse += UTF8SKIP(RExC_parse);
6149             } while (isALNUM_utf8((U8*)RExC_parse));
6150         else
6151             do {
6152                 RExC_parse++;
6153             } while (isALNUM(*RExC_parse));
6154     }
6155
6156     if ( flags ) {
6157         SV* sv_name
6158             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6159                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6160         if ( flags == REG_RSN_RETURN_NAME)
6161             return sv_name;
6162         else if (flags==REG_RSN_RETURN_DATA) {
6163             HE *he_str = NULL;
6164             SV *sv_dat = NULL;
6165             if ( ! sv_name )      /* should not happen*/
6166                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6167             if (RExC_paren_names)
6168                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6169             if ( he_str )
6170                 sv_dat = HeVAL(he_str);
6171             if ( ! sv_dat )
6172                 vFAIL("Reference to nonexistent named group");
6173             return sv_dat;
6174         }
6175         else {
6176             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6177                        (unsigned long) flags);
6178         }
6179         /* NOT REACHED */
6180     }
6181     return NULL;
6182 }
6183
6184 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6185     int rem=(int)(RExC_end - RExC_parse);                       \
6186     int cut;                                                    \
6187     int num;                                                    \
6188     int iscut=0;                                                \
6189     if (rem>10) {                                               \
6190         rem=10;                                                 \
6191         iscut=1;                                                \
6192     }                                                           \
6193     cut=10-rem;                                                 \
6194     if (RExC_lastparse!=RExC_parse)                             \
6195         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6196             rem, RExC_parse,                                    \
6197             cut + 4,                                            \
6198             iscut ? "..." : "<"                                 \
6199         );                                                      \
6200     else                                                        \
6201         PerlIO_printf(Perl_debug_log,"%16s","");                \
6202                                                                 \
6203     if (SIZE_ONLY)                                              \
6204        num = RExC_size + 1;                                     \
6205     else                                                        \
6206        num=REG_NODE_NUM(RExC_emit);                             \
6207     if (RExC_lastnum!=num)                                      \
6208        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6209     else                                                        \
6210        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6211     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6212         (int)((depth*2)), "",                                   \
6213         (funcname)                                              \
6214     );                                                          \
6215     RExC_lastnum=num;                                           \
6216     RExC_lastparse=RExC_parse;                                  \
6217 })
6218
6219
6220
6221 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6222     DEBUG_PARSE_MSG((funcname));                            \
6223     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6224 })
6225 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6226     DEBUG_PARSE_MSG((funcname));                            \
6227     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6228 })
6229
6230 /* This section of code defines the inversion list object and its methods.  The
6231  * interfaces are highly subject to change, so as much as possible is static to
6232  * this file.  An inversion list is here implemented as a malloc'd C UV array
6233  * with some added info that is placed as UVs at the beginning in a header
6234  * portion.  An inversion list for Unicode is an array of code points, sorted
6235  * by ordinal number.  The zeroth element is the first code point in the list.
6236  * The 1th element is the first element beyond that not in the list.  In other
6237  * words, the first range is
6238  *  invlist[0]..(invlist[1]-1)
6239  * The other ranges follow.  Thus every element whose index is divisible by two
6240  * marks the beginning of a range that is in the list, and every element not
6241  * divisible by two marks the beginning of a range not in the list.  A single
6242  * element inversion list that contains the single code point N generally
6243  * consists of two elements
6244  *  invlist[0] == N
6245  *  invlist[1] == N+1
6246  * (The exception is when N is the highest representable value on the
6247  * machine, in which case the list containing just it would be a single
6248  * element, itself.  By extension, if the last range in the list extends to
6249  * infinity, then the first element of that range will be in the inversion list
6250  * at a position that is divisible by two, and is the final element in the
6251  * list.)
6252  * Taking the complement (inverting) an inversion list is quite simple, if the
6253  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6254  * This implementation reserves an element at the beginning of each inversion list
6255  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6256  * beginning of the list is either that element if 0, or the next one if 1.
6257  *
6258  * More about inversion lists can be found in "Unicode Demystified"
6259  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6260  * More will be coming when functionality is added later.
6261  *
6262  * The inversion list data structure is currently implemented as an SV pointing
6263  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6264  * array of UV whose memory management is automatically handled by the existing
6265  * facilities for SV's.
6266  *
6267  * Some of the methods should always be private to the implementation, and some
6268  * should eventually be made public */
6269
6270 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6271 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6272
6273 /* This is a combination of a version and data structure type, so that one
6274  * being passed in can be validated to be an inversion list of the correct
6275  * vintage.  When the structure of the header is changed, a new random number
6276  * in the range 2**31-1 should be generated and the new() method changed to
6277  * insert that at this location.  Then, if an auxiliary program doesn't change
6278  * correspondingly, it will be discovered immediately */
6279 #define INVLIST_VERSION_ID_OFFSET 2
6280 #define INVLIST_VERSION_ID 1064334010
6281
6282 /* For safety, when adding new elements, remember to #undef them at the end of
6283  * the inversion list code section */
6284
6285 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
6286 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6287  * contains the code point U+00000, and begins here.  If 1, the inversion list
6288  * doesn't contain U+0000, and it begins at the next UV in the array.
6289  * Inverting an inversion list consists of adding or removing the 0 at the
6290  * beginning of it.  By reserving a space for that 0, inversion can be made
6291  * very fast */
6292
6293 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6294
6295 /* Internally things are UVs */
6296 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6297 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6298
6299 #define INVLIST_INITIAL_LEN 10
6300
6301 PERL_STATIC_INLINE UV*
6302 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6303 {
6304     /* Returns a pointer to the first element in the inversion list's array.
6305      * This is called upon initialization of an inversion list.  Where the
6306      * array begins depends on whether the list has the code point U+0000
6307      * in it or not.  The other parameter tells it whether the code that
6308      * follows this call is about to put a 0 in the inversion list or not.
6309      * The first element is either the element with 0, if 0, or the next one,
6310      * if 1 */
6311
6312     UV* zero = get_invlist_zero_addr(invlist);
6313
6314     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6315
6316     /* Must be empty */
6317     assert(! *get_invlist_len_addr(invlist));
6318
6319     /* 1^1 = 0; 1^0 = 1 */
6320     *zero = 1 ^ will_have_0;
6321     return zero + *zero;
6322 }
6323
6324 PERL_STATIC_INLINE UV*
6325 S_invlist_array(pTHX_ SV* const invlist)
6326 {
6327     /* Returns the pointer to the inversion list's array.  Every time the
6328      * length changes, this needs to be called in case malloc or realloc moved
6329      * it */
6330
6331     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6332
6333     /* Must not be empty.  If these fail, you probably didn't check for <len>
6334      * being non-zero before trying to get the array */
6335     assert(*get_invlist_len_addr(invlist));
6336     assert(*get_invlist_zero_addr(invlist) == 0
6337            || *get_invlist_zero_addr(invlist) == 1);
6338
6339     /* The array begins either at the element reserved for zero if the
6340      * list contains 0 (that element will be set to 0), or otherwise the next
6341      * element (in which case the reserved element will be set to 1). */
6342     return (UV *) (get_invlist_zero_addr(invlist)
6343                    + *get_invlist_zero_addr(invlist));
6344 }
6345
6346 PERL_STATIC_INLINE UV*
6347 S_get_invlist_len_addr(pTHX_ SV* invlist)
6348 {
6349     /* Return the address of the UV that contains the current number
6350      * of used elements in the inversion list */
6351
6352     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6353
6354     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6355 }
6356
6357 PERL_STATIC_INLINE UV
6358 S_invlist_len(pTHX_ SV* const invlist)
6359 {
6360     /* Returns the current number of elements stored in the inversion list's
6361      * array */
6362
6363     PERL_ARGS_ASSERT_INVLIST_LEN;
6364
6365     return *get_invlist_len_addr(invlist);
6366 }
6367
6368 PERL_STATIC_INLINE void
6369 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6370 {
6371     /* Sets the current number of elements stored in the inversion list */
6372
6373     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6374
6375     *get_invlist_len_addr(invlist) = len;
6376
6377     assert(len <= SvLEN(invlist));
6378
6379     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6380     /* If the list contains U+0000, that element is part of the header,
6381      * and should not be counted as part of the array.  It will contain
6382      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6383      * subtract:
6384      *  SvCUR_set(invlist,
6385      *            TO_INTERNAL_SIZE(len
6386      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6387      * But, this is only valid if len is not 0.  The consequences of not doing
6388      * this is that the memory allocation code may think that 1 more UV is
6389      * being used than actually is, and so might do an unnecessary grow.  That
6390      * seems worth not bothering to make this the precise amount.
6391      *
6392      * Note that when inverting, SvCUR shouldn't change */
6393 }
6394
6395 PERL_STATIC_INLINE UV
6396 S_invlist_max(pTHX_ SV* const invlist)
6397 {
6398     /* Returns the maximum number of elements storable in the inversion list's
6399      * array, without having to realloc() */
6400
6401     PERL_ARGS_ASSERT_INVLIST_MAX;
6402
6403     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6404 }
6405
6406 PERL_STATIC_INLINE UV*
6407 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6408 {
6409     /* Return the address of the UV that is reserved to hold 0 if the inversion
6410      * list contains 0.  This has to be the last element of the heading, as the
6411      * list proper starts with either it if 0, or the next element if not.
6412      * (But we force it to contain either 0 or 1) */
6413
6414     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6415
6416     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6417 }
6418
6419 #ifndef PERL_IN_XSUB_RE
6420 SV*
6421 Perl__new_invlist(pTHX_ IV initial_size)
6422 {
6423
6424     /* Return a pointer to a newly constructed inversion list, with enough
6425      * space to store 'initial_size' elements.  If that number is negative, a
6426      * system default is used instead */
6427
6428     SV* new_list;
6429
6430     if (initial_size < 0) {
6431         initial_size = INVLIST_INITIAL_LEN;
6432     }
6433
6434     /* Allocate the initial space */
6435     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6436     invlist_set_len(new_list, 0);
6437
6438     /* Force iterinit() to be used to get iteration to work */
6439     *get_invlist_iter_addr(new_list) = UV_MAX;
6440
6441     /* This should force a segfault if a method doesn't initialize this
6442      * properly */
6443     *get_invlist_zero_addr(new_list) = UV_MAX;
6444
6445     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6446 #if HEADER_LENGTH != 4
6447 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6448 #endif
6449
6450     return new_list;
6451 }
6452 #endif
6453
6454 STATIC SV*
6455 S__new_invlist_C_array(pTHX_ UV* list)
6456 {
6457     /* Return a pointer to a newly constructed inversion list, initialized to
6458      * point to <list>, which has to be in the exact correct inversion list
6459      * form, including internal fields.  Thus this is a dangerous routine that
6460      * should not be used in the wrong hands */
6461
6462     SV* invlist = newSV_type(SVt_PV);
6463
6464     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6465
6466     SvPV_set(invlist, (char *) list);
6467     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6468                                shouldn't touch it */
6469     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6470
6471     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6472         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6473     }
6474
6475     return invlist;
6476 }
6477
6478 STATIC void
6479 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6480 {
6481     /* Grow the maximum size of an inversion list */
6482
6483     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6484
6485     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6486 }
6487
6488 PERL_STATIC_INLINE void
6489 S_invlist_trim(pTHX_ SV* const invlist)
6490 {
6491     PERL_ARGS_ASSERT_INVLIST_TRIM;
6492
6493     /* Change the length of the inversion list to how many entries it currently
6494      * has */
6495
6496     SvPV_shrink_to_cur((SV *) invlist);
6497 }
6498
6499 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6500  * etc */
6501 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6502 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6503
6504 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6505
6506 #ifndef PERL_IN_XSUB_RE
6507 void
6508 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6509 {
6510    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6511     * the end of the inversion list.  The range must be above any existing
6512     * ones. */
6513
6514     UV* array;
6515     UV max = invlist_max(invlist);
6516     UV len = invlist_len(invlist);
6517
6518     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6519
6520     if (len == 0) { /* Empty lists must be initialized */
6521         array = _invlist_array_init(invlist, start == 0);
6522     }
6523     else {
6524         /* Here, the existing list is non-empty. The current max entry in the
6525          * list is generally the first value not in the set, except when the
6526          * set extends to the end of permissible values, in which case it is
6527          * the first entry in that final set, and so this call is an attempt to
6528          * append out-of-order */
6529
6530         UV final_element = len - 1;
6531         array = invlist_array(invlist);
6532         if (array[final_element] > start
6533             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6534         {
6535             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6536                        array[final_element], start,
6537                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6538         }
6539
6540         /* Here, it is a legal append.  If the new range begins with the first
6541          * value not in the set, it is extending the set, so the new first
6542          * value not in the set is one greater than the newly extended range.
6543          * */
6544         if (array[final_element] == start) {
6545             if (end != UV_MAX) {
6546                 array[final_element] = end + 1;
6547             }
6548             else {
6549                 /* But if the end is the maximum representable on the machine,
6550                  * just let the range that this would extend to have no end */
6551                 invlist_set_len(invlist, len - 1);
6552             }
6553             return;
6554         }
6555     }
6556
6557     /* Here the new range doesn't extend any existing set.  Add it */
6558
6559     len += 2;   /* Includes an element each for the start and end of range */
6560
6561     /* If overflows the existing space, extend, which may cause the array to be
6562      * moved */
6563     if (max < len) {
6564         invlist_extend(invlist, len);
6565         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6566                                            failure in invlist_array() */
6567         array = invlist_array(invlist);
6568     }
6569     else {
6570         invlist_set_len(invlist, len);
6571     }
6572
6573     /* The next item on the list starts the range, the one after that is
6574      * one past the new range.  */
6575     array[len - 2] = start;
6576     if (end != UV_MAX) {
6577         array[len - 1] = end + 1;
6578     }
6579     else {
6580         /* But if the end is the maximum representable on the machine, just let
6581          * the range have no end */
6582         invlist_set_len(invlist, len - 1);
6583     }
6584 }
6585
6586 STATIC IV
6587 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6588 {
6589     /* Searches the inversion list for the entry that contains the input code
6590      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6591      * return value is the index into the list's array of the range that
6592      * contains <cp> */
6593
6594     IV low = 0;
6595     IV high = invlist_len(invlist);
6596     const UV * const array = invlist_array(invlist);
6597
6598     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6599
6600     /* If list is empty or the code point is before the first element, return
6601      * failure. */
6602     if (high == 0 || cp < array[0]) {
6603         return -1;
6604     }
6605
6606     /* Binary search.  What we are looking for is <i> such that
6607      *  array[i] <= cp < array[i+1]
6608      * The loop below converges on the i+1. */
6609     while (low < high) {
6610         IV mid = (low + high) / 2;
6611         if (array[mid] <= cp) {
6612             low = mid + 1;
6613
6614             /* We could do this extra test to exit the loop early.
6615             if (cp < array[low]) {
6616                 return mid;
6617             }
6618             */
6619         }
6620         else { /* cp < array[mid] */
6621             high = mid;
6622         }
6623     }
6624
6625     return high - 1;
6626 }
6627
6628 void
6629 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6630 {
6631     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6632      * but is used when the swash has an inversion list.  This makes this much
6633      * faster, as it uses a binary search instead of a linear one.  This is
6634      * intimately tied to that function, and perhaps should be in utf8.c,
6635      * except it is intimately tied to inversion lists as well.  It assumes
6636      * that <swatch> is all 0's on input */
6637
6638     UV current = start;
6639     const IV len = invlist_len(invlist);
6640     IV i;
6641     const UV * array;
6642
6643     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6644
6645     if (len == 0) { /* Empty inversion list */
6646         return;
6647     }
6648
6649     array = invlist_array(invlist);
6650
6651     /* Find which element it is */
6652     i = invlist_search(invlist, start);
6653
6654     /* We populate from <start> to <end> */
6655     while (current < end) {
6656         UV upper;
6657
6658         /* The inversion list gives the results for every possible code point
6659          * after the first one in the list.  Only those ranges whose index is
6660          * even are ones that the inversion list matches.  For the odd ones,
6661          * and if the initial code point is not in the list, we have to skip
6662          * forward to the next element */
6663         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6664             i++;
6665             if (i >= len) { /* Finished if beyond the end of the array */
6666                 return;
6667             }
6668             current = array[i];
6669             if (current >= end) {   /* Finished if beyond the end of what we
6670                                        are populating */
6671                 return;
6672             }
6673         }
6674         assert(current >= start);
6675
6676         /* The current range ends one below the next one, except don't go past
6677          * <end> */
6678         i++;
6679         upper = (i < len && array[i] < end) ? array[i] : end;
6680
6681         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6682          * for each code point in it */
6683         for (; current < upper; current++) {
6684             const STRLEN offset = (STRLEN)(current - start);
6685             swatch[offset >> 3] |= 1 << (offset & 7);
6686         }
6687
6688         /* Quit if at the end of the list */
6689         if (i >= len) {
6690
6691             /* But first, have to deal with the highest possible code point on
6692              * the platform.  The previous code assumes that <end> is one
6693              * beyond where we want to populate, but that is impossible at the
6694              * platform's infinity, so have to handle it specially */
6695             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6696             {
6697                 const STRLEN offset = (STRLEN)(end - start);
6698                 swatch[offset >> 3] |= 1 << (offset & 7);
6699             }
6700             return;
6701         }
6702
6703         /* Advance to the next range, which will be for code points not in the
6704          * inversion list */
6705         current = array[i];
6706     }
6707
6708     return;
6709 }
6710
6711
6712 void
6713 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6714 {
6715     /* Take the union of two inversion lists and point <output> to it.  *output
6716      * should be defined upon input, and if it points to one of the two lists,
6717      * the reference count to that list will be decremented.  The first list,
6718      * <a>, may be NULL, in which case a copy of the second list is returned.
6719      * If <complement_b> is TRUE, the union is taken of the complement
6720      * (inversion) of <b> instead of b itself.
6721      *
6722      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6723      * Richard Gillam, published by Addison-Wesley, and explained at some
6724      * length there.  The preface says to incorporate its examples into your
6725      * code at your own risk.
6726      *
6727      * The algorithm is like a merge sort.
6728      *
6729      * XXX A potential performance improvement is to keep track as we go along
6730      * if only one of the inputs contributes to the result, meaning the other
6731      * is a subset of that one.  In that case, we can skip the final copy and
6732      * return the larger of the input lists, but then outside code might need
6733      * to keep track of whether to free the input list or not */
6734
6735     UV* array_a;    /* a's array */
6736     UV* array_b;
6737     UV len_a;       /* length of a's array */
6738     UV len_b;
6739
6740     SV* u;                      /* the resulting union */
6741     UV* array_u;
6742     UV len_u;
6743
6744     UV i_a = 0;             /* current index into a's array */
6745     UV i_b = 0;
6746     UV i_u = 0;
6747
6748     /* running count, as explained in the algorithm source book; items are
6749      * stopped accumulating and are output when the count changes to/from 0.
6750      * The count is incremented when we start a range that's in the set, and
6751      * decremented when we start a range that's not in the set.  So its range
6752      * is 0 to 2.  Only when the count is zero is something not in the set.
6753      */
6754     UV count = 0;
6755
6756     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6757     assert(a != b);
6758
6759     /* If either one is empty, the union is the other one */
6760     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6761         if (*output == a) {
6762             if (a != NULL) {
6763                 SvREFCNT_dec(a);
6764             }
6765         }
6766         if (*output != b) {
6767             *output = invlist_clone(b);
6768             if (complement_b) {
6769                 _invlist_invert(*output);
6770             }
6771         } /* else *output already = b; */
6772         return;
6773     }
6774     else if ((len_b = invlist_len(b)) == 0) {
6775         if (*output == b) {
6776             SvREFCNT_dec(b);
6777         }
6778
6779         /* The complement of an empty list is a list that has everything in it,
6780          * so the union with <a> includes everything too */
6781         if (complement_b) {
6782             if (a == *output) {
6783                 SvREFCNT_dec(a);
6784             }
6785             *output = _new_invlist(1);
6786             _append_range_to_invlist(*output, 0, UV_MAX);
6787         }
6788         else if (*output != a) {
6789             *output = invlist_clone(a);
6790         }
6791         /* else *output already = a; */
6792         return;
6793     }
6794
6795     /* Here both lists exist and are non-empty */
6796     array_a = invlist_array(a);
6797     array_b = invlist_array(b);
6798
6799     /* If are to take the union of 'a' with the complement of b, set it
6800      * up so are looking at b's complement. */
6801     if (complement_b) {
6802
6803         /* To complement, we invert: if the first element is 0, remove it.  To
6804          * do this, we just pretend the array starts one later, and clear the
6805          * flag as we don't have to do anything else later */
6806         if (array_b[0] == 0) {
6807             array_b++;
6808             len_b--;
6809             complement_b = FALSE;
6810         }
6811         else {
6812
6813             /* But if the first element is not zero, we unshift a 0 before the
6814              * array.  The data structure reserves a space for that 0 (which
6815              * should be a '1' right now), so physical shifting is unneeded,
6816              * but temporarily change that element to 0.  Before exiting the
6817              * routine, we must restore the element to '1' */
6818             array_b--;
6819             len_b++;
6820             array_b[0] = 0;
6821         }
6822     }
6823
6824     /* Size the union for the worst case: that the sets are completely
6825      * disjoint */
6826     u = _new_invlist(len_a + len_b);
6827
6828     /* Will contain U+0000 if either component does */
6829     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6830                                       || (len_b > 0 && array_b[0] == 0));
6831
6832     /* Go through each list item by item, stopping when exhausted one of
6833      * them */
6834     while (i_a < len_a && i_b < len_b) {
6835         UV cp;      /* The element to potentially add to the union's array */
6836         bool cp_in_set;   /* is it in the the input list's set or not */
6837
6838         /* We need to take one or the other of the two inputs for the union.
6839          * Since we are merging two sorted lists, we take the smaller of the
6840          * next items.  In case of a tie, we take the one that is in its set
6841          * first.  If we took one not in the set first, it would decrement the
6842          * count, possibly to 0 which would cause it to be output as ending the
6843          * range, and the next time through we would take the same number, and
6844          * output it again as beginning the next range.  By doing it the
6845          * opposite way, there is no possibility that the count will be
6846          * momentarily decremented to 0, and thus the two adjoining ranges will
6847          * be seamlessly merged.  (In a tie and both are in the set or both not
6848          * in the set, it doesn't matter which we take first.) */
6849         if (array_a[i_a] < array_b[i_b]
6850             || (array_a[i_a] == array_b[i_b]
6851                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6852         {
6853             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6854             cp= array_a[i_a++];
6855         }
6856         else {
6857             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6858             cp= array_b[i_b++];
6859         }
6860
6861         /* Here, have chosen which of the two inputs to look at.  Only output
6862          * if the running count changes to/from 0, which marks the
6863          * beginning/end of a range in that's in the set */
6864         if (cp_in_set) {
6865             if (count == 0) {
6866                 array_u[i_u++] = cp;
6867             }
6868             count++;
6869         }
6870         else {
6871             count--;
6872             if (count == 0) {
6873                 array_u[i_u++] = cp;
6874             }
6875         }
6876     }
6877
6878     /* Here, we are finished going through at least one of the lists, which
6879      * means there is something remaining in at most one.  We check if the list
6880      * that hasn't been exhausted is positioned such that we are in the middle
6881      * of a range in its set or not.  (i_a and i_b point to the element beyond
6882      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6883      * is potentially more to output.
6884      * There are four cases:
6885      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6886      *     in the union is entirely from the non-exhausted set.
6887      *  2) Both were in their sets, count is 2.  Nothing further should
6888      *     be output, as everything that remains will be in the exhausted
6889      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6890      *     that
6891      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6892      *     Nothing further should be output because the union includes
6893      *     everything from the exhausted set.  Not decrementing ensures that.
6894      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6895      *     decrementing to 0 insures that we look at the remainder of the
6896      *     non-exhausted set */
6897     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6898         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6899     {
6900         count--;
6901     }
6902
6903     /* The final length is what we've output so far, plus what else is about to
6904      * be output.  (If 'count' is non-zero, then the input list we exhausted
6905      * has everything remaining up to the machine's limit in its set, and hence
6906      * in the union, so there will be no further output. */
6907     len_u = i_u;
6908     if (count == 0) {
6909         /* At most one of the subexpressions will be non-zero */
6910         len_u += (len_a - i_a) + (len_b - i_b);
6911     }
6912
6913     /* Set result to final length, which can change the pointer to array_u, so
6914      * re-find it */
6915     if (len_u != invlist_len(u)) {
6916         invlist_set_len(u, len_u);
6917         invlist_trim(u);
6918         array_u = invlist_array(u);
6919     }
6920
6921     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6922      * the other) ended with everything above it not in its set.  That means
6923      * that the remaining part of the union is precisely the same as the
6924      * non-exhausted list, so can just copy it unchanged.  (If both list were
6925      * exhausted at the same time, then the operations below will be both 0.)
6926      */
6927     if (count == 0) {
6928         IV copy_count; /* At most one will have a non-zero copy count */
6929         if ((copy_count = len_a - i_a) > 0) {
6930             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6931         }
6932         else if ((copy_count = len_b - i_b) > 0) {
6933             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6934         }
6935     }
6936
6937     /*  We may be removing a reference to one of the inputs */
6938     if (a == *output || b == *output) {
6939         SvREFCNT_dec(*output);
6940     }
6941
6942     /* If we've changed b, restore it */
6943     if (complement_b) {
6944         array_b[0] = 1;
6945     }
6946
6947     *output = u;
6948     return;
6949 }
6950
6951 void
6952 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6953 {
6954     /* Take the intersection of two inversion lists and point <i> to it.  *i
6955      * should be defined upon input, and if it points to one of the two lists,
6956      * the reference count to that list will be decremented.
6957      * If <complement_b> is TRUE, the result will be the intersection of <a>
6958      * and the complement (or inversion) of <b> instead of <b> directly.
6959      *
6960      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6961      * Richard Gillam, published by Addison-Wesley, and explained at some
6962      * length there.  The preface says to incorporate its examples into your
6963      * code at your own risk.  In fact, it had bugs
6964      *
6965      * The algorithm is like a merge sort, and is essentially the same as the
6966      * union above
6967      */
6968
6969     UV* array_a;                /* a's array */
6970     UV* array_b;
6971     UV len_a;   /* length of a's array */
6972     UV len_b;
6973
6974     SV* r;                   /* the resulting intersection */
6975     UV* array_r;
6976     UV len_r;
6977
6978     UV i_a = 0;             /* current index into a's array */
6979     UV i_b = 0;
6980     UV i_r = 0;
6981
6982     /* running count, as explained in the algorithm source book; items are
6983      * stopped accumulating and are output when the count changes to/from 2.
6984      * The count is incremented when we start a range that's in the set, and
6985      * decremented when we start a range that's not in the set.  So its range
6986      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6987      */
6988     UV count = 0;
6989
6990     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6991     assert(a != b);
6992
6993     /* Special case if either one is empty */
6994     len_a = invlist_len(a);
6995     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6996
6997         if (len_a != 0 && complement_b) {
6998
6999             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7000              * be empty.  Here, also we are using 'b's complement, which hence
7001              * must be every possible code point.  Thus the intersection is
7002              * simply 'a'. */
7003             if (*i != a) {
7004                 *i = invlist_clone(a);
7005
7006                 if (*i == b) {
7007                     SvREFCNT_dec(b);
7008                 }
7009             }
7010             /* else *i is already 'a' */
7011             return;
7012         }
7013
7014         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7015          * intersection must be empty */
7016         if (*i == a) {
7017             SvREFCNT_dec(a);
7018         }
7019         else if (*i == b) {
7020             SvREFCNT_dec(b);
7021         }
7022         *i = _new_invlist(0);
7023         return;
7024     }
7025
7026     /* Here both lists exist and are non-empty */
7027     array_a = invlist_array(a);
7028     array_b = invlist_array(b);
7029
7030     /* If are to take the intersection of 'a' with the complement of b, set it
7031      * up so are looking at b's complement. */
7032     if (complement_b) {
7033
7034         /* To complement, we invert: if the first element is 0, remove it.  To
7035          * do this, we just pretend the array starts one later, and clear the
7036          * flag as we don't have to do anything else later */
7037         if (array_b[0] == 0) {
7038             array_b++;
7039             len_b--;
7040             complement_b = FALSE;
7041         }
7042         else {
7043
7044             /* But if the first element is not zero, we unshift a 0 before the
7045              * array.  The data structure reserves a space for that 0 (which
7046              * should be a '1' right now), so physical shifting is unneeded,
7047              * but temporarily change that element to 0.  Before exiting the
7048              * routine, we must restore the element to '1' */
7049             array_b--;
7050             len_b++;
7051             array_b[0] = 0;
7052         }
7053     }
7054
7055     /* Size the intersection for the worst case: that the intersection ends up
7056      * fragmenting everything to be completely disjoint */
7057     r= _new_invlist(len_a + len_b);
7058
7059     /* Will contain U+0000 iff both components do */
7060     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7061                                      && len_b > 0 && array_b[0] == 0);
7062
7063     /* Go through each list item by item, stopping when exhausted one of
7064      * them */
7065     while (i_a < len_a && i_b < len_b) {
7066         UV cp;      /* The element to potentially add to the intersection's
7067                        array */
7068         bool cp_in_set; /* Is it in the input list's set or not */
7069
7070         /* We need to take one or the other of the two inputs for the
7071          * intersection.  Since we are merging two sorted lists, we take the
7072          * smaller of the next items.  In case of a tie, we take the one that
7073          * is not in its set first (a difference from the union algorithm).  If
7074          * we took one in the set first, it would increment the count, possibly
7075          * to 2 which would cause it to be output as starting a range in the
7076          * intersection, and the next time through we would take that same
7077          * number, and output it again as ending the set.  By doing it the
7078          * opposite of this, there is no possibility that the count will be
7079          * momentarily incremented to 2.  (In a tie and both are in the set or
7080          * both not in the set, it doesn't matter which we take first.) */
7081         if (array_a[i_a] < array_b[i_b]
7082             || (array_a[i_a] == array_b[i_b]
7083                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7084         {
7085             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7086             cp= array_a[i_a++];
7087         }
7088         else {
7089             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7090             cp= array_b[i_b++];
7091         }
7092
7093         /* Here, have chosen which of the two inputs to look at.  Only output
7094          * if the running count changes to/from 2, which marks the
7095          * beginning/end of a range that's in the intersection */
7096         if (cp_in_set) {
7097             count++;
7098             if (count == 2) {
7099                 array_r[i_r++] = cp;
7100             }
7101         }
7102         else {
7103             if (count == 2) {
7104                 array_r[i_r++] = cp;
7105             }
7106             count--;
7107         }
7108     }
7109
7110     /* Here, we are finished going through at least one of the lists, which
7111      * means there is something remaining in at most one.  We check if the list
7112      * that has been exhausted is positioned such that we are in the middle
7113      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7114      * the ones we care about.)  There are four cases:
7115      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7116      *     nothing left in the intersection.
7117      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7118      *     above 2.  What should be output is exactly that which is in the
7119      *     non-exhausted set, as everything it has is also in the intersection
7120      *     set, and everything it doesn't have can't be in the intersection
7121      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7122      *     gets incremented to 2.  Like the previous case, the intersection is
7123      *     everything that remains in the non-exhausted set.
7124      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7125      *     remains 1.  And the intersection has nothing more. */
7126     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7127         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7128     {
7129         count++;
7130     }
7131
7132     /* The final length is what we've output so far plus what else is in the
7133      * intersection.  At most one of the subexpressions below will be non-zero */
7134     len_r = i_r;
7135     if (count >= 2) {
7136         len_r += (len_a - i_a) + (len_b - i_b);
7137     }
7138
7139     /* Set result to final length, which can change the pointer to array_r, so
7140      * re-find it */
7141     if (len_r != invlist_len(r)) {
7142         invlist_set_len(r, len_r);
7143         invlist_trim(r);
7144         array_r = invlist_array(r);
7145     }
7146
7147     /* Finish outputting any remaining */
7148     if (count >= 2) { /* At most one will have a non-zero copy count */
7149         IV copy_count;
7150         if ((copy_count = len_a - i_a) > 0) {
7151             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7152         }
7153         else if ((copy_count = len_b - i_b) > 0) {
7154             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7155         }
7156     }
7157
7158     /*  We may be removing a reference to one of the inputs */
7159     if (a == *i || b == *i) {
7160         SvREFCNT_dec(*i);
7161     }
7162
7163     /* If we've changed b, restore it */
7164     if (complement_b) {
7165         array_b[0] = 1;
7166     }
7167
7168     *i = r;
7169     return;
7170 }
7171
7172 #endif
7173
7174 STATIC SV*
7175 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7176 {
7177     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7178      * set.  A pointer to the inversion list is returned.  This may actually be
7179      * a new list, in which case the passed in one has been destroyed.  The
7180      * passed in inversion list can be NULL, in which case a new one is created
7181      * with just the one range in it */
7182
7183     SV* range_invlist;
7184     UV len;
7185
7186     if (invlist == NULL) {
7187         invlist = _new_invlist(2);
7188         len = 0;
7189     }
7190     else {
7191         len = invlist_len(invlist);
7192     }
7193
7194     /* If comes after the final entry, can just append it to the end */
7195     if (len == 0
7196         || start >= invlist_array(invlist)
7197                                     [invlist_len(invlist) - 1])
7198     {
7199         _append_range_to_invlist(invlist, start, end);
7200         return invlist;
7201     }
7202
7203     /* Here, can't just append things, create and return a new inversion list
7204      * which is the union of this range and the existing inversion list */
7205     range_invlist = _new_invlist(2);
7206     _append_range_to_invlist(range_invlist, start, end);
7207
7208     _invlist_union(invlist, range_invlist, &invlist);
7209
7210     /* The temporary can be freed */
7211     SvREFCNT_dec(range_invlist);
7212
7213     return invlist;
7214 }
7215
7216 PERL_STATIC_INLINE SV*
7217 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7218     return add_range_to_invlist(invlist, cp, cp);
7219 }
7220
7221 #ifndef PERL_IN_XSUB_RE
7222 void
7223 Perl__invlist_invert(pTHX_ SV* const invlist)
7224 {
7225     /* Complement the input inversion list.  This adds a 0 if the list didn't
7226      * have a zero; removes it otherwise.  As described above, the data
7227      * structure is set up so that this is very efficient */
7228
7229     UV* len_pos = get_invlist_len_addr(invlist);
7230
7231     PERL_ARGS_ASSERT__INVLIST_INVERT;
7232
7233     /* The inverse of matching nothing is matching everything */
7234     if (*len_pos == 0) {
7235         _append_range_to_invlist(invlist, 0, UV_MAX);
7236         return;
7237     }
7238
7239     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7240      * zero element was a 0, so it is being removed, so the length decrements
7241      * by 1; and vice-versa.  SvCUR is unaffected */
7242     if (*get_invlist_zero_addr(invlist) ^= 1) {
7243         (*len_pos)--;
7244     }
7245     else {
7246         (*len_pos)++;
7247     }
7248 }
7249
7250 void
7251 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7252 {
7253     /* Complement the input inversion list (which must be a Unicode property,
7254      * all of which don't match above the Unicode maximum code point.)  And
7255      * Perl has chosen to not have the inversion match above that either.  This
7256      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7257      */
7258
7259     UV len;
7260     UV* array;
7261
7262     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7263
7264     _invlist_invert(invlist);
7265
7266     len = invlist_len(invlist);
7267
7268     if (len != 0) { /* If empty do nothing */
7269         array = invlist_array(invlist);
7270         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7271             /* Add 0x110000.  First, grow if necessary */
7272             len++;
7273             if (invlist_max(invlist) < len) {
7274                 invlist_extend(invlist, len);
7275                 array = invlist_array(invlist);
7276             }
7277             invlist_set_len(invlist, len);
7278             array[len - 1] = PERL_UNICODE_MAX + 1;
7279         }
7280         else {  /* Remove the 0x110000 */
7281             invlist_set_len(invlist, len - 1);
7282         }
7283     }
7284
7285     return;
7286 }
7287 #endif
7288
7289 PERL_STATIC_INLINE SV*
7290 S_invlist_clone(pTHX_ SV* const invlist)
7291 {
7292
7293     /* Return a new inversion list that is a copy of the input one, which is
7294      * unchanged */
7295
7296     /* Need to allocate extra space to accommodate Perl's addition of a
7297      * trailing NUL to SvPV's, since it thinks they are always strings */
7298     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7299     STRLEN length = SvCUR(invlist);
7300
7301     PERL_ARGS_ASSERT_INVLIST_CLONE;
7302
7303     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7304     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7305
7306     return new_invlist;
7307 }
7308
7309 PERL_STATIC_INLINE UV*
7310 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7311 {
7312     /* Return the address of the UV that contains the current iteration
7313      * position */
7314
7315     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7316
7317     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7318 }
7319
7320 PERL_STATIC_INLINE UV*
7321 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7322 {
7323     /* Return the address of the UV that contains the version id. */
7324
7325     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7326
7327     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7328 }
7329
7330 PERL_STATIC_INLINE void
7331 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7332 {
7333     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7334
7335     *get_invlist_iter_addr(invlist) = 0;
7336 }
7337
7338 STATIC bool
7339 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7340 {
7341     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7342      * This call sets in <*start> and <*end>, the next range in <invlist>.
7343      * Returns <TRUE> if successful and the next call will return the next
7344      * range; <FALSE> if was already at the end of the list.  If the latter,
7345      * <*start> and <*end> are unchanged, and the next call to this function
7346      * will start over at the beginning of the list */
7347
7348     UV* pos = get_invlist_iter_addr(invlist);
7349     UV len = invlist_len(invlist);
7350     UV *array;
7351
7352     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7353
7354     if (*pos >= len) {
7355         *pos = UV_MAX;  /* Force iternit() to be required next time */
7356         return FALSE;
7357     }
7358
7359     array = invlist_array(invlist);
7360
7361     *start = array[(*pos)++];
7362
7363     if (*pos >= len) {
7364         *end = UV_MAX;
7365     }
7366     else {
7367         *end = array[(*pos)++] - 1;
7368     }
7369
7370     return TRUE;
7371 }
7372
7373 #ifndef PERL_IN_XSUB_RE
7374 SV *
7375 Perl__invlist_contents(pTHX_ SV* const invlist)
7376 {
7377     /* Get the contents of an inversion list into a string SV so that they can
7378      * be printed out.  It uses the format traditionally done for debug tracing
7379      */
7380
7381     UV start, end;
7382     SV* output = newSVpvs("\n");
7383
7384     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7385
7386     invlist_iterinit(invlist);
7387     while (invlist_iternext(invlist, &start, &end)) {
7388         if (end == UV_MAX) {
7389             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7390         }
7391         else if (end != start) {
7392             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7393                     start,       end);
7394         }
7395         else {
7396             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7397         }
7398     }
7399
7400     return output;
7401 }
7402 #endif
7403
7404 #if 0
7405 void
7406 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7407 {
7408     /* Dumps out the ranges in an inversion list.  The string 'header'
7409      * if present is output on a line before the first range */
7410
7411     UV start, end;
7412
7413     if (header && strlen(header)) {
7414         PerlIO_printf(Perl_debug_log, "%s\n", header);
7415     }
7416     invlist_iterinit(invlist);
7417     while (invlist_iternext(invlist, &start, &end)) {
7418         if (end == UV_MAX) {
7419             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7420         }
7421         else {
7422             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7423         }
7424     }
7425 }
7426 #endif
7427
7428 #undef HEADER_LENGTH
7429 #undef INVLIST_INITIAL_LENGTH
7430 #undef TO_INTERNAL_SIZE
7431 #undef FROM_INTERNAL_SIZE
7432 #undef INVLIST_LEN_OFFSET
7433 #undef INVLIST_ZERO_OFFSET
7434 #undef INVLIST_ITER_OFFSET
7435 #undef INVLIST_VERSION_ID
7436
7437 /* End of inversion list object */
7438
7439 /*
7440  - reg - regular expression, i.e. main body or parenthesized thing
7441  *
7442  * Caller must absorb opening parenthesis.
7443  *
7444  * Combining parenthesis handling with the base level of regular expression
7445  * is a trifle forced, but the need to tie the tails of the branches to what
7446  * follows makes it hard to avoid.
7447  */
7448 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7449 #ifdef DEBUGGING
7450 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7451 #else
7452 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7453 #endif
7454
7455 STATIC regnode *
7456 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7457     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7458 {
7459     dVAR;
7460     register regnode *ret;              /* Will be the head of the group. */
7461     register regnode *br;
7462     register regnode *lastbr;
7463     register regnode *ender = NULL;
7464     register I32 parno = 0;
7465     I32 flags;
7466     U32 oregflags = RExC_flags;
7467     bool have_branch = 0;
7468     bool is_open = 0;
7469     I32 freeze_paren = 0;
7470     I32 after_freeze = 0;
7471
7472     /* for (?g), (?gc), and (?o) warnings; warning
7473        about (?c) will warn about (?g) -- japhy    */
7474
7475 #define WASTED_O  0x01
7476 #define WASTED_G  0x02
7477 #define WASTED_C  0x04
7478 #define WASTED_GC (0x02|0x04)
7479     I32 wastedflags = 0x00;
7480
7481     char * parse_start = RExC_parse; /* MJD */
7482     char * const oregcomp_parse = RExC_parse;
7483
7484     GET_RE_DEBUG_FLAGS_DECL;
7485
7486     PERL_ARGS_ASSERT_REG;
7487     DEBUG_PARSE("reg ");
7488
7489     *flagp = 0;                         /* Tentatively. */
7490
7491
7492     /* Make an OPEN node, if parenthesized. */
7493     if (paren) {
7494         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7495             char *start_verb = RExC_parse;
7496             STRLEN verb_len = 0;
7497             char *start_arg = NULL;
7498             unsigned char op = 0;
7499             int argok = 1;
7500             int internal_argval = 0; /* internal_argval is only useful if !argok */
7501             while ( *RExC_parse && *RExC_parse != ')' ) {
7502                 if ( *RExC_parse == ':' ) {
7503                     start_arg = RExC_parse + 1;
7504                     break;
7505                 }
7506                 RExC_parse++;
7507             }
7508             ++start_verb;
7509             verb_len = RExC_parse - start_verb;
7510             if ( start_arg ) {
7511                 RExC_parse++;
7512                 while ( *RExC_parse && *RExC_parse != ')' )
7513                     RExC_parse++;
7514                 if ( *RExC_parse != ')' )
7515                     vFAIL("Unterminated verb pattern argument");
7516                 if ( RExC_parse == start_arg )
7517                     start_arg = NULL;
7518             } else {
7519                 if ( *RExC_parse != ')' )
7520                     vFAIL("Unterminated verb pattern");
7521             }
7522
7523             switch ( *start_verb ) {
7524             case 'A':  /* (*ACCEPT) */
7525                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7526                     op = ACCEPT;
7527                     internal_argval = RExC_nestroot;
7528                 }
7529                 break;
7530             case 'C':  /* (*COMMIT) */
7531                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7532                     op = COMMIT;
7533                 break;
7534             case 'F':  /* (*FAIL) */
7535                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7536                     op = OPFAIL;
7537                     argok = 0;
7538                 }
7539                 break;
7540             case ':':  /* (*:NAME) */
7541             case 'M':  /* (*MARK:NAME) */
7542                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7543                     op = MARKPOINT;
7544                     argok = -1;
7545                 }
7546                 break;
7547             case 'P':  /* (*PRUNE) */
7548                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7549                     op = PRUNE;
7550                 break;
7551             case 'S':   /* (*SKIP) */
7552                 if ( memEQs(start_verb,verb_len,"SKIP") )
7553                     op = SKIP;
7554                 break;
7555             case 'T':  /* (*THEN) */
7556                 /* [19:06] <TimToady> :: is then */
7557                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7558                     op = CUTGROUP;
7559                     RExC_seen |= REG_SEEN_CUTGROUP;
7560                 }
7561                 break;
7562             }
7563             if ( ! op ) {
7564                 RExC_parse++;
7565                 vFAIL3("Unknown verb pattern '%.*s'",
7566                     verb_len, start_verb);
7567             }
7568             if ( argok ) {
7569                 if ( start_arg && internal_argval ) {
7570                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7571                         verb_len, start_verb);
7572                 } else if ( argok < 0 && !start_arg ) {
7573                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7574                         verb_len, start_verb);
7575                 } else {
7576                     ret = reganode(pRExC_state, op, internal_argval);
7577                     if ( ! internal_argval && ! SIZE_ONLY ) {
7578                         if (start_arg) {
7579                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7580                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7581                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7582                             ret->flags = 0;
7583                         } else {
7584                             ret->flags = 1;
7585                         }
7586                     }
7587                 }
7588                 if (!internal_argval)
7589                     RExC_seen |= REG_SEEN_VERBARG;
7590             } else if ( start_arg ) {
7591                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7592                         verb_len, start_verb);
7593             } else {
7594                 ret = reg_node(pRExC_state, op);
7595             }
7596             nextchar(pRExC_state);
7597             return ret;
7598         } else
7599         if (*RExC_parse == '?') { /* (?...) */
7600             bool is_logical = 0;
7601             const char * const seqstart = RExC_parse;
7602             bool has_use_defaults = FALSE;
7603
7604             RExC_parse++;
7605             paren = *RExC_parse++;
7606             ret = NULL;                 /* For look-ahead/behind. */
7607             switch (paren) {
7608
7609             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7610                 paren = *RExC_parse++;
7611                 if ( paren == '<')         /* (?P<...>) named capture */
7612                     goto named_capture;
7613                 else if (paren == '>') {   /* (?P>name) named recursion */
7614                     goto named_recursion;
7615                 }
7616                 else if (paren == '=') {   /* (?P=...)  named backref */
7617                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7618                        you change this make sure you change that */
7619                     char* name_start = RExC_parse;
7620                     U32 num = 0;
7621                     SV *sv_dat = reg_scan_name(pRExC_state,
7622                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7623                     if (RExC_parse == name_start || *RExC_parse != ')')
7624                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7625
7626                     if (!SIZE_ONLY) {
7627                         num = add_data( pRExC_state, 1, "S" );
7628                         RExC_rxi->data->data[num]=(void*)sv_dat;
7629                         SvREFCNT_inc_simple_void(sv_dat);
7630                     }
7631                     RExC_sawback = 1;
7632                     ret = reganode(pRExC_state,
7633                                    ((! FOLD)
7634                                      ? NREF
7635                                      : (MORE_ASCII_RESTRICTED)
7636                                        ? NREFFA
7637                                        : (AT_LEAST_UNI_SEMANTICS)
7638                                          ? NREFFU
7639                                          : (LOC)
7640                                            ? NREFFL
7641                                            : NREFF),
7642                                     num);
7643                     *flagp |= HASWIDTH;
7644
7645                     Set_Node_Offset(ret, parse_start+1);
7646                     Set_Node_Cur_Length(ret); /* MJD */
7647
7648                     nextchar(pRExC_state);
7649                     return ret;
7650                 }
7651                 RExC_parse++;
7652                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7653                 /*NOTREACHED*/
7654             case '<':           /* (?<...) */
7655                 if (*RExC_parse == '!')
7656                     paren = ',';
7657                 else if (*RExC_parse != '=')
7658               named_capture:
7659                 {               /* (?<...>) */
7660                     char *name_start;
7661                     SV *svname;
7662                     paren= '>';
7663             case '\'':          /* (?'...') */
7664                     name_start= RExC_parse;
7665                     svname = reg_scan_name(pRExC_state,
7666                         SIZE_ONLY ?  /* reverse test from the others */
7667                         REG_RSN_RETURN_NAME :
7668                         REG_RSN_RETURN_NULL);
7669                     if (RExC_parse == name_start) {
7670                         RExC_parse++;
7671                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7672                         /*NOTREACHED*/
7673                     }
7674                     if (*RExC_parse != paren)
7675                         vFAIL2("Sequence (?%c... not terminated",
7676                             paren=='>' ? '<' : paren);
7677                     if (SIZE_ONLY) {
7678                         HE *he_str;
7679                         SV *sv_dat = NULL;
7680                         if (!svname) /* shouldn't happen */
7681                             Perl_croak(aTHX_
7682                                 "panic: reg_scan_name returned NULL");
7683                         if (!RExC_paren_names) {
7684                             RExC_paren_names= newHV();
7685                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7686 #ifdef DEBUGGING
7687                             RExC_paren_name_list= newAV();
7688                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7689 #endif
7690                         }
7691                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7692                         if ( he_str )
7693                             sv_dat = HeVAL(he_str);
7694                         if ( ! sv_dat ) {
7695                             /* croak baby croak */
7696                             Perl_croak(aTHX_
7697                                 "panic: paren_name hash element allocation failed");
7698                         } else if ( SvPOK(sv_dat) ) {
7699                             /* (?|...) can mean we have dupes so scan to check
7700                                its already been stored. Maybe a flag indicating
7701                                we are inside such a construct would be useful,
7702                                but the arrays are likely to be quite small, so
7703                                for now we punt -- dmq */
7704                             IV count = SvIV(sv_dat);
7705                             I32 *pv = (I32*)SvPVX(sv_dat);
7706                             IV i;
7707                             for ( i = 0 ; i < count ; i++ ) {
7708                                 if ( pv[i] == RExC_npar ) {
7709                                     count = 0;
7710                                     break;
7711                                 }
7712                             }
7713                             if ( count ) {
7714                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7715                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7716                                 pv[count] = RExC_npar;
7717                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7718                             }
7719                         } else {
7720                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7721                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7722                             SvIOK_on(sv_dat);
7723                             SvIV_set(sv_dat, 1);
7724                         }
7725 #ifdef DEBUGGING
7726                         /* Yes this does cause a memory leak in debugging Perls */
7727                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7728                             SvREFCNT_dec(svname);
7729 #endif
7730
7731                         /*sv_dump(sv_dat);*/
7732                     }
7733                     nextchar(pRExC_state);
7734                     paren = 1;
7735                     goto capturing_parens;
7736                 }
7737                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7738                 RExC_in_lookbehind++;
7739                 RExC_parse++;
7740             case '=':           /* (?=...) */
7741                 RExC_seen_zerolen++;
7742                 break;
7743             case '!':           /* (?!...) */
7744                 RExC_seen_zerolen++;
7745                 if (*RExC_parse == ')') {
7746                     ret=reg_node(pRExC_state, OPFAIL);
7747                     nextchar(pRExC_state);
7748                     return ret;
7749                 }
7750                 break;
7751             case '|':           /* (?|...) */
7752                 /* branch reset, behave like a (?:...) except that
7753                    buffers in alternations share the same numbers */
7754                 paren = ':';
7755                 after_freeze = freeze_paren = RExC_npar;
7756                 break;
7757             case ':':           /* (?:...) */
7758             case '>':           /* (?>...) */
7759                 break;
7760             case '$':           /* (?$...) */
7761             case '@':           /* (?@...) */
7762                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7763                 break;
7764             case '#':           /* (?#...) */
7765                 while (*RExC_parse && *RExC_parse != ')')
7766                     RExC_parse++;
7767                 if (*RExC_parse != ')')
7768                     FAIL("Sequence (?#... not terminated");
7769                 nextchar(pRExC_state);
7770                 *flagp = TRYAGAIN;
7771                 return NULL;
7772             case '0' :           /* (?0) */
7773             case 'R' :           /* (?R) */
7774                 if (*RExC_parse != ')')
7775                     FAIL("Sequence (?R) not terminated");
7776                 ret = reg_node(pRExC_state, GOSTART);
7777                 *flagp |= POSTPONED;
7778                 nextchar(pRExC_state);
7779                 return ret;
7780                 /*notreached*/
7781             { /* named and numeric backreferences */
7782                 I32 num;
7783             case '&':            /* (?&NAME) */
7784                 parse_start = RExC_parse - 1;
7785               named_recursion:
7786                 {
7787                     SV *sv_dat = reg_scan_name(pRExC_state,
7788                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7789                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7790                 }
7791                 goto gen_recurse_regop;
7792                 /* NOT REACHED */
7793             case '+':
7794                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7795                     RExC_parse++;
7796                     vFAIL("Illegal pattern");
7797                 }
7798                 goto parse_recursion;
7799                 /* NOT REACHED*/
7800             case '-': /* (?-1) */
7801                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7802                     RExC_parse--; /* rewind to let it be handled later */
7803                     goto parse_flags;
7804                 }
7805                 /*FALLTHROUGH */
7806             case '1': case '2': case '3': case '4': /* (?1) */
7807             case '5': case '6': case '7': case '8': case '9':
7808                 RExC_parse--;
7809               parse_recursion:
7810                 num = atoi(RExC_parse);
7811                 parse_start = RExC_parse - 1; /* MJD */
7812                 if (*RExC_parse == '-')
7813                     RExC_parse++;
7814                 while (isDIGIT(*RExC_parse))
7815                         RExC_parse++;
7816                 if (*RExC_parse!=')')
7817                     vFAIL("Expecting close bracket");
7818
7819               gen_recurse_regop:
7820                 if ( paren == '-' ) {
7821                     /*
7822                     Diagram of capture buffer numbering.
7823                     Top line is the normal capture buffer numbers
7824                     Bottom line is the negative indexing as from
7825                     the X (the (?-2))
7826
7827                     +   1 2    3 4 5 X          6 7
7828                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7829                     -   5 4    3 2 1 X          x x
7830
7831                     */
7832                     num = RExC_npar + num;
7833                     if (num < 1)  {
7834                         RExC_parse++;
7835                         vFAIL("Reference to nonexistent group");
7836                     }
7837                 } else if ( paren == '+' ) {
7838                     num = RExC_npar + num - 1;
7839                 }
7840
7841                 ret = reganode(pRExC_state, GOSUB, num);
7842                 if (!SIZE_ONLY) {
7843                     if (num > (I32)RExC_rx->nparens) {
7844                         RExC_parse++;
7845                         vFAIL("Reference to nonexistent group");
7846                     }
7847                     ARG2L_SET( ret, RExC_recurse_count++);
7848                     RExC_emit++;
7849                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7850                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7851                 } else {
7852                     RExC_size++;
7853                 }
7854                 RExC_seen |= REG_SEEN_RECURSE;
7855                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7856                 Set_Node_Offset(ret, parse_start); /* MJD */
7857
7858                 *flagp |= POSTPONED;
7859                 nextchar(pRExC_state);
7860                 return ret;
7861             } /* named and numeric backreferences */
7862             /* NOT REACHED */
7863
7864             case '?':           /* (??...) */
7865                 is_logical = 1;
7866                 if (*RExC_parse != '{') {
7867                     RExC_parse++;
7868                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7869                     /*NOTREACHED*/
7870                 }
7871                 *flagp |= POSTPONED;
7872                 paren = *RExC_parse++;
7873                 /* FALL THROUGH */
7874             case '{':           /* (?{...}) */
7875             {
7876                 I32 count = 1;
7877                 U32 n = 0;
7878                 char c;
7879                 char *s = RExC_parse;
7880
7881                 RExC_seen_zerolen++;
7882                 RExC_seen |= REG_SEEN_EVAL;
7883                 while (count && (c = *RExC_parse)) {
7884                     if (c == '\\') {
7885                         if (RExC_parse[1])
7886                             RExC_parse++;
7887                     }
7888                     else if (c == '{')
7889                         count++;
7890                     else if (c == '}')
7891                         count--;
7892                     RExC_parse++;
7893                 }
7894                 if (*RExC_parse != ')') {
7895                     RExC_parse = s;
7896                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7897                 }
7898                 if (!SIZE_ONLY) {
7899                     PAD *pad;
7900                     OP_4tree *sop, *rop;
7901                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7902
7903                     ENTER;
7904                     Perl_save_re_context(aTHX);
7905                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7906                     sop->op_private |= OPpREFCOUNTED;
7907                     /* re_dup will OpREFCNT_inc */
7908                     OpREFCNT_set(sop, 1);
7909                     LEAVE;
7910
7911                     n = add_data(pRExC_state, 3, "nop");
7912                     RExC_rxi->data->data[n] = (void*)rop;
7913                     RExC_rxi->data->data[n+1] = (void*)sop;
7914                     RExC_rxi->data->data[n+2] = (void*)pad;
7915                     SvREFCNT_dec(sv);
7916                 }
7917                 else {                                          /* First pass */
7918                     if (PL_reginterp_cnt < ++RExC_seen_evals
7919                         && IN_PERL_RUNTIME)
7920                         /* No compiled RE interpolated, has runtime
7921                            components ===> unsafe.  */
7922                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7923                     if (PL_tainting && PL_tainted)
7924                         FAIL("Eval-group in insecure regular expression");
7925 #if PERL_VERSION > 8
7926                     if (IN_PERL_COMPILETIME)
7927                         PL_cv_has_eval = 1;
7928 #endif
7929                 }
7930
7931                 nextchar(pRExC_state);
7932                 if (is_logical) {
7933                     ret = reg_node(pRExC_state, LOGICAL);
7934                     if (!SIZE_ONLY)
7935                         ret->flags = 2;
7936                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7937                     /* deal with the length of this later - MJD */
7938                     return ret;
7939                 }
7940                 ret = reganode(pRExC_state, EVAL, n);
7941                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7942                 Set_Node_Offset(ret, parse_start);
7943                 return ret;
7944             }
7945             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7946             {
7947                 int is_define= 0;
7948                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7949                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7950                         || RExC_parse[1] == '<'
7951                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7952                         I32 flag;
7953
7954                         ret = reg_node(pRExC_state, LOGICAL);
7955                         if (!SIZE_ONLY)
7956                             ret->flags = 1;
7957                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7958                         goto insert_if;
7959                     }
7960                 }
7961                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7962                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7963                 {
7964                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7965                     char *name_start= RExC_parse++;
7966                     U32 num = 0;
7967                     SV *sv_dat=reg_scan_name(pRExC_state,
7968                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7969                     if (RExC_parse == name_start || *RExC_parse != ch)
7970                         vFAIL2("Sequence (?(%c... not terminated",
7971                             (ch == '>' ? '<' : ch));
7972                     RExC_parse++;
7973                     if (!SIZE_ONLY) {
7974                         num = add_data( pRExC_state, 1, "S" );
7975                         RExC_rxi->data->data[num]=(void*)sv_dat;
7976                         SvREFCNT_inc_simple_void(sv_dat);
7977                     }
7978                     ret = reganode(pRExC_state,NGROUPP,num);
7979                     goto insert_if_check_paren;
7980                 }
7981                 else if (RExC_parse[0] == 'D' &&
7982                          RExC_parse[1] == 'E' &&
7983                          RExC_parse[2] == 'F' &&
7984                          RExC_parse[3] == 'I' &&
7985                          RExC_parse[4] == 'N' &&
7986                          RExC_parse[5] == 'E')
7987                 {
7988                     ret = reganode(pRExC_state,DEFINEP,0);
7989                     RExC_parse +=6 ;
7990                     is_define = 1;
7991                     goto insert_if_check_paren;
7992                 }
7993                 else if (RExC_parse[0] == 'R') {
7994                     RExC_parse++;
7995                     parno = 0;
7996                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7997                         parno = atoi(RExC_parse++);
7998                         while (isDIGIT(*RExC_parse))
7999                             RExC_parse++;
8000                     } else if (RExC_parse[0] == '&') {
8001                         SV *sv_dat;
8002                         RExC_parse++;
8003                         sv_dat = reg_scan_name(pRExC_state,
8004                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8005                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8006                     }
8007                     ret = reganode(pRExC_state,INSUBP,parno);
8008                     goto insert_if_check_paren;
8009                 }
8010                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8011                     /* (?(1)...) */
8012                     char c;
8013                     parno = atoi(RExC_parse++);
8014
8015                     while (isDIGIT(*RExC_parse))
8016                         RExC_parse++;
8017                     ret = reganode(pRExC_state, GROUPP, parno);
8018
8019                  insert_if_check_paren:
8020                     if ((c = *nextchar(pRExC_state)) != ')')
8021                         vFAIL("Switch condition not recognized");
8022                   insert_if:
8023                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8024                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8025                     if (br == NULL)
8026                         br = reganode(pRExC_state, LONGJMP, 0);
8027                     else
8028                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8029                     c = *nextchar(pRExC_state);
8030                     if (flags&HASWIDTH)
8031                         *flagp |= HASWIDTH;
8032                     if (c == '|') {
8033                         if (is_define)
8034                             vFAIL("(?(DEFINE)....) does not allow branches");
8035                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8036                         regbranch(pRExC_state, &flags, 1,depth+1);
8037                         REGTAIL(pRExC_state, ret, lastbr);
8038                         if (flags&HASWIDTH)
8039                             *flagp |= HASWIDTH;
8040                         c = *nextchar(pRExC_state);
8041                     }
8042                     else
8043                         lastbr = NULL;
8044                     if (c != ')')
8045                         vFAIL("Switch (?(condition)... contains too many branches");
8046                     ender = reg_node(pRExC_state, TAIL);
8047                     REGTAIL(pRExC_state, br, ender);
8048                     if (lastbr) {
8049                         REGTAIL(pRExC_state, lastbr, ender);
8050                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8051                     }
8052                     else
8053                         REGTAIL(pRExC_state, ret, ender);
8054                     RExC_size++; /* XXX WHY do we need this?!!
8055                                     For large programs it seems to be required
8056                                     but I can't figure out why. -- dmq*/
8057                     return ret;
8058                 }
8059                 else {
8060                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8061                 }
8062             }
8063             case 0:
8064                 RExC_parse--; /* for vFAIL to print correctly */
8065                 vFAIL("Sequence (? incomplete");
8066                 break;
8067             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8068                                        that follow */
8069                 has_use_defaults = TRUE;
8070                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8071                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8072                                                 ? REGEX_UNICODE_CHARSET
8073                                                 : REGEX_DEPENDS_CHARSET);
8074                 goto parse_flags;
8075             default:
8076                 --RExC_parse;
8077                 parse_flags:      /* (?i) */
8078             {
8079                 U32 posflags = 0, negflags = 0;
8080                 U32 *flagsp = &posflags;
8081                 char has_charset_modifier = '\0';
8082                 regex_charset cs = get_regex_charset(RExC_flags);
8083                 if (cs == REGEX_DEPENDS_CHARSET
8084                     && (RExC_utf8 || RExC_uni_semantics))
8085                 {
8086                     cs = REGEX_UNICODE_CHARSET;
8087                 }
8088
8089                 while (*RExC_parse) {
8090                     /* && strchr("iogcmsx", *RExC_parse) */
8091                     /* (?g), (?gc) and (?o) are useless here
8092                        and must be globally applied -- japhy */
8093                     switch (*RExC_parse) {
8094                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8095                     case LOCALE_PAT_MOD:
8096                         if (has_charset_modifier) {
8097                             goto excess_modifier;
8098                         }
8099                         else if (flagsp == &negflags) {
8100                             goto neg_modifier;
8101                         }
8102                         cs = REGEX_LOCALE_CHARSET;
8103                         has_charset_modifier = LOCALE_PAT_MOD;
8104                         RExC_contains_locale = 1;
8105                         break;
8106                     case UNICODE_PAT_MOD:
8107                         if (has_charset_modifier) {
8108                             goto excess_modifier;
8109                         }
8110                         else if (flagsp == &negflags) {
8111                             goto neg_modifier;
8112                         }
8113                         cs = REGEX_UNICODE_CHARSET;
8114                         has_charset_modifier = UNICODE_PAT_MOD;
8115                         break;
8116                     case ASCII_RESTRICT_PAT_MOD:
8117                         if (flagsp == &negflags) {
8118                             goto neg_modifier;
8119                         }
8120                         if (has_charset_modifier) {
8121                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8122                                 goto excess_modifier;
8123                             }
8124                             /* Doubled modifier implies more restricted */
8125                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8126                         }
8127                         else {
8128                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8129                         }
8130                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8131                         break;
8132                     case DEPENDS_PAT_MOD:
8133                         if (has_use_defaults) {
8134                             goto fail_modifiers;
8135                         }
8136                         else if (flagsp == &negflags) {
8137                             goto neg_modifier;
8138                         }
8139                         else if (has_charset_modifier) {
8140                             goto excess_modifier;
8141                         }
8142
8143                         /* The dual charset means unicode semantics if the
8144                          * pattern (or target, not known until runtime) are
8145                          * utf8, or something in the pattern indicates unicode
8146                          * semantics */
8147                         cs = (RExC_utf8 || RExC_uni_semantics)
8148                              ? REGEX_UNICODE_CHARSET
8149                              : REGEX_DEPENDS_CHARSET;
8150                         has_charset_modifier = DEPENDS_PAT_MOD;
8151                         break;
8152                     excess_modifier:
8153                         RExC_parse++;
8154                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8155                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8156                         }
8157                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8158                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8159                         }
8160                         else {
8161                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8162                         }
8163                         /*NOTREACHED*/
8164                     neg_modifier:
8165                         RExC_parse++;
8166                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8167                         /*NOTREACHED*/
8168                     case ONCE_PAT_MOD: /* 'o' */
8169                     case GLOBAL_PAT_MOD: /* 'g' */
8170                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8171                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8172                             if (! (wastedflags & wflagbit) ) {
8173                                 wastedflags |= wflagbit;
8174                                 vWARN5(
8175                                     RExC_parse + 1,
8176                                     "Useless (%s%c) - %suse /%c modifier",
8177                                     flagsp == &negflags ? "?-" : "?",
8178                                     *RExC_parse,
8179                                     flagsp == &negflags ? "don't " : "",
8180                                     *RExC_parse
8181                                 );
8182                             }
8183                         }
8184                         break;
8185
8186                     case CONTINUE_PAT_MOD: /* 'c' */
8187                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8188                             if (! (wastedflags & WASTED_C) ) {
8189                                 wastedflags |= WASTED_GC;
8190                                 vWARN3(
8191                                     RExC_parse + 1,
8192                                     "Useless (%sc) - %suse /gc modifier",
8193                                     flagsp == &negflags ? "?-" : "?",
8194                                     flagsp == &negflags ? "don't " : ""
8195                                 );
8196                             }
8197                         }
8198                         break;
8199                     case KEEPCOPY_PAT_MOD: /* 'p' */
8200                         if (flagsp == &negflags) {
8201                             if (SIZE_ONLY)
8202                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8203                         } else {
8204                             *flagsp |= RXf_PMf_KEEPCOPY;
8205                         }
8206                         break;
8207                     case '-':
8208                         /* A flag is a default iff it is following a minus, so
8209                          * if there is a minus, it means will be trying to
8210                          * re-specify a default which is an error */
8211                         if (has_use_defaults || flagsp == &negflags) {
8212             fail_modifiers:
8213                             RExC_parse++;
8214                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8215                             /*NOTREACHED*/
8216                         }
8217                         flagsp = &negflags;
8218                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8219                         break;
8220                     case ':':
8221                         paren = ':';
8222                         /*FALLTHROUGH*/
8223                     case ')':
8224                         RExC_flags |= posflags;
8225                         RExC_flags &= ~negflags;
8226                         set_regex_charset(&RExC_flags, cs);
8227                         if (paren != ':') {
8228                             oregflags |= posflags;
8229                             oregflags &= ~negflags;
8230                             set_regex_charset(&oregflags, cs);
8231                         }
8232                         nextchar(pRExC_state);
8233                         if (paren != ':') {
8234                             *flagp = TRYAGAIN;
8235                             return NULL;
8236                         } else {
8237                             ret = NULL;
8238                             goto parse_rest;
8239                         }
8240                         /*NOTREACHED*/
8241                     default:
8242                         RExC_parse++;
8243                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8244                         /*NOTREACHED*/
8245                     }
8246                     ++RExC_parse;
8247                 }
8248             }} /* one for the default block, one for the switch */
8249         }
8250         else {                  /* (...) */
8251           capturing_parens:
8252             parno = RExC_npar;
8253             RExC_npar++;
8254
8255             ret = reganode(pRExC_state, OPEN, parno);
8256             if (!SIZE_ONLY ){
8257                 if (!RExC_nestroot)
8258                     RExC_nestroot = parno;
8259                 if (RExC_seen & REG_SEEN_RECURSE
8260                     && !RExC_open_parens[parno-1])
8261                 {
8262                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8263                         "Setting open paren #%"IVdf" to %d\n",
8264                         (IV)parno, REG_NODE_NUM(ret)));
8265                     RExC_open_parens[parno-1]= ret;
8266                 }
8267             }
8268             Set_Node_Length(ret, 1); /* MJD */
8269             Set_Node_Offset(ret, RExC_parse); /* MJD */
8270             is_open = 1;
8271         }
8272     }
8273     else                        /* ! paren */
8274         ret = NULL;
8275
8276    parse_rest:
8277     /* Pick up the branches, linking them together. */
8278     parse_start = RExC_parse;   /* MJD */
8279     br = regbranch(pRExC_state, &flags, 1,depth+1);
8280
8281     /*     branch_len = (paren != 0); */
8282
8283     if (br == NULL)
8284         return(NULL);
8285     if (*RExC_parse == '|') {
8286         if (!SIZE_ONLY && RExC_extralen) {
8287             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8288         }
8289         else {                  /* MJD */
8290             reginsert(pRExC_state, BRANCH, br, depth+1);
8291             Set_Node_Length(br, paren != 0);
8292             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8293         }
8294         have_branch = 1;
8295         if (SIZE_ONLY)
8296             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8297     }
8298     else if (paren == ':') {
8299         *flagp |= flags&SIMPLE;
8300     }
8301     if (is_open) {                              /* Starts with OPEN. */
8302         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8303     }
8304     else if (paren != '?')              /* Not Conditional */
8305         ret = br;
8306     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8307     lastbr = br;
8308     while (*RExC_parse == '|') {
8309         if (!SIZE_ONLY && RExC_extralen) {
8310             ender = reganode(pRExC_state, LONGJMP,0);
8311             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8312         }
8313         if (SIZE_ONLY)
8314             RExC_extralen += 2;         /* Account for LONGJMP. */
8315         nextchar(pRExC_state);
8316         if (freeze_paren) {
8317             if (RExC_npar > after_freeze)
8318                 after_freeze = RExC_npar;
8319             RExC_npar = freeze_paren;
8320         }
8321         br = regbranch(pRExC_state, &flags, 0, depth+1);
8322
8323         if (br == NULL)
8324             return(NULL);
8325         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8326         lastbr = br;
8327         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8328     }
8329
8330     if (have_branch || paren != ':') {
8331         /* Make a closing node, and hook it on the end. */
8332         switch (paren) {
8333         case ':':
8334             ender = reg_node(pRExC_state, TAIL);
8335             break;
8336         case 1:
8337             ender = reganode(pRExC_state, CLOSE, parno);
8338             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8339                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8340                         "Setting close paren #%"IVdf" to %d\n",
8341                         (IV)parno, REG_NODE_NUM(ender)));
8342                 RExC_close_parens[parno-1]= ender;
8343                 if (RExC_nestroot == parno)
8344                     RExC_nestroot = 0;
8345             }
8346             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8347             Set_Node_Length(ender,1); /* MJD */
8348             break;
8349         case '<':
8350         case ',':
8351         case '=':
8352         case '!':
8353             *flagp &= ~HASWIDTH;
8354             /* FALL THROUGH */
8355         case '>':
8356             ender = reg_node(pRExC_state, SUCCEED);
8357             break;
8358         case 0:
8359             ender = reg_node(pRExC_state, END);
8360             if (!SIZE_ONLY) {
8361                 assert(!RExC_opend); /* there can only be one! */
8362                 RExC_opend = ender;
8363             }
8364             break;
8365         }
8366         REGTAIL(pRExC_state, lastbr, ender);
8367
8368         if (have_branch && !SIZE_ONLY) {
8369             if (depth==1)
8370                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8371
8372             /* Hook the tails of the branches to the closing node. */
8373             for (br = ret; br; br = regnext(br)) {
8374                 const U8 op = PL_regkind[OP(br)];
8375                 if (op == BRANCH) {
8376                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8377                 }
8378                 else if (op == BRANCHJ) {
8379                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8380                 }
8381             }
8382         }
8383     }
8384
8385     {
8386         const char *p;
8387         static const char parens[] = "=!<,>";
8388
8389         if (paren && (p = strchr(parens, paren))) {
8390             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8391             int flag = (p - parens) > 1;
8392
8393             if (paren == '>')
8394                 node = SUSPEND, flag = 0;
8395             reginsert(pRExC_state, node,ret, depth+1);
8396             Set_Node_Cur_Length(ret);
8397             Set_Node_Offset(ret, parse_start + 1);
8398             ret->flags = flag;
8399             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8400         }
8401     }
8402
8403     /* Check for proper termination. */
8404     if (paren) {
8405         RExC_flags = oregflags;
8406         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8407             RExC_parse = oregcomp_parse;
8408             vFAIL("Unmatched (");
8409         }
8410     }
8411     else if (!paren && RExC_parse < RExC_end) {
8412         if (*RExC_parse == ')') {
8413             RExC_parse++;
8414             vFAIL("Unmatched )");
8415         }
8416         else
8417             FAIL("Junk on end of regexp");      /* "Can't happen". */
8418         /* NOTREACHED */
8419     }
8420
8421     if (RExC_in_lookbehind) {
8422         RExC_in_lookbehind--;
8423     }
8424     if (after_freeze > RExC_npar)
8425         RExC_npar = after_freeze;
8426     return(ret);
8427 }
8428
8429 /*
8430  - regbranch - one alternative of an | operator
8431  *
8432  * Implements the concatenation operator.
8433  */
8434 STATIC regnode *
8435 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8436 {
8437     dVAR;
8438     register regnode *ret;
8439     register regnode *chain = NULL;
8440     register regnode *latest;
8441     I32 flags = 0, c = 0;
8442     GET_RE_DEBUG_FLAGS_DECL;
8443
8444     PERL_ARGS_ASSERT_REGBRANCH;
8445
8446     DEBUG_PARSE("brnc");
8447
8448     if (first)
8449         ret = NULL;
8450     else {
8451         if (!SIZE_ONLY && RExC_extralen)
8452             ret = reganode(pRExC_state, BRANCHJ,0);
8453         else {
8454             ret = reg_node(pRExC_state, BRANCH);
8455             Set_Node_Length(ret, 1);
8456         }
8457     }
8458
8459     if (!first && SIZE_ONLY)
8460         RExC_extralen += 1;                     /* BRANCHJ */
8461
8462     *flagp = WORST;                     /* Tentatively. */
8463
8464     RExC_parse--;
8465     nextchar(pRExC_state);
8466     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8467         flags &= ~TRYAGAIN;
8468         latest = regpiece(pRExC_state, &flags,depth+1);
8469         if (latest == NULL) {
8470             if (flags & TRYAGAIN)
8471                 continue;
8472             return(NULL);
8473         }
8474         else if (ret == NULL)
8475             ret = latest;
8476         *flagp |= flags&(HASWIDTH|POSTPONED);
8477         if (chain == NULL)      /* First piece. */
8478             *flagp |= flags&SPSTART;
8479         else {
8480             RExC_naughty++;
8481             REGTAIL(pRExC_state, chain, latest);
8482         }
8483         chain = latest;
8484         c++;
8485     }
8486     if (chain == NULL) {        /* Loop ran zero times. */
8487         chain = reg_node(pRExC_state, NOTHING);
8488         if (ret == NULL)
8489             ret = chain;
8490     }
8491     if (c == 1) {
8492         *flagp |= flags&SIMPLE;
8493     }
8494
8495     return ret;
8496 }
8497
8498 /*
8499  - regpiece - something followed by possible [*+?]
8500  *
8501  * Note that the branching code sequences used for ? and the general cases
8502  * of * and + are somewhat optimized:  they use the same NOTHING node as
8503  * both the endmarker for their branch list and the body of the last branch.
8504  * It might seem that this node could be dispensed with entirely, but the
8505  * endmarker role is not redundant.
8506  */
8507 STATIC regnode *
8508 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8509 {
8510     dVAR;
8511     register regnode *ret;
8512     register char op;
8513     register char *next;
8514     I32 flags;
8515     const char * const origparse = RExC_parse;
8516     I32 min;
8517     I32 max = REG_INFTY;
8518 #ifdef RE_TRACK_PATTERN_OFFSETS
8519     char *parse_start;
8520 #endif
8521     const char *maxpos = NULL;
8522     GET_RE_DEBUG_FLAGS_DECL;
8523
8524     PERL_ARGS_ASSERT_REGPIECE;
8525
8526     DEBUG_PARSE("piec");
8527
8528     ret = regatom(pRExC_state, &flags,depth+1);
8529     if (ret == NULL) {
8530         if (flags & TRYAGAIN)
8531             *flagp |= TRYAGAIN;
8532         return(NULL);
8533     }
8534
8535     op = *RExC_parse;
8536
8537     if (op == '{' && regcurly(RExC_parse)) {
8538         maxpos = NULL;
8539 #ifdef RE_TRACK_PATTERN_OFFSETS
8540         parse_start = RExC_parse; /* MJD */
8541 #endif
8542         next = RExC_parse + 1;
8543         while (isDIGIT(*next) || *next == ',') {
8544             if (*next == ',') {
8545                 if (maxpos)
8546                     break;
8547                 else
8548                     maxpos = next;
8549             }
8550             next++;
8551         }
8552         if (*next == '}') {             /* got one */
8553             if (!maxpos)
8554                 maxpos = next;
8555             RExC_parse++;
8556             min = atoi(RExC_parse);
8557             if (*maxpos == ',')
8558                 maxpos++;
8559             else
8560                 maxpos = RExC_parse;
8561             max = atoi(maxpos);
8562             if (!max && *maxpos != '0')
8563                 max = REG_INFTY;                /* meaning "infinity" */
8564             else if (max >= REG_INFTY)
8565                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8566             RExC_parse = next;
8567             nextchar(pRExC_state);
8568
8569         do_curly:
8570             if ((flags&SIMPLE)) {
8571                 RExC_naughty += 2 + RExC_naughty / 2;
8572                 reginsert(pRExC_state, CURLY, ret, depth+1);
8573                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8574                 Set_Node_Cur_Length(ret);
8575             }
8576             else {
8577                 regnode * const w = reg_node(pRExC_state, WHILEM);
8578
8579                 w->flags = 0;
8580                 REGTAIL(pRExC_state, ret, w);
8581                 if (!SIZE_ONLY && RExC_extralen) {
8582                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8583                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8584                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8585                 }
8586                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8587                                 /* MJD hk */
8588                 Set_Node_Offset(ret, parse_start+1);
8589                 Set_Node_Length(ret,
8590                                 op == '{' ? (RExC_parse - parse_start) : 1);
8591
8592                 if (!SIZE_ONLY && RExC_extralen)
8593                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8594                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8595                 if (SIZE_ONLY)
8596                     RExC_whilem_seen++, RExC_extralen += 3;
8597                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8598             }
8599             ret->flags = 0;
8600
8601             if (min > 0)
8602                 *flagp = WORST;
8603             if (max > 0)
8604                 *flagp |= HASWIDTH;
8605             if (max < min)
8606                 vFAIL("Can't do {n,m} with n > m");
8607             if (!SIZE_ONLY) {
8608                 ARG1_SET(ret, (U16)min);
8609                 ARG2_SET(ret, (U16)max);
8610             }
8611
8612             goto nest_check;
8613         }
8614     }
8615
8616     if (!ISMULT1(op)) {
8617         *flagp = flags;
8618         return(ret);
8619     }
8620
8621 #if 0                           /* Now runtime fix should be reliable. */
8622
8623     /* if this is reinstated, don't forget to put this back into perldiag:
8624
8625             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8626
8627            (F) The part of the regexp subject to either the * or + quantifier
8628            could match an empty string. The {#} shows in the regular
8629            expression about where the problem was discovered.
8630
8631     */
8632
8633     if (!(flags&HASWIDTH) && op != '?')
8634       vFAIL("Regexp *+ operand could be empty");
8635 #endif
8636
8637 #ifdef RE_TRACK_PATTERN_OFFSETS
8638     parse_start = RExC_parse;
8639 #endif
8640     nextchar(pRExC_state);
8641
8642     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8643
8644     if (op == '*' && (flags&SIMPLE)) {
8645         reginsert(pRExC_state, STAR, ret, depth+1);
8646         ret->flags = 0;
8647         RExC_naughty += 4;
8648     }
8649     else if (op == '*') {
8650         min = 0;
8651         goto do_curly;
8652     }
8653     else if (op == '+' && (flags&SIMPLE)) {
8654         reginsert(pRExC_state, PLUS, ret, depth+1);
8655         ret->flags = 0;
8656         RExC_naughty += 3;
8657     }
8658     else if (op == '+') {
8659         min = 1;
8660         goto do_curly;
8661     }
8662     else if (op == '?') {
8663         min = 0; max = 1;
8664         goto do_curly;
8665     }
8666   nest_check:
8667     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8668         ckWARN3reg(RExC_parse,
8669                    "%.*s matches null string many times",
8670                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8671                    origparse);
8672     }
8673
8674     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8675         nextchar(pRExC_state);
8676         reginsert(pRExC_state, MINMOD, ret, depth+1);
8677         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8678     }
8679 #ifndef REG_ALLOW_MINMOD_SUSPEND
8680     else
8681 #endif
8682     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8683         regnode *ender;
8684         nextchar(pRExC_state);
8685         ender = reg_node(pRExC_state, SUCCEED);
8686         REGTAIL(pRExC_state, ret, ender);
8687         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8688         ret->flags = 0;
8689         ender = reg_node(pRExC_state, TAIL);
8690         REGTAIL(pRExC_state, ret, ender);
8691         /*ret= ender;*/
8692     }
8693
8694     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8695         RExC_parse++;
8696         vFAIL("Nested quantifiers");
8697     }
8698
8699     return(ret);
8700 }
8701
8702
8703 /* reg_namedseq(pRExC_state,UVp, UV depth)
8704
8705    This is expected to be called by a parser routine that has
8706    recognized '\N' and needs to handle the rest. RExC_parse is
8707    expected to point at the first char following the N at the time
8708    of the call.
8709
8710    The \N may be inside (indicated by valuep not being NULL) or outside a
8711    character class.
8712
8713    \N may begin either a named sequence, or if outside a character class, mean
8714    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8715    attempted to decide which, and in the case of a named sequence converted it
8716    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8717    where c1... are the characters in the sequence.  For single-quoted regexes,
8718    the tokenizer passes the \N sequence through unchanged; this code will not
8719    attempt to determine this nor expand those.  The net effect is that if the
8720    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8721    signals that this \N occurrence means to match a non-newline.
8722
8723    Only the \N{U+...} form should occur in a character class, for the same
8724    reason that '.' inside a character class means to just match a period: it
8725    just doesn't make sense.
8726
8727    If valuep is non-null then it is assumed that we are parsing inside
8728    of a charclass definition and the first codepoint in the resolved
8729    string is returned via *valuep and the routine will return NULL.
8730    In this mode if a multichar string is returned from the charnames
8731    handler, a warning will be issued, and only the first char in the
8732    sequence will be examined. If the string returned is zero length
8733    then the value of *valuep is undefined and NON-NULL will
8734    be returned to indicate failure. (This will NOT be a valid pointer
8735    to a regnode.)
8736
8737    If valuep is null then it is assumed that we are parsing normal text and a
8738    new EXACT node is inserted into the program containing the resolved string,
8739    and a pointer to the new node is returned.  But if the string is zero length
8740    a NOTHING node is emitted instead.
8741
8742    On success RExC_parse is set to the char following the endbrace.
8743    Parsing failures will generate a fatal error via vFAIL(...)
8744  */
8745 STATIC regnode *
8746 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8747 {
8748     char * endbrace;    /* '}' following the name */
8749     regnode *ret = NULL;
8750     char* p;
8751
8752     GET_RE_DEBUG_FLAGS_DECL;
8753
8754     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8755
8756     GET_RE_DEBUG_FLAGS;
8757
8758     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8759      * modifier.  The other meaning does not */
8760     p = (RExC_flags & RXf_PMf_EXTENDED)
8761         ? regwhite( pRExC_state, RExC_parse )
8762         : RExC_parse;
8763
8764     /* Disambiguate between \N meaning a named character versus \N meaning
8765      * [^\n].  The former is assumed when it can't be the latter. */
8766     if (*p != '{' || regcurly(p)) {
8767         RExC_parse = p;
8768         if (valuep) {
8769             /* no bare \N in a charclass */
8770             vFAIL("\\N in a character class must be a named character: \\N{...}");
8771         }
8772         nextchar(pRExC_state);
8773         ret = reg_node(pRExC_state, REG_ANY);
8774         *flagp |= HASWIDTH|SIMPLE;
8775         RExC_naughty++;
8776         RExC_parse--;
8777         Set_Node_Length(ret, 1); /* MJD */
8778         return ret;
8779     }
8780
8781     /* Here, we have decided it should be a named sequence */
8782
8783     /* The test above made sure that the next real character is a '{', but
8784      * under the /x modifier, it could be separated by space (or a comment and
8785      * \n) and this is not allowed (for consistency with \x{...} and the
8786      * tokenizer handling of \N{NAME}). */
8787     if (*RExC_parse != '{') {
8788         vFAIL("Missing braces on \\N{}");
8789     }
8790
8791     RExC_parse++;       /* Skip past the '{' */
8792
8793     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8794         || ! (endbrace == RExC_parse            /* nothing between the {} */
8795               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8796                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8797     {
8798         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8799         vFAIL("\\N{NAME} must be resolved by the lexer");
8800     }
8801
8802     if (endbrace == RExC_parse) {   /* empty: \N{} */
8803         if (! valuep) {
8804             RExC_parse = endbrace + 1;
8805             return reg_node(pRExC_state,NOTHING);
8806         }
8807
8808         if (SIZE_ONLY) {
8809             ckWARNreg(RExC_parse,
8810                     "Ignoring zero length \\N{} in character class"
8811             );
8812             RExC_parse = endbrace + 1;
8813         }
8814         *valuep = 0;
8815         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8816     }
8817
8818     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8819     RExC_parse += 2;    /* Skip past the 'U+' */
8820
8821     if (valuep) {   /* In a bracketed char class */
8822         /* We only pay attention to the first char of
8823         multichar strings being returned. I kinda wonder
8824         if this makes sense as it does change the behaviour
8825         from earlier versions, OTOH that behaviour was broken
8826         as well. XXX Solution is to recharacterize as
8827         [rest-of-class]|multi1|multi2... */
8828
8829         STRLEN length_of_hex;
8830         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8831             | PERL_SCAN_DISALLOW_PREFIX
8832             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8833
8834         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8835         if (endchar < endbrace) {
8836             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8837         }
8838
8839         length_of_hex = (STRLEN)(endchar - RExC_parse);
8840         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8841
8842         /* The tokenizer should have guaranteed validity, but it's possible to
8843          * bypass it by using single quoting, so check */
8844         if (length_of_hex == 0
8845             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8846         {
8847             RExC_parse += length_of_hex;        /* Includes all the valid */
8848             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8849                             ? UTF8SKIP(RExC_parse)
8850                             : 1;
8851             /* Guard against malformed utf8 */
8852             if (RExC_parse >= endchar) RExC_parse = endchar;
8853             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8854         }
8855
8856         RExC_parse = endbrace + 1;
8857         if (endchar == endbrace) return NULL;
8858
8859         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8860     }
8861     else {      /* Not a char class */
8862
8863         /* What is done here is to convert this to a sub-pattern of the form
8864          * (?:\x{char1}\x{char2}...)
8865          * and then call reg recursively.  That way, it retains its atomicness,
8866          * while not having to worry about special handling that some code
8867          * points may have.  toke.c has converted the original Unicode values
8868          * to native, so that we can just pass on the hex values unchanged.  We
8869          * do have to set a flag to keep recoding from happening in the
8870          * recursion */
8871
8872         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8873         STRLEN len;
8874         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8875                                stream */
8876         char *orig_end = RExC_end;
8877
8878         while (RExC_parse < endbrace) {
8879
8880             /* Code points are separated by dots.  If none, there is only one
8881              * code point, and is terminated by the brace */
8882             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8883
8884             /* Convert to notation the rest of the code understands */
8885             sv_catpv(substitute_parse, "\\x{");
8886             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8887             sv_catpv(substitute_parse, "}");
8888
8889             /* Point to the beginning of the next character in the sequence. */
8890             RExC_parse = endchar + 1;
8891         }
8892         sv_catpv(substitute_parse, ")");
8893
8894         RExC_parse = SvPV(substitute_parse, len);
8895
8896         /* Don't allow empty number */
8897         if (len < 8) {
8898             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8899         }
8900         RExC_end = RExC_parse + len;
8901
8902         /* The values are Unicode, and therefore not subject to recoding */
8903         RExC_override_recoding = 1;
8904
8905         ret = reg(pRExC_state, 1, flagp, depth+1);
8906
8907         RExC_parse = endbrace;
8908         RExC_end = orig_end;
8909         RExC_override_recoding = 0;
8910
8911         nextchar(pRExC_state);
8912     }
8913
8914     return ret;
8915 }
8916
8917
8918 /*
8919  * reg_recode
8920  *
8921  * It returns the code point in utf8 for the value in *encp.
8922  *    value: a code value in the source encoding
8923  *    encp:  a pointer to an Encode object
8924  *
8925  * If the result from Encode is not a single character,
8926  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8927  */
8928 STATIC UV
8929 S_reg_recode(pTHX_ const char value, SV **encp)
8930 {
8931     STRLEN numlen = 1;
8932     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8933     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8934     const STRLEN newlen = SvCUR(sv);
8935     UV uv = UNICODE_REPLACEMENT;
8936
8937     PERL_ARGS_ASSERT_REG_RECODE;
8938
8939     if (newlen)
8940         uv = SvUTF8(sv)
8941              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8942              : *(U8*)s;
8943
8944     if (!newlen || numlen != newlen) {
8945         uv = UNICODE_REPLACEMENT;
8946         *encp = NULL;
8947     }
8948     return uv;
8949 }
8950
8951
8952 /*
8953  - regatom - the lowest level
8954
8955    Try to identify anything special at the start of the pattern. If there
8956    is, then handle it as required. This may involve generating a single regop,
8957    such as for an assertion; or it may involve recursing, such as to
8958    handle a () structure.
8959
8960    If the string doesn't start with something special then we gobble up
8961    as much literal text as we can.
8962
8963    Once we have been able to handle whatever type of thing started the
8964    sequence, we return.
8965
8966    Note: we have to be careful with escapes, as they can be both literal
8967    and special, and in the case of \10 and friends can either, depending
8968    on context. Specifically there are two separate switches for handling
8969    escape sequences, with the one for handling literal escapes requiring
8970    a dummy entry for all of the special escapes that are actually handled
8971    by the other.
8972 */
8973
8974 STATIC regnode *
8975 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8976 {
8977     dVAR;
8978     register regnode *ret = NULL;
8979     I32 flags;
8980     char *parse_start = RExC_parse;
8981     U8 op;
8982     GET_RE_DEBUG_FLAGS_DECL;
8983     DEBUG_PARSE("atom");
8984     *flagp = WORST;             /* Tentatively. */
8985
8986     PERL_ARGS_ASSERT_REGATOM;
8987
8988 tryagain:
8989     switch ((U8)*RExC_parse) {
8990     case '^':
8991         RExC_seen_zerolen++;
8992         nextchar(pRExC_state);
8993         if (RExC_flags & RXf_PMf_MULTILINE)
8994             ret = reg_node(pRExC_state, MBOL);
8995         else if (RExC_flags & RXf_PMf_SINGLELINE)
8996             ret = reg_node(pRExC_state, SBOL);
8997         else
8998             ret = reg_node(pRExC_state, BOL);
8999         Set_Node_Length(ret, 1); /* MJD */
9000         break;
9001     case '$':
9002         nextchar(pRExC_state);
9003         if (*RExC_parse)
9004             RExC_seen_zerolen++;
9005         if (RExC_flags & RXf_PMf_MULTILINE)
9006             ret = reg_node(pRExC_state, MEOL);
9007         else if (RExC_flags & RXf_PMf_SINGLELINE)
9008             ret = reg_node(pRExC_state, SEOL);
9009         else
9010             ret = reg_node(pRExC_state, EOL);
9011         Set_Node_Length(ret, 1); /* MJD */
9012         break;
9013     case '.':
9014         nextchar(pRExC_state);
9015         if (RExC_flags & RXf_PMf_SINGLELINE)
9016             ret = reg_node(pRExC_state, SANY);
9017         else
9018             ret = reg_node(pRExC_state, REG_ANY);
9019         *flagp |= HASWIDTH|SIMPLE;
9020         RExC_naughty++;
9021         Set_Node_Length(ret, 1); /* MJD */
9022         break;
9023     case '[':
9024     {
9025         char * const oregcomp_parse = ++RExC_parse;
9026         ret = regclass(pRExC_state,depth+1);
9027         if (*RExC_parse != ']') {
9028             RExC_parse = oregcomp_parse;
9029             vFAIL("Unmatched [");
9030         }
9031         nextchar(pRExC_state);
9032         *flagp |= HASWIDTH|SIMPLE;
9033         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9034         break;
9035     }
9036     case '(':
9037         nextchar(pRExC_state);
9038         ret = reg(pRExC_state, 1, &flags,depth+1);
9039         if (ret == NULL) {
9040                 if (flags & TRYAGAIN) {
9041                     if (RExC_parse == RExC_end) {
9042                          /* Make parent create an empty node if needed. */
9043                         *flagp |= TRYAGAIN;
9044                         return(NULL);
9045                     }
9046                     goto tryagain;
9047                 }
9048                 return(NULL);
9049         }
9050         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9051         break;
9052     case '|':
9053     case ')':
9054         if (flags & TRYAGAIN) {
9055             *flagp |= TRYAGAIN;
9056             return NULL;
9057         }
9058         vFAIL("Internal urp");
9059                                 /* Supposed to be caught earlier. */
9060         break;
9061     case '{':
9062         if (!regcurly(RExC_parse)) {
9063             RExC_parse++;
9064             goto defchar;
9065         }
9066         /* FALL THROUGH */
9067     case '?':
9068     case '+':
9069     case '*':
9070         RExC_parse++;
9071         vFAIL("Quantifier follows nothing");
9072         break;
9073     case '\\':
9074         /* Special Escapes
9075
9076            This switch handles escape sequences that resolve to some kind
9077            of special regop and not to literal text. Escape sequnces that
9078            resolve to literal text are handled below in the switch marked
9079            "Literal Escapes".
9080
9081            Every entry in this switch *must* have a corresponding entry
9082            in the literal escape switch. However, the opposite is not
9083            required, as the default for this switch is to jump to the
9084            literal text handling code.
9085         */
9086         switch ((U8)*++RExC_parse) {
9087         /* Special Escapes */
9088         case 'A':
9089             RExC_seen_zerolen++;
9090             ret = reg_node(pRExC_state, SBOL);
9091             *flagp |= SIMPLE;
9092             goto finish_meta_pat;
9093         case 'G':
9094             ret = reg_node(pRExC_state, GPOS);
9095             RExC_seen |= REG_SEEN_GPOS;
9096             *flagp |= SIMPLE;
9097             goto finish_meta_pat;
9098         case 'K':
9099             RExC_seen_zerolen++;
9100             ret = reg_node(pRExC_state, KEEPS);
9101             *flagp |= SIMPLE;
9102             /* XXX:dmq : disabling in-place substitution seems to
9103              * be necessary here to avoid cases of memory corruption, as
9104              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9105              */
9106             RExC_seen |= REG_SEEN_LOOKBEHIND;
9107             goto finish_meta_pat;
9108         case 'Z':
9109             ret = reg_node(pRExC_state, SEOL);
9110             *flagp |= SIMPLE;
9111             RExC_seen_zerolen++;                /* Do not optimize RE away */
9112             goto finish_meta_pat;
9113         case 'z':
9114             ret = reg_node(pRExC_state, EOS);
9115             *flagp |= SIMPLE;
9116             RExC_seen_zerolen++;                /* Do not optimize RE away */
9117             goto finish_meta_pat;
9118         case 'C':
9119             ret = reg_node(pRExC_state, CANY);
9120             RExC_seen |= REG_SEEN_CANY;
9121             *flagp |= HASWIDTH|SIMPLE;
9122             goto finish_meta_pat;
9123         case 'X':
9124             ret = reg_node(pRExC_state, CLUMP);
9125             *flagp |= HASWIDTH;
9126             goto finish_meta_pat;
9127         case 'w':
9128             switch (get_regex_charset(RExC_flags)) {
9129                 case REGEX_LOCALE_CHARSET:
9130                     op = ALNUML;
9131                     break;
9132                 case REGEX_UNICODE_CHARSET:
9133                     op = ALNUMU;
9134                     break;
9135                 case REGEX_ASCII_RESTRICTED_CHARSET:
9136                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9137                     op = ALNUMA;
9138                     break;
9139                 case REGEX_DEPENDS_CHARSET:
9140                     op = ALNUM;
9141                     break;
9142                 default:
9143                     goto bad_charset;
9144             }
9145             ret = reg_node(pRExC_state, op);
9146             *flagp |= HASWIDTH|SIMPLE;
9147             goto finish_meta_pat;
9148         case 'W':
9149             switch (get_regex_charset(RExC_flags)) {
9150                 case REGEX_LOCALE_CHARSET:
9151                     op = NALNUML;
9152                     break;
9153                 case REGEX_UNICODE_CHARSET:
9154                     op = NALNUMU;
9155                     break;
9156                 case REGEX_ASCII_RESTRICTED_CHARSET:
9157                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9158                     op = NALNUMA;
9159                     break;
9160                 case REGEX_DEPENDS_CHARSET:
9161                     op = NALNUM;
9162                     break;
9163                 default:
9164                     goto bad_charset;
9165             }
9166             ret = reg_node(pRExC_state, op);
9167             *flagp |= HASWIDTH|SIMPLE;
9168             goto finish_meta_pat;
9169         case 'b':
9170             RExC_seen_zerolen++;
9171             RExC_seen |= REG_SEEN_LOOKBEHIND;
9172             switch (get_regex_charset(RExC_flags)) {
9173                 case REGEX_LOCALE_CHARSET:
9174                     op = BOUNDL;
9175                     break;
9176                 case REGEX_UNICODE_CHARSET:
9177                     op = BOUNDU;
9178                     break;
9179                 case REGEX_ASCII_RESTRICTED_CHARSET:
9180                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9181                     op = BOUNDA;
9182                     break;
9183                 case REGEX_DEPENDS_CHARSET:
9184                     op = BOUND;
9185                     break;
9186                 default:
9187                     goto bad_charset;
9188             }
9189             ret = reg_node(pRExC_state, op);
9190             FLAGS(ret) = get_regex_charset(RExC_flags);
9191             *flagp |= SIMPLE;
9192             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9193                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9194             }
9195             goto finish_meta_pat;
9196         case 'B':
9197             RExC_seen_zerolen++;
9198             RExC_seen |= REG_SEEN_LOOKBEHIND;
9199             switch (get_regex_charset(RExC_flags)) {
9200                 case REGEX_LOCALE_CHARSET:
9201                     op = NBOUNDL;
9202                     break;
9203                 case REGEX_UNICODE_CHARSET:
9204                     op = NBOUNDU;
9205                     break;
9206                 case REGEX_ASCII_RESTRICTED_CHARSET:
9207                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9208                     op = NBOUNDA;
9209                     break;
9210                 case REGEX_DEPENDS_CHARSET:
9211                     op = NBOUND;
9212                     break;
9213                 default:
9214                     goto bad_charset;
9215             }
9216             ret = reg_node(pRExC_state, op);
9217             FLAGS(ret) = get_regex_charset(RExC_flags);
9218             *flagp |= SIMPLE;
9219             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9220                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9221             }
9222             goto finish_meta_pat;
9223         case 's':
9224             switch (get_regex_charset(RExC_flags)) {
9225                 case REGEX_LOCALE_CHARSET:
9226                     op = SPACEL;
9227                     break;
9228                 case REGEX_UNICODE_CHARSET:
9229                     op = SPACEU;
9230                     break;
9231                 case REGEX_ASCII_RESTRICTED_CHARSET:
9232                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9233                     op = SPACEA;
9234                     break;
9235                 case REGEX_DEPENDS_CHARSET:
9236                     op = SPACE;
9237                     break;
9238                 default:
9239                     goto bad_charset;
9240             }
9241             ret = reg_node(pRExC_state, op);
9242             *flagp |= HASWIDTH|SIMPLE;
9243             goto finish_meta_pat;
9244         case 'S':
9245             switch (get_regex_charset(RExC_flags)) {
9246                 case REGEX_LOCALE_CHARSET:
9247                     op = NSPACEL;
9248                     break;
9249                 case REGEX_UNICODE_CHARSET:
9250                     op = NSPACEU;
9251                     break;
9252                 case REGEX_ASCII_RESTRICTED_CHARSET:
9253                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9254                     op = NSPACEA;
9255                     break;
9256                 case REGEX_DEPENDS_CHARSET:
9257                     op = NSPACE;
9258                     break;
9259                 default:
9260                     goto bad_charset;
9261             }
9262             ret = reg_node(pRExC_state, op);
9263             *flagp |= HASWIDTH|SIMPLE;
9264             goto finish_meta_pat;
9265         case 'd':
9266             switch (get_regex_charset(RExC_flags)) {
9267                 case REGEX_LOCALE_CHARSET:
9268                     op = DIGITL;
9269                     break;
9270                 case REGEX_ASCII_RESTRICTED_CHARSET:
9271                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9272                     op = DIGITA;
9273                     break;
9274                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9275                 case REGEX_UNICODE_CHARSET:
9276                     op = DIGIT;
9277                     break;
9278                 default:
9279                     goto bad_charset;
9280             }
9281             ret = reg_node(pRExC_state, op);
9282             *flagp |= HASWIDTH|SIMPLE;
9283             goto finish_meta_pat;
9284         case 'D':
9285             switch (get_regex_charset(RExC_flags)) {
9286                 case REGEX_LOCALE_CHARSET:
9287                     op = NDIGITL;
9288                     break;
9289                 case REGEX_ASCII_RESTRICTED_CHARSET:
9290                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9291                     op = NDIGITA;
9292                     break;
9293                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9294                 case REGEX_UNICODE_CHARSET:
9295                     op = NDIGIT;
9296                     break;
9297                 default:
9298                     goto bad_charset;
9299             }
9300             ret = reg_node(pRExC_state, op);
9301             *flagp |= HASWIDTH|SIMPLE;
9302             goto finish_meta_pat;
9303         case 'R':
9304             ret = reg_node(pRExC_state, LNBREAK);
9305             *flagp |= HASWIDTH|SIMPLE;
9306             goto finish_meta_pat;
9307         case 'h':
9308             ret = reg_node(pRExC_state, HORIZWS);
9309             *flagp |= HASWIDTH|SIMPLE;
9310             goto finish_meta_pat;
9311         case 'H':
9312             ret = reg_node(pRExC_state, NHORIZWS);
9313             *flagp |= HASWIDTH|SIMPLE;
9314             goto finish_meta_pat;
9315         case 'v':
9316             ret = reg_node(pRExC_state, VERTWS);
9317             *flagp |= HASWIDTH|SIMPLE;
9318             goto finish_meta_pat;
9319         case 'V':
9320             ret = reg_node(pRExC_state, NVERTWS);
9321             *flagp |= HASWIDTH|SIMPLE;
9322          finish_meta_pat:
9323             nextchar(pRExC_state);
9324             Set_Node_Length(ret, 2); /* MJD */
9325             break;
9326         case 'p':
9327         case 'P':
9328             {
9329                 char* const oldregxend = RExC_end;
9330 #ifdef DEBUGGING
9331                 char* parse_start = RExC_parse - 2;
9332 #endif
9333
9334                 if (RExC_parse[1] == '{') {
9335                   /* a lovely hack--pretend we saw [\pX] instead */
9336                     RExC_end = strchr(RExC_parse, '}');
9337                     if (!RExC_end) {
9338                         const U8 c = (U8)*RExC_parse;
9339                         RExC_parse += 2;
9340                         RExC_end = oldregxend;
9341                         vFAIL2("Missing right brace on \\%c{}", c);
9342                     }
9343                     RExC_end++;
9344                 }
9345                 else {
9346                     RExC_end = RExC_parse + 2;
9347                     if (RExC_end > oldregxend)
9348                         RExC_end = oldregxend;
9349                 }
9350                 RExC_parse--;
9351
9352                 ret = regclass(pRExC_state,depth+1);
9353
9354                 RExC_end = oldregxend;
9355                 RExC_parse--;
9356
9357                 Set_Node_Offset(ret, parse_start + 2);
9358                 Set_Node_Cur_Length(ret);
9359                 nextchar(pRExC_state);
9360                 *flagp |= HASWIDTH|SIMPLE;
9361             }
9362             break;
9363         case 'N':
9364             /* Handle \N and \N{NAME} here and not below because it can be
9365             multicharacter. join_exact() will join them up later on.
9366             Also this makes sure that things like /\N{BLAH}+/ and
9367             \N{BLAH} being multi char Just Happen. dmq*/
9368             ++RExC_parse;
9369             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9370             break;
9371         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9372         parse_named_seq:
9373         {
9374             char ch= RExC_parse[1];
9375             if (ch != '<' && ch != '\'' && ch != '{') {
9376                 RExC_parse++;
9377                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9378             } else {
9379                 /* this pretty much dupes the code for (?P=...) in reg(), if
9380                    you change this make sure you change that */
9381                 char* name_start = (RExC_parse += 2);
9382                 U32 num = 0;
9383                 SV *sv_dat = reg_scan_name(pRExC_state,
9384                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9385                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9386                 if (RExC_parse == name_start || *RExC_parse != ch)
9387                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9388
9389                 if (!SIZE_ONLY) {
9390                     num = add_data( pRExC_state, 1, "S" );
9391                     RExC_rxi->data->data[num]=(void*)sv_dat;
9392                     SvREFCNT_inc_simple_void(sv_dat);
9393                 }
9394
9395                 RExC_sawback = 1;
9396                 ret = reganode(pRExC_state,
9397                                ((! FOLD)
9398                                  ? NREF
9399                                  : (MORE_ASCII_RESTRICTED)
9400                                    ? NREFFA
9401                                    : (AT_LEAST_UNI_SEMANTICS)
9402                                      ? NREFFU
9403                                      : (LOC)
9404                                        ? NREFFL
9405                                        : NREFF),
9406                                 num);
9407                 *flagp |= HASWIDTH;
9408
9409                 /* override incorrect value set in reganode MJD */
9410                 Set_Node_Offset(ret, parse_start+1);
9411                 Set_Node_Cur_Length(ret); /* MJD */
9412                 nextchar(pRExC_state);
9413
9414             }
9415             break;
9416         }
9417         case 'g':
9418         case '1': case '2': case '3': case '4':
9419         case '5': case '6': case '7': case '8': case '9':
9420             {
9421                 I32 num;
9422                 bool isg = *RExC_parse == 'g';
9423                 bool isrel = 0;
9424                 bool hasbrace = 0;
9425                 if (isg) {
9426                     RExC_parse++;
9427                     if (*RExC_parse == '{') {
9428                         RExC_parse++;
9429                         hasbrace = 1;
9430                     }
9431                     if (*RExC_parse == '-') {
9432                         RExC_parse++;
9433                         isrel = 1;
9434                     }
9435                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9436                         if (isrel) RExC_parse--;
9437                         RExC_parse -= 2;
9438                         goto parse_named_seq;
9439                 }   }
9440                 num = atoi(RExC_parse);
9441                 if (isg && num == 0)
9442                     vFAIL("Reference to invalid group 0");
9443                 if (isrel) {
9444                     num = RExC_npar - num;
9445                     if (num < 1)
9446                         vFAIL("Reference to nonexistent or unclosed group");
9447                 }
9448                 if (!isg && num > 9 && num >= RExC_npar)
9449                     goto defchar;
9450                 else {
9451                     char * const parse_start = RExC_parse - 1; /* MJD */
9452                     while (isDIGIT(*RExC_parse))
9453                         RExC_parse++;
9454                     if (parse_start == RExC_parse - 1)
9455                         vFAIL("Unterminated \\g... pattern");
9456                     if (hasbrace) {
9457                         if (*RExC_parse != '}')
9458                             vFAIL("Unterminated \\g{...} pattern");
9459                         RExC_parse++;
9460                     }
9461                     if (!SIZE_ONLY) {
9462                         if (num > (I32)RExC_rx->nparens)
9463                             vFAIL("Reference to nonexistent group");
9464                     }
9465                     RExC_sawback = 1;
9466                     ret = reganode(pRExC_state,
9467                                    ((! FOLD)
9468                                      ? REF
9469                                      : (MORE_ASCII_RESTRICTED)
9470                                        ? REFFA
9471                                        : (AT_LEAST_UNI_SEMANTICS)
9472                                          ? REFFU
9473                                          : (LOC)
9474                                            ? REFFL
9475                                            : REFF),
9476                                     num);
9477                     *flagp |= HASWIDTH;
9478
9479                     /* override incorrect value set in reganode MJD */
9480                     Set_Node_Offset(ret, parse_start+1);
9481                     Set_Node_Cur_Length(ret); /* MJD */
9482                     RExC_parse--;
9483                     nextchar(pRExC_state);
9484                 }
9485             }
9486             break;
9487         case '\0':
9488             if (RExC_parse >= RExC_end)
9489                 FAIL("Trailing \\");
9490             /* FALL THROUGH */
9491         default:
9492             /* Do not generate "unrecognized" warnings here, we fall
9493                back into the quick-grab loop below */
9494             parse_start--;
9495             goto defchar;
9496         }
9497         break;
9498
9499     case '#':
9500         if (RExC_flags & RXf_PMf_EXTENDED) {
9501             if ( reg_skipcomment( pRExC_state ) )
9502                 goto tryagain;
9503         }
9504         /* FALL THROUGH */
9505
9506     default:
9507
9508             parse_start = RExC_parse - 1;
9509
9510             RExC_parse++;
9511
9512         defchar: {
9513             register STRLEN len;
9514             register UV ender;
9515             register char *p;
9516             char *s;
9517             STRLEN foldlen;
9518             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9519             U8 node_type;
9520
9521             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9522              * it is folded to 'ss' even if not utf8 */
9523             bool is_exactfu_sharp_s;
9524
9525             ender = 0;
9526             node_type = ((! FOLD) ? EXACT
9527                         : (LOC)
9528                           ? EXACTFL
9529                           : (MORE_ASCII_RESTRICTED)
9530                             ? EXACTFA
9531                             : (AT_LEAST_UNI_SEMANTICS)
9532                               ? EXACTFU
9533                               : EXACTF);
9534             ret = reg_node(pRExC_state, node_type);
9535             s = STRING(ret);
9536
9537             /* XXX The node can hold up to 255 bytes, yet this only goes to
9538              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9539              * 255 allows us to not have to worry about overflow due to
9540              * converting to utf8 and fold expansion, but that value is
9541              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9542              * split up by this limit into a single one using the real max of
9543              * 255.  Even at 127, this breaks under rare circumstances.  If
9544              * folding, we do not want to split a node at a character that is a
9545              * non-final in a multi-char fold, as an input string could just
9546              * happen to want to match across the node boundary.  The join
9547              * would solve that problem if the join actually happens.  But a
9548              * series of more than two nodes in a row each of 127 would cause
9549              * the first join to succeed to get to 254, but then there wouldn't
9550              * be room for the next one, which could at be one of those split
9551              * multi-char folds.  I don't know of any fool-proof solution.  One
9552              * could back off to end with only a code point that isn't such a
9553              * non-final, but it is possible for there not to be any in the
9554              * entire node. */
9555             for (len = 0, p = RExC_parse - 1;
9556                  len < 127 && p < RExC_end;
9557                  len++)
9558             {
9559                 char * const oldp = p;
9560
9561                 if (RExC_flags & RXf_PMf_EXTENDED)
9562                     p = regwhite( pRExC_state, p );
9563                 switch ((U8)*p) {
9564                 case '^':
9565                 case '$':
9566                 case '.':
9567                 case '[':
9568                 case '(':
9569                 case ')':
9570                 case '|':
9571                     goto loopdone;
9572                 case '\\':
9573                     /* Literal Escapes Switch
9574
9575                        This switch is meant to handle escape sequences that
9576                        resolve to a literal character.
9577
9578                        Every escape sequence that represents something
9579                        else, like an assertion or a char class, is handled
9580                        in the switch marked 'Special Escapes' above in this
9581                        routine, but also has an entry here as anything that
9582                        isn't explicitly mentioned here will be treated as
9583                        an unescaped equivalent literal.
9584                     */
9585
9586                     switch ((U8)*++p) {
9587                     /* These are all the special escapes. */
9588                     case 'A':             /* Start assertion */
9589                     case 'b': case 'B':   /* Word-boundary assertion*/
9590                     case 'C':             /* Single char !DANGEROUS! */
9591                     case 'd': case 'D':   /* digit class */
9592                     case 'g': case 'G':   /* generic-backref, pos assertion */
9593                     case 'h': case 'H':   /* HORIZWS */
9594                     case 'k': case 'K':   /* named backref, keep marker */
9595                     case 'N':             /* named char sequence */
9596                     case 'p': case 'P':   /* Unicode property */
9597                               case 'R':   /* LNBREAK */
9598                     case 's': case 'S':   /* space class */
9599                     case 'v': case 'V':   /* VERTWS */
9600                     case 'w': case 'W':   /* word class */
9601                     case 'X':             /* eXtended Unicode "combining character sequence" */
9602                     case 'z': case 'Z':   /* End of line/string assertion */
9603                         --p;
9604                         goto loopdone;
9605
9606                     /* Anything after here is an escape that resolves to a
9607                        literal. (Except digits, which may or may not)
9608                      */
9609                     case 'n':
9610                         ender = '\n';
9611                         p++;
9612                         break;
9613                     case 'r':
9614                         ender = '\r';
9615                         p++;
9616                         break;
9617                     case 't':
9618                         ender = '\t';
9619                         p++;
9620                         break;
9621                     case 'f':
9622                         ender = '\f';
9623                         p++;
9624                         break;
9625                     case 'e':
9626                           ender = ASCII_TO_NATIVE('\033');
9627                         p++;
9628                         break;
9629                     case 'a':
9630                           ender = ASCII_TO_NATIVE('\007');
9631                         p++;
9632                         break;
9633                     case 'o':
9634                         {
9635                             STRLEN brace_len = len;
9636                             UV result;
9637                             const char* error_msg;
9638
9639                             bool valid = grok_bslash_o(p,
9640                                                        &result,
9641                                                        &brace_len,
9642                                                        &error_msg,
9643                                                        1);
9644                             p += brace_len;
9645                             if (! valid) {
9646                                 RExC_parse = p; /* going to die anyway; point
9647                                                    to exact spot of failure */
9648                                 vFAIL(error_msg);
9649                             }
9650                             else
9651                             {
9652                                 ender = result;
9653                             }
9654                             if (PL_encoding && ender < 0x100) {
9655                                 goto recode_encoding;
9656                             }
9657                             if (ender > 0xff) {
9658                                 REQUIRE_UTF8;
9659                             }
9660                             break;
9661                         }
9662                     case 'x':
9663                         if (*++p == '{') {
9664                             char* const e = strchr(p, '}');
9665
9666                             if (!e) {
9667                                 RExC_parse = p + 1;
9668                                 vFAIL("Missing right brace on \\x{}");
9669                             }
9670                             else {
9671                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9672                                     | PERL_SCAN_DISALLOW_PREFIX;
9673                                 STRLEN numlen = e - p - 1;
9674                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9675                                 if (ender > 0xff)
9676                                     REQUIRE_UTF8;
9677                                 p = e + 1;
9678                             }
9679                         }
9680                         else {
9681                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9682                             STRLEN numlen = 2;
9683                             ender = grok_hex(p, &numlen, &flags, NULL);
9684                             p += numlen;
9685                         }
9686                         if (PL_encoding && ender < 0x100)
9687                             goto recode_encoding;
9688                         break;
9689                     case 'c':
9690                         p++;
9691                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9692                         break;
9693                     case '0': case '1': case '2': case '3':case '4':
9694                     case '5': case '6': case '7': case '8':case '9':
9695                         if (*p == '0' ||
9696                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9697                         {
9698                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9699                             STRLEN numlen = 3;
9700                             ender = grok_oct(p, &numlen, &flags, NULL);
9701                             if (ender > 0xff) {
9702                                 REQUIRE_UTF8;
9703                             }
9704                             p += numlen;
9705                         }
9706                         else {
9707                             --p;
9708                             goto loopdone;
9709                         }
9710                         if (PL_encoding && ender < 0x100)
9711                             goto recode_encoding;
9712                         break;
9713                     recode_encoding:
9714                         if (! RExC_override_recoding) {
9715                             SV* enc = PL_encoding;
9716                             ender = reg_recode((const char)(U8)ender, &enc);
9717                             if (!enc && SIZE_ONLY)
9718                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9719                             REQUIRE_UTF8;
9720                         }
9721                         break;
9722                     case '\0':
9723                         if (p >= RExC_end)
9724                             FAIL("Trailing \\");
9725                         /* FALL THROUGH */
9726                     default:
9727                         if (!SIZE_ONLY&& isALPHA(*p)) {
9728                             /* Include any { following the alpha to emphasize
9729                              * that it could be part of an escape at some point
9730                              * in the future */
9731                             int len = (*(p + 1) == '{') ? 2 : 1;
9732                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9733                         }
9734                         goto normal_default;
9735                     }
9736                     break;
9737                 default:
9738                   normal_default:
9739                     if (UTF8_IS_START(*p) && UTF) {
9740                         STRLEN numlen;
9741                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9742                                                &numlen, UTF8_ALLOW_DEFAULT);
9743                         p += numlen;
9744                     }
9745                     else
9746                         ender = (U8) *p++;
9747                     break;
9748                 } /* End of switch on the literal */
9749
9750                 is_exactfu_sharp_s = (node_type == EXACTFU
9751                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9752                 if ( RExC_flags & RXf_PMf_EXTENDED)
9753                     p = regwhite( pRExC_state, p );
9754                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9755                     /* Prime the casefolded buffer.  Locale rules, which apply
9756                      * only to code points < 256, aren't known until execution,
9757                      * so for them, just output the original character using
9758                      * utf8.  If we start to fold non-UTF patterns, be sure to
9759                      * update join_exact() */
9760                     if (LOC && ender < 256) {
9761                         if (UNI_IS_INVARIANT(ender)) {
9762                             *tmpbuf = (U8) ender;
9763                             foldlen = 1;
9764                         } else {
9765                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9766                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9767                             foldlen = 2;
9768                         }
9769                     }
9770                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9771                                                  */
9772                         ender = toLOWER(ender);
9773                         *tmpbuf = (U8) ender;
9774                         foldlen = 1;
9775                     }
9776                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9777
9778                         /* Locale and /aa require more selectivity about the
9779                          * fold, so are handled below.  Otherwise, here, just
9780                          * use the fold */
9781                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9782                     }
9783                     else {
9784                         /* Under locale rules or /aa we are not to mix,
9785                          * respectively, ords < 256 or ASCII with non-.  So
9786                          * reject folds that mix them, using only the
9787                          * non-folded code point.  So do the fold to a
9788                          * temporary, and inspect each character in it. */
9789                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9790                         U8* s = trialbuf;
9791                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9792                         U8* e = s + foldlen;
9793                         bool fold_ok = TRUE;
9794
9795                         while (s < e) {
9796                             if (isASCII(*s)
9797                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9798                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9799                             {
9800                                 fold_ok = FALSE;
9801                                 break;
9802                             }
9803                             s += UTF8SKIP(s);
9804                         }
9805                         if (fold_ok) {
9806                             Copy(trialbuf, tmpbuf, foldlen, U8);
9807                             ender = tmpender;
9808                         }
9809                         else {
9810                             uvuni_to_utf8(tmpbuf, ender);
9811                             foldlen = UNISKIP(ender);
9812                         }
9813                     }
9814                 }
9815                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9816                     if (len)
9817                         p = oldp;
9818                     else if (UTF || is_exactfu_sharp_s) {
9819                          if (FOLD) {
9820                               /* Emit all the Unicode characters. */
9821                               STRLEN numlen;
9822                               for (foldbuf = tmpbuf;
9823                                    foldlen;
9824                                    foldlen -= numlen) {
9825                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9826                                    if (numlen > 0) {
9827                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9828                                         s       += unilen;
9829                                         len     += unilen;
9830                                         /* In EBCDIC the numlen
9831                                          * and unilen can differ. */
9832                                         foldbuf += numlen;
9833                                         if (numlen >= foldlen)
9834                                              break;
9835                                    }
9836                                    else
9837                                         break; /* "Can't happen." */
9838                               }
9839                          }
9840                          else {
9841                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9842                               if (unilen > 0) {
9843                                    s   += unilen;
9844                                    len += unilen;
9845                               }
9846                          }
9847                     }
9848                     else {
9849                         len++;
9850                         REGC((char)ender, s++);
9851                     }
9852                     break;
9853                 }
9854                 if (UTF || is_exactfu_sharp_s) {
9855                      if (FOLD) {
9856                           /* Emit all the Unicode characters. */
9857                           STRLEN numlen;
9858                           for (foldbuf = tmpbuf;
9859                                foldlen;
9860                                foldlen -= numlen) {
9861                                ender = utf8_to_uvchr(foldbuf, &numlen);
9862                                if (numlen > 0) {
9863                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9864                                     len     += unilen;
9865                                     s       += unilen;
9866                                     /* In EBCDIC the numlen
9867                                      * and unilen can differ. */
9868                                     foldbuf += numlen;
9869                                     if (numlen >= foldlen)
9870                                          break;
9871                                }
9872                                else
9873                                     break;
9874                           }
9875                      }
9876                      else {
9877                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9878                           if (unilen > 0) {
9879                                s   += unilen;
9880                                len += unilen;
9881                           }
9882                      }
9883                      len--;
9884                 }
9885                 else {
9886                     REGC((char)ender, s++);
9887                 }
9888             }
9889         loopdone:   /* Jumped to when encounters something that shouldn't be in
9890                        the node */
9891             RExC_parse = p - 1;
9892             Set_Node_Cur_Length(ret); /* MJD */
9893             nextchar(pRExC_state);
9894             {
9895                 /* len is STRLEN which is unsigned, need to copy to signed */
9896                 IV iv = len;
9897                 if (iv < 0)
9898                     vFAIL("Internal disaster");
9899             }
9900             if (len > 0)
9901                 *flagp |= HASWIDTH;
9902             if (len == 1 && UNI_IS_INVARIANT(ender))
9903                 *flagp |= SIMPLE;
9904
9905             if (SIZE_ONLY)
9906                 RExC_size += STR_SZ(len);
9907             else {
9908                 STR_LEN(ret) = len;
9909                 RExC_emit += STR_SZ(len);
9910             }
9911         }
9912         break;
9913     }
9914
9915     return(ret);
9916
9917 /* Jumped to when an unrecognized character set is encountered */
9918 bad_charset:
9919     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9920     return(NULL);
9921 }
9922
9923 STATIC char *
9924 S_regwhite( RExC_state_t *pRExC_state, char *p )
9925 {
9926     const char *e = RExC_end;
9927
9928     PERL_ARGS_ASSERT_REGWHITE;
9929
9930     while (p < e) {
9931         if (isSPACE(*p))
9932             ++p;
9933         else if (*p == '#') {
9934             bool ended = 0;
9935             do {
9936                 if (*p++ == '\n') {
9937                     ended = 1;
9938                     break;
9939                 }
9940             } while (p < e);
9941             if (!ended)
9942                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9943         }
9944         else
9945             break;
9946     }
9947     return p;
9948 }
9949
9950 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9951    Character classes ([:foo:]) can also be negated ([:^foo:]).
9952    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9953    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9954    but trigger failures because they are currently unimplemented. */
9955
9956 #define POSIXCC_DONE(c)   ((c) == ':')
9957 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9958 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9959
9960 STATIC I32
9961 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9962 {
9963     dVAR;
9964     I32 namedclass = OOB_NAMEDCLASS;
9965
9966     PERL_ARGS_ASSERT_REGPPOSIXCC;
9967
9968     if (value == '[' && RExC_parse + 1 < RExC_end &&
9969         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9970         POSIXCC(UCHARAT(RExC_parse))) {
9971         const char c = UCHARAT(RExC_parse);
9972         char* const s = RExC_parse++;
9973
9974         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9975             RExC_parse++;
9976         if (RExC_parse == RExC_end)
9977             /* Grandfather lone [:, [=, [. */
9978             RExC_parse = s;
9979         else {
9980             const char* const t = RExC_parse++; /* skip over the c */
9981             assert(*t == c);
9982
9983             if (UCHARAT(RExC_parse) == ']') {
9984                 const char *posixcc = s + 1;
9985                 RExC_parse++; /* skip over the ending ] */
9986
9987                 if (*s == ':') {
9988                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9989                     const I32 skip = t - posixcc;
9990
9991                     /* Initially switch on the length of the name.  */
9992                     switch (skip) {
9993                     case 4:
9994                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9995                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9996                         break;
9997                     case 5:
9998                         /* Names all of length 5.  */
9999                         /* alnum alpha ascii blank cntrl digit graph lower
10000                            print punct space upper  */
10001                         /* Offset 4 gives the best switch position.  */
10002                         switch (posixcc[4]) {
10003                         case 'a':
10004                             if (memEQ(posixcc, "alph", 4)) /* alpha */
10005                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10006                             break;
10007                         case 'e':
10008                             if (memEQ(posixcc, "spac", 4)) /* space */
10009                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10010                             break;
10011                         case 'h':
10012                             if (memEQ(posixcc, "grap", 4)) /* graph */
10013                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10014                             break;
10015                         case 'i':
10016                             if (memEQ(posixcc, "asci", 4)) /* ascii */
10017                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10018                             break;
10019                         case 'k':
10020                             if (memEQ(posixcc, "blan", 4)) /* blank */
10021                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10022                             break;
10023                         case 'l':
10024                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10025                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10026                             break;
10027                         case 'm':
10028                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
10029                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10030                             break;
10031                         case 'r':
10032                             if (memEQ(posixcc, "lowe", 4)) /* lower */
10033                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10034                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
10035                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10036                             break;
10037                         case 't':
10038                             if (memEQ(posixcc, "digi", 4)) /* digit */
10039                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10040                             else if (memEQ(posixcc, "prin", 4)) /* print */
10041                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10042                             else if (memEQ(posixcc, "punc", 4)) /* punct */
10043                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10044                             break;
10045                         }
10046                         break;
10047                     case 6:
10048                         if (memEQ(posixcc, "xdigit", 6))
10049                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10050                         break;
10051                     }
10052
10053                     if (namedclass == OOB_NAMEDCLASS)
10054                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10055                                       t - s - 1, s + 1);
10056                     assert (posixcc[skip] == ':');
10057                     assert (posixcc[skip+1] == ']');
10058                 } else if (!SIZE_ONLY) {
10059                     /* [[=foo=]] and [[.foo.]] are still future. */
10060
10061                     /* adjust RExC_parse so the warning shows after
10062                        the class closes */
10063                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10064                         RExC_parse++;
10065                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10066                 }
10067             } else {
10068                 /* Maternal grandfather:
10069                  * "[:" ending in ":" but not in ":]" */
10070                 RExC_parse = s;
10071             }
10072         }
10073     }
10074
10075     return namedclass;
10076 }
10077
10078 STATIC void
10079 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10080 {
10081     dVAR;
10082
10083     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10084
10085     if (POSIXCC(UCHARAT(RExC_parse))) {
10086         const char *s = RExC_parse;
10087         const char  c = *s++;
10088
10089         while (isALNUM(*s))
10090             s++;
10091         if (*s && c == *s && s[1] == ']') {
10092             ckWARN3reg(s+2,
10093                        "POSIX syntax [%c %c] belongs inside character classes",
10094                        c, c);
10095
10096             /* [[=foo=]] and [[.foo.]] are still future. */
10097             if (POSIXCC_NOTYET(c)) {
10098                 /* adjust RExC_parse so the error shows after
10099                    the class closes */
10100                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10101                     NOOP;
10102                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10103             }
10104         }
10105     }
10106 }
10107
10108 /* Generate the code to add a full posix character <class> to the bracketed
10109  * character class given by <node>.  (<node> is needed only under locale rules)
10110  * destlist     is the inversion list for non-locale rules that this class is
10111  *              to be added to
10112  * sourcelist   is the ASCII-range inversion list to add under /a rules
10113  * Xsourcelist  is the full Unicode range list to use otherwise. */
10114 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10115     if (LOC) {                                                             \
10116         SV* scratch_list = NULL;                                           \
10117                                                                            \
10118         /* Set this class in the node for runtime matching */              \
10119         ANYOF_CLASS_SET(node, class);                                      \
10120                                                                            \
10121         /* For above Latin1 code points, we use the full Unicode range */  \
10122         _invlist_intersection(PL_AboveLatin1,                              \
10123                               Xsourcelist,                                 \
10124                               &scratch_list);                              \
10125         /* And set the output to it, adding instead if there already is an \
10126          * output.  Checking if <destlist> is NULL first saves an extra    \
10127          * clone.  Its reference count will be decremented at the next     \
10128          * union, etc, or if this is the only instance, at the end of the  \
10129          * routine */                                                      \
10130         if (! destlist) {                                                  \
10131             destlist = scratch_list;                                       \
10132         }                                                                  \
10133         else {                                                             \
10134             _invlist_union(destlist, scratch_list, &destlist);             \
10135             SvREFCNT_dec(scratch_list);                                    \
10136         }                                                                  \
10137     }                                                                      \
10138     else {                                                                 \
10139         /* For non-locale, just add it to any existing list */             \
10140         _invlist_union(destlist,                                           \
10141                        (AT_LEAST_ASCII_RESTRICTED)                         \
10142                            ? sourcelist                                    \
10143                            : Xsourcelist,                                  \
10144                        &destlist);                                         \
10145     }
10146
10147 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10148  */
10149 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10150     if (LOC) {                                                             \
10151         SV* scratch_list = NULL;                                           \
10152         ANYOF_CLASS_SET(node, class);                                      \
10153         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10154         if (! destlist) {                                                  \
10155             destlist = scratch_list;                                       \
10156         }                                                                  \
10157         else {                                                             \
10158             _invlist_union(destlist, scratch_list, &destlist);             \
10159             SvREFCNT_dec(scratch_list);                                    \
10160         }                                                                  \
10161     }                                                                      \
10162     else {                                                                 \
10163         _invlist_union_complement_2nd(destlist,                            \
10164                                     (AT_LEAST_ASCII_RESTRICTED)            \
10165                                         ? sourcelist                       \
10166                                         : Xsourcelist,                     \
10167                                     &destlist);                            \
10168         /* Under /d, everything in the upper half of the Latin1 range      \
10169          * matches this complement */                                      \
10170         if (DEPENDS_SEMANTICS) {                                           \
10171             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10172         }                                                                  \
10173     }
10174
10175 /* Generate the code to add a posix character <class> to the bracketed
10176  * character class given by <node>.  (<node> is needed only under locale rules)
10177  * destlist       is the inversion list for non-locale rules that this class is
10178  *                to be added to
10179  * sourcelist     is the ASCII-range inversion list to add under /a rules
10180  * l1_sourcelist  is the Latin1 range list to use otherwise.
10181  * Xpropertyname  is the name to add to <run_time_list> of the property to
10182  *                specify the code points above Latin1 that will have to be
10183  *                determined at run-time
10184  * run_time_list  is a SV* that contains text names of properties that are to
10185  *                be computed at run time.  This concatenates <Xpropertyname>
10186  *                to it, apppropriately
10187  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10188  * time */
10189 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10190                               l1_sourcelist, Xpropertyname, run_time_list) \
10191     /* If not /a matching, there are going to be code points we will have  \
10192      * to defer to runtime to look-up */                                   \
10193     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10194         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10195     }                                                                      \
10196     if (LOC) {                                                             \
10197         ANYOF_CLASS_SET(node, class);                                      \
10198     }                                                                      \
10199     else {                                                                 \
10200         _invlist_union(destlist,                                           \
10201                        (AT_LEAST_ASCII_RESTRICTED)                         \
10202                            ? sourcelist                                    \
10203                            : l1_sourcelist,                                \
10204                        &destlist);                                         \
10205     }
10206
10207 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10208  * this and DO_N_POSIX */
10209 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
10210                               l1_sourcelist, Xpropertyname, run_time_list) \
10211     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
10212         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
10213     }                                                                      \
10214     else {                                                                 \
10215         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10216         if (LOC) {                                                         \
10217             ANYOF_CLASS_SET(node, namedclass);                             \
10218         }                                                                  \
10219         else {                                                             \
10220             SV* scratch_list = NULL;                                       \
10221             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
10222             if (! destlist) {                                              \
10223                 destlist = scratch_list;                                   \
10224             }                                                              \
10225             else {                                                         \
10226                 _invlist_union(destlist, scratch_list, &destlist);         \
10227                 SvREFCNT_dec(scratch_list);                                \
10228             }                                                              \
10229             if (DEPENDS_SEMANTICS) {                                       \
10230                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
10231             }                                                              \
10232         }                                                                  \
10233     }
10234
10235 STATIC U8
10236 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10237 {
10238
10239     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10240      * Locale folding is done at run-time, so this function should not be
10241      * called for nodes that are for locales.
10242      *
10243      * This function sets the bit corresponding to the fold of the input
10244      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10245      * 'F' is 'f'.
10246      *
10247      * It also knows about the characters that are in the bitmap that have
10248      * folds that are matchable only outside it, and sets the appropriate lists
10249      * and flags.
10250      *
10251      * It returns the number of bits that actually changed from 0 to 1 */
10252
10253     U8 stored = 0;
10254     U8 fold;
10255
10256     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10257
10258     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10259                                     : PL_fold[value];
10260
10261     /* It assumes the bit for 'value' has already been set */
10262     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10263         ANYOF_BITMAP_SET(node, fold);
10264         stored++;
10265     }
10266     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10267         /* Certain Latin1 characters have matches outside the bitmap.  To get
10268          * here, 'value' is one of those characters.   None of these matches is
10269          * valid for ASCII characters under /aa, which have been excluded by
10270          * the 'if' above.  The matches fall into three categories:
10271          * 1) They are singly folded-to or -from an above 255 character, as
10272          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10273          *    WITH DIAERESIS;
10274          * 2) They are part of a multi-char fold with another character in the
10275          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10276          * 3) They are part of a multi-char fold with a character not in the
10277          *    bitmap, such as various ligatures.
10278          * We aren't dealing fully with multi-char folds, except we do deal
10279          * with the pattern containing a character that has a multi-char fold
10280          * (not so much the inverse).
10281          * For types 1) and 3), the matches only happen when the target string
10282          * is utf8; that's not true for 2), and we set a flag for it.
10283          *
10284          * The code below adds to the passed in inversion list the single fold
10285          * closures for 'value'.  The values are hard-coded here so that an
10286          * innocent-looking character class, like /[ks]/i won't have to go out
10287          * to disk to find the possible matches.  XXX It would be better to
10288          * generate these via regen, in case a new version of the Unicode
10289          * standard adds new mappings, though that is not really likely. */
10290         switch (value) {
10291             case 'k':
10292             case 'K':
10293                 /* KELVIN SIGN */
10294                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10295                 break;
10296             case 's':
10297             case 'S':
10298                 /* LATIN SMALL LETTER LONG S */
10299                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10300                 break;
10301             case MICRO_SIGN:
10302                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10303                                                  GREEK_SMALL_LETTER_MU);
10304                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10305                                                  GREEK_CAPITAL_LETTER_MU);
10306                 break;
10307             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10308             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10309                 /* ANGSTROM SIGN */
10310                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10311                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10312                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10313                                                      PL_fold_latin1[value]);
10314                 }
10315                 break;
10316             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10317                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10318                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10319                 break;
10320             case LATIN_SMALL_LETTER_SHARP_S:
10321                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10322                                         LATIN_CAPITAL_LETTER_SHARP_S);
10323
10324                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10325                 if (! MORE_ASCII_RESTRICTED) {
10326                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10327
10328                     /* And under /u or /a, it can match even if the target is
10329                      * not utf8 */
10330                     if (AT_LEAST_UNI_SEMANTICS) {
10331                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10332                     }
10333                 }
10334                 break;
10335             case 'F': case 'f':
10336             case 'I': case 'i':
10337             case 'L': case 'l':
10338             case 'T': case 't':
10339             case 'A': case 'a':
10340             case 'H': case 'h':
10341             case 'J': case 'j':
10342             case 'N': case 'n':
10343             case 'W': case 'w':
10344             case 'Y': case 'y':
10345                 /* These all are targets of multi-character folds from code
10346                  * points that require UTF8 to express, so they can't match
10347                  * unless the target string is in UTF-8, so no action here is
10348                  * necessary, as regexec.c properly handles the general case
10349                  * for UTF-8 matching */
10350                 break;
10351             default:
10352                 /* Use deprecated warning to increase the chances of this
10353                  * being output */
10354                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10355                 break;
10356         }
10357     }
10358     else if (DEPENDS_SEMANTICS
10359             && ! isASCII(value)
10360             && PL_fold_latin1[value] != value)
10361     {
10362            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10363             * folds only when the target string is in UTF-8.  We add the fold
10364             * here to the list of things to match outside the bitmap, which
10365             * won't be looked at unless it is UTF8 (or else if something else
10366             * says to look even if not utf8, but those things better not happen
10367             * under DEPENDS semantics. */
10368         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10369     }
10370
10371     return stored;
10372 }
10373
10374
10375 PERL_STATIC_INLINE U8
10376 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10377 {
10378     /* This inline function sets a bit in the bitmap if not already set, and if
10379      * appropriate, its fold, returning the number of bits that actually
10380      * changed from 0 to 1 */
10381
10382     U8 stored;
10383
10384     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10385
10386     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10387         return 0;
10388     }
10389
10390     ANYOF_BITMAP_SET(node, value);
10391     stored = 1;
10392
10393     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10394         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10395     }
10396
10397     return stored;
10398 }
10399
10400 STATIC void
10401 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10402 {
10403     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10404      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10405      * the multi-character folds of characters in the node */
10406     SV *sv;
10407
10408     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10409
10410     if (! *alternate_ptr) {
10411         *alternate_ptr = newAV();
10412     }
10413     sv = newSVpvn_utf8((char*)string, len, TRUE);
10414     av_push(*alternate_ptr, sv);
10415     return;
10416 }
10417
10418 /*
10419    parse a class specification and produce either an ANYOF node that
10420    matches the pattern or perhaps will be optimized into an EXACTish node
10421    instead. The node contains a bit map for the first 256 characters, with the
10422    corresponding bit set if that character is in the list.  For characters
10423    above 255, a range list is used */
10424
10425 STATIC regnode *
10426 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10427 {
10428     dVAR;
10429     register UV nextvalue;
10430     register IV prevvalue = OOB_UNICODE;
10431     register IV range = 0;
10432     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10433     register regnode *ret;
10434     STRLEN numlen;
10435     IV namedclass;
10436     char *rangebegin = NULL;
10437     bool need_class = 0;
10438     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10439     SV *listsv = NULL;
10440     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10441                                       than just initialized.  */
10442     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10443     UV element_count = 0;   /* Number of distinct elements in the class.
10444                                Optimizations may be possible if this is tiny */
10445     UV n;
10446
10447     /* Unicode properties are stored in a swash; this holds the current one
10448      * being parsed.  If this swash is the only above-latin1 component of the
10449      * character class, an optimization is to pass it directly on to the
10450      * execution engine.  Otherwise, it is set to NULL to indicate that there
10451      * are other things in the class that have to be dealt with at execution
10452      * time */
10453     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10454
10455     /* Set if a component of this character class is user-defined; just passed
10456      * on to the engine */
10457     UV has_user_defined_property = 0;
10458
10459     /* code points this node matches that can't be stored in the bitmap */
10460     SV* nonbitmap = NULL;
10461
10462     /* The items that are to match that aren't stored in the bitmap, but are a
10463      * result of things that are stored there.  This is the fold closure of
10464      * such a character, either because it has DEPENDS semantics and shouldn't
10465      * be matched unless the target string is utf8, or is a code point that is
10466      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10467      * above 255.  This all is solely for performance reasons.  By having this
10468      * code know the outside-the-bitmap folds that the bitmapped characters are
10469      * involved with, we don't have to go out to disk to find the list of
10470      * matches, unless the character class includes code points that aren't
10471      * storable in the bit map.  That means that a character class with an 's'
10472      * in it, for example, doesn't need to go out to disk to find everything
10473      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10474      * empty unless there is something whose fold we don't know about, and will
10475      * have to go out to the disk to find. */
10476     SV* l1_fold_invlist = NULL;
10477
10478     /* List of multi-character folds that are matched by this node */
10479     AV* unicode_alternate  = NULL;
10480 #ifdef EBCDIC
10481     UV literal_endpoint = 0;
10482 #endif
10483     UV stored = 0;  /* how many chars stored in the bitmap */
10484
10485     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10486         case we need to change the emitted regop to an EXACT. */
10487     const char * orig_parse = RExC_parse;
10488     GET_RE_DEBUG_FLAGS_DECL;
10489
10490     PERL_ARGS_ASSERT_REGCLASS;
10491 #ifndef DEBUGGING
10492     PERL_UNUSED_ARG(depth);
10493 #endif
10494
10495     DEBUG_PARSE("clas");
10496
10497     /* Assume we are going to generate an ANYOF node. */
10498     ret = reganode(pRExC_state, ANYOF, 0);
10499
10500
10501     if (!SIZE_ONLY) {
10502         ANYOF_FLAGS(ret) = 0;
10503     }
10504
10505     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10506         RExC_naughty++;
10507         RExC_parse++;
10508         if (!SIZE_ONLY)
10509             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10510
10511         /* We have decided to not allow multi-char folds in inverted character
10512          * classes, due to the confusion that can happen, especially with
10513          * classes that are designed for a non-Unicode world:  You have the
10514          * peculiar case that:
10515             "s s" =~ /^[^\xDF]+$/i => Y
10516             "ss"  =~ /^[^\xDF]+$/i => N
10517          *
10518          * See [perl #89750] */
10519         allow_full_fold = FALSE;
10520     }
10521
10522     if (SIZE_ONLY) {
10523         RExC_size += ANYOF_SKIP;
10524         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10525     }
10526     else {
10527         RExC_emit += ANYOF_SKIP;
10528         if (LOC) {
10529             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10530         }
10531         ANYOF_BITMAP_ZERO(ret);
10532         listsv = newSVpvs("# comment\n");
10533         initial_listsv_len = SvCUR(listsv);
10534     }
10535
10536     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10537
10538     if (!SIZE_ONLY && POSIXCC(nextvalue))
10539         checkposixcc(pRExC_state);
10540
10541     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10542     if (UCHARAT(RExC_parse) == ']')
10543         goto charclassloop;
10544
10545 parseit:
10546     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10547
10548     charclassloop:
10549
10550         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10551
10552         if (!range) {
10553             rangebegin = RExC_parse;
10554             element_count++;
10555         }
10556         if (UTF) {
10557             value = utf8n_to_uvchr((U8*)RExC_parse,
10558                                    RExC_end - RExC_parse,
10559                                    &numlen, UTF8_ALLOW_DEFAULT);
10560             RExC_parse += numlen;
10561         }
10562         else
10563             value = UCHARAT(RExC_parse++);
10564
10565         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10566         if (value == '[' && POSIXCC(nextvalue))
10567             namedclass = regpposixcc(pRExC_state, value);
10568         else if (value == '\\') {
10569             if (UTF) {
10570                 value = utf8n_to_uvchr((U8*)RExC_parse,
10571                                    RExC_end - RExC_parse,
10572                                    &numlen, UTF8_ALLOW_DEFAULT);
10573                 RExC_parse += numlen;
10574             }
10575             else
10576                 value = UCHARAT(RExC_parse++);
10577             /* Some compilers cannot handle switching on 64-bit integer
10578              * values, therefore value cannot be an UV.  Yes, this will
10579              * be a problem later if we want switch on Unicode.
10580              * A similar issue a little bit later when switching on
10581              * namedclass. --jhi */
10582             switch ((I32)value) {
10583             case 'w':   namedclass = ANYOF_ALNUM;       break;
10584             case 'W':   namedclass = ANYOF_NALNUM;      break;
10585             case 's':   namedclass = ANYOF_SPACE;       break;
10586             case 'S':   namedclass = ANYOF_NSPACE;      break;
10587             case 'd':   namedclass = ANYOF_DIGIT;       break;
10588             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10589             case 'v':   namedclass = ANYOF_VERTWS;      break;
10590             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10591             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10592             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10593             case 'N':  /* Handle \N{NAME} in class */
10594                 {
10595                     /* We only pay attention to the first char of
10596                     multichar strings being returned. I kinda wonder
10597                     if this makes sense as it does change the behaviour
10598                     from earlier versions, OTOH that behaviour was broken
10599                     as well. */
10600                     UV v; /* value is register so we cant & it /grrr */
10601                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10602                         goto parseit;
10603                     }
10604                     value= v;
10605                 }
10606                 break;
10607             case 'p':
10608             case 'P':
10609                 {
10610                 char *e;
10611                 if (RExC_parse >= RExC_end)
10612                     vFAIL2("Empty \\%c{}", (U8)value);
10613                 if (*RExC_parse == '{') {
10614                     const U8 c = (U8)value;
10615                     e = strchr(RExC_parse++, '}');
10616                     if (!e)
10617                         vFAIL2("Missing right brace on \\%c{}", c);
10618                     while (isSPACE(UCHARAT(RExC_parse)))
10619                         RExC_parse++;
10620                     if (e == RExC_parse)
10621                         vFAIL2("Empty \\%c{}", c);
10622                     n = e - RExC_parse;
10623                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10624                         n--;
10625                 }
10626                 else {
10627                     e = RExC_parse;
10628                     n = 1;
10629                 }
10630                 if (!SIZE_ONLY) {
10631                     SV** invlistsvp;
10632                     SV* invlist;
10633                     char* name;
10634                     if (UCHARAT(RExC_parse) == '^') {
10635                          RExC_parse++;
10636                          n--;
10637                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10638                          while (isSPACE(UCHARAT(RExC_parse))) {
10639                               RExC_parse++;
10640                               n--;
10641                          }
10642                     }
10643                     /* Try to get the definition of the property into
10644                      * <invlist>.  If /i is in effect, the effective property
10645                      * will have its name be <__NAME_i>.  The design is
10646                      * discussed in commit
10647                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10648                     Newx(name, n + sizeof("_i__\n"), char);
10649
10650                     sprintf(name, "%s%.*s%s\n",
10651                                     (FOLD) ? "__" : "",
10652                                     (int)n,
10653                                     RExC_parse,
10654                                     (FOLD) ? "_i" : ""
10655                     );
10656
10657                     /* Look up the property name, and get its swash and
10658                      * inversion list, if the property is found  */
10659                     if (swash) {
10660                         SvREFCNT_dec(swash);
10661                     }
10662                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10663                                              1, /* binary */
10664                                              0, /* not tr/// */
10665                                              TRUE, /* this routine will handle
10666                                                       undefined properties */
10667                                              NULL, FALSE /* No inversion list */
10668                                             );
10669                     if (   ! swash
10670                         || ! SvROK(swash)
10671                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10672                         || ! (invlistsvp =
10673                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10674                                 "INVLIST", FALSE))
10675                         || ! (invlist = *invlistsvp))
10676                     {
10677                         if (swash) {
10678                             SvREFCNT_dec(swash);
10679                             swash = NULL;
10680                         }
10681
10682                         /* Here didn't find it.  It could be a user-defined
10683                          * property that will be available at run-time.  Add it
10684                          * to the list to look up then */
10685                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10686                                         (value == 'p' ? '+' : '!'),
10687                                         name);
10688                         has_user_defined_property = 1;
10689
10690                         /* We don't know yet, so have to assume that the
10691                          * property could match something in the Latin1 range,
10692                          * hence something that isn't utf8 */
10693                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10694                     }
10695                     else {
10696
10697                         /* Here, did get the swash and its inversion list.  If
10698                          * the swash is from a user-defined property, then this
10699                          * whole character class should be regarded as such */
10700                         SV** user_defined_svp =
10701                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10702                                                         "USER_DEFINED", FALSE);
10703                         if (user_defined_svp) {
10704                             has_user_defined_property
10705                                                     |= SvUV(*user_defined_svp);
10706                         }
10707
10708                         /* Invert if asking for the complement */
10709                         if (value == 'P') {
10710                             _invlist_union_complement_2nd(properties, invlist, &properties);
10711
10712                             /* The swash can't be used as-is, because we've
10713                              * inverted things; delay removing it to here after
10714                              * have copied its invlist above */
10715                             SvREFCNT_dec(swash);
10716                             swash = NULL;
10717                         }
10718                         else {
10719                             _invlist_union(properties, invlist, &properties);
10720                         }
10721                     }
10722                     Safefree(name);
10723                 }
10724                 RExC_parse = e + 1;
10725                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10726
10727                 /* \p means they want Unicode semantics */
10728                 RExC_uni_semantics = 1;
10729                 }
10730                 break;
10731             case 'n':   value = '\n';                   break;
10732             case 'r':   value = '\r';                   break;
10733             case 't':   value = '\t';                   break;
10734             case 'f':   value = '\f';                   break;
10735             case 'b':   value = '\b';                   break;
10736             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10737             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10738             case 'o':
10739                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10740                 {
10741                     const char* error_msg;
10742                     bool valid = grok_bslash_o(RExC_parse,
10743                                                &value,
10744                                                &numlen,
10745                                                &error_msg,
10746                                                SIZE_ONLY);
10747                     RExC_parse += numlen;
10748                     if (! valid) {
10749                         vFAIL(error_msg);
10750                     }
10751                 }
10752                 if (PL_encoding && value < 0x100) {
10753                     goto recode_encoding;
10754                 }
10755                 break;
10756             case 'x':
10757                 if (*RExC_parse == '{') {
10758                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10759                         | PERL_SCAN_DISALLOW_PREFIX;
10760                     char * const e = strchr(RExC_parse++, '}');
10761                     if (!e)
10762                         vFAIL("Missing right brace on \\x{}");
10763
10764                     numlen = e - RExC_parse;
10765                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10766                     RExC_parse = e + 1;
10767                 }
10768                 else {
10769                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10770                     numlen = 2;
10771                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10772                     RExC_parse += numlen;
10773                 }
10774                 if (PL_encoding && value < 0x100)
10775                     goto recode_encoding;
10776                 break;
10777             case 'c':
10778                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10779                 break;
10780             case '0': case '1': case '2': case '3': case '4':
10781             case '5': case '6': case '7':
10782                 {
10783                     /* Take 1-3 octal digits */
10784                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10785                     numlen = 3;
10786                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10787                     RExC_parse += numlen;
10788                     if (PL_encoding && value < 0x100)
10789                         goto recode_encoding;
10790                     break;
10791                 }
10792             recode_encoding:
10793                 if (! RExC_override_recoding) {
10794                     SV* enc = PL_encoding;
10795                     value = reg_recode((const char)(U8)value, &enc);
10796                     if (!enc && SIZE_ONLY)
10797                         ckWARNreg(RExC_parse,
10798                                   "Invalid escape in the specified encoding");
10799                     break;
10800                 }
10801             default:
10802                 /* Allow \_ to not give an error */
10803                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10804                     ckWARN2reg(RExC_parse,
10805                                "Unrecognized escape \\%c in character class passed through",
10806                                (int)value);
10807                 }
10808                 break;
10809             }
10810         } /* end of \blah */
10811 #ifdef EBCDIC
10812         else
10813             literal_endpoint++;
10814 #endif
10815
10816         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10817
10818             /* What matches in a locale is not known until runtime, so need to
10819              * (one time per class) allocate extra space to pass to regexec.
10820              * The space will contain a bit for each named class that is to be
10821              * matched against.  This isn't needed for \p{} and pseudo-classes,
10822              * as they are not affected by locale, and hence are dealt with
10823              * separately */
10824             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10825                 need_class = 1;
10826                 if (SIZE_ONLY) {
10827                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10828                 }
10829                 else {
10830                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10831                     ANYOF_CLASS_ZERO(ret);
10832                 }
10833                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10834             }
10835
10836             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10837              * literal, as is the character that began the false range, i.e.
10838              * the 'a' in the examples */
10839             if (range) {
10840                 if (!SIZE_ONLY) {
10841                     const int w =
10842                         RExC_parse >= rangebegin ?
10843                         RExC_parse - rangebegin : 0;
10844                     ckWARN4reg(RExC_parse,
10845                                "False [] range \"%*.*s\"",
10846                                w, w, rangebegin);
10847
10848                     stored +=
10849                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10850                     if (prevvalue < 256) {
10851                         stored +=
10852                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10853                     }
10854                     else {
10855                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10856                     }
10857                 }
10858
10859                 range = 0; /* this was not a true range */
10860             }
10861
10862             if (!SIZE_ONLY) {
10863
10864                 /* Possible truncation here but in some 64-bit environments
10865                  * the compiler gets heartburn about switch on 64-bit values.
10866                  * A similar issue a little earlier when switching on value.
10867                  * --jhi */
10868                 switch ((I32)namedclass) {
10869
10870                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10871                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10872                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10873                     break;
10874                 case ANYOF_NALNUMC:
10875                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10876                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10877                     break;
10878                 case ANYOF_ALPHA:
10879                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10880                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10881                     break;
10882                 case ANYOF_NALPHA:
10883                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10884                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10885                     break;
10886                 case ANYOF_ASCII:
10887                     if (LOC) {
10888                         ANYOF_CLASS_SET(ret, namedclass);
10889                     }
10890                     else {
10891                         _invlist_union(properties, PL_ASCII, &properties);
10892                     }
10893                     break;
10894                 case ANYOF_NASCII:
10895                     if (LOC) {
10896                         ANYOF_CLASS_SET(ret, namedclass);
10897                     }
10898                     else {
10899                         _invlist_union_complement_2nd(properties,
10900                                                     PL_ASCII, &properties);
10901                         if (DEPENDS_SEMANTICS) {
10902                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10903                         }
10904                     }
10905                     break;
10906                 case ANYOF_BLANK:
10907                     DO_POSIX(ret, namedclass, properties,
10908                                             PL_PosixBlank, PL_XPosixBlank);
10909                     break;
10910                 case ANYOF_NBLANK:
10911                     DO_N_POSIX(ret, namedclass, properties,
10912                                             PL_PosixBlank, PL_XPosixBlank);
10913                     break;
10914                 case ANYOF_CNTRL:
10915                     DO_POSIX(ret, namedclass, properties,
10916                                             PL_PosixCntrl, PL_XPosixCntrl);
10917                     break;
10918                 case ANYOF_NCNTRL:
10919                     DO_N_POSIX(ret, namedclass, properties,
10920                                             PL_PosixCntrl, PL_XPosixCntrl);
10921                     break;
10922                 case ANYOF_DIGIT:
10923                     /* Ignore the compiler warning for this macro, planned to
10924                      * be eliminated later */
10925                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10926                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10927                     break;
10928                 case ANYOF_NDIGIT:
10929                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10930                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10931                     break;
10932                 case ANYOF_GRAPH:
10933                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10934                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10935                     break;
10936                 case ANYOF_NGRAPH:
10937                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10938                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10939                     break;
10940                 case ANYOF_HORIZWS:
10941                     /* For these, we use the nonbitmap, as /d doesn't make a
10942                      * difference in what these match.  There would be problems
10943                      * if these characters had folds other than themselves, as
10944                      * nonbitmap is subject to folding.  It turns out that \h
10945                      * is just a synonym for XPosixBlank */
10946                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10947                     break;
10948                 case ANYOF_NHORIZWS:
10949                     _invlist_union_complement_2nd(nonbitmap,
10950                                                  PL_XPosixBlank, &nonbitmap);
10951                     break;
10952                 case ANYOF_LOWER:
10953                 case ANYOF_NLOWER:
10954                 {   /* These require special handling, as they differ under
10955                        folding, matching Cased there (which in the ASCII range
10956                        is the same as Alpha */
10957
10958                     SV* ascii_source;
10959                     SV* l1_source;
10960                     const char *Xname;
10961
10962                     if (FOLD && ! LOC) {
10963                         ascii_source = PL_PosixAlpha;
10964                         l1_source = PL_L1Cased;
10965                         Xname = "Cased";
10966                     }
10967                     else {
10968                         ascii_source = PL_PosixLower;
10969                         l1_source = PL_L1PosixLower;
10970                         Xname = "XPosixLower";
10971                     }
10972                     if (namedclass == ANYOF_LOWER) {
10973                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10974                                     ascii_source, l1_source, Xname, listsv);
10975                     }
10976                     else {
10977                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10978                             properties, ascii_source, l1_source, Xname, listsv);
10979                     }
10980                     break;
10981                 }
10982                 case ANYOF_PRINT:
10983                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10984                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10985                     break;
10986                 case ANYOF_NPRINT:
10987                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10988                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10989                     break;
10990                 case ANYOF_PUNCT:
10991                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10992                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10993                     break;
10994                 case ANYOF_NPUNCT:
10995                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10996                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10997                     break;
10998                 case ANYOF_PSXSPC:
10999                     DO_POSIX(ret, namedclass, properties,
11000                                             PL_PosixSpace, PL_XPosixSpace);
11001                     break;
11002                 case ANYOF_NPSXSPC:
11003                     DO_N_POSIX(ret, namedclass, properties,
11004                                             PL_PosixSpace, PL_XPosixSpace);
11005                     break;
11006                 case ANYOF_SPACE:
11007                     DO_POSIX(ret, namedclass, properties,
11008                                             PL_PerlSpace, PL_XPerlSpace);
11009                     break;
11010                 case ANYOF_NSPACE:
11011                     DO_N_POSIX(ret, namedclass, properties,
11012                                             PL_PerlSpace, PL_XPerlSpace);
11013                     break;
11014                 case ANYOF_UPPER:   /* Same as LOWER, above */
11015                 case ANYOF_NUPPER:
11016                 {
11017                     SV* ascii_source;
11018                     SV* l1_source;
11019                     const char *Xname;
11020
11021                     if (FOLD && ! LOC) {
11022                         ascii_source = PL_PosixAlpha;
11023                         l1_source = PL_L1Cased;
11024                         Xname = "Cased";
11025                     }
11026                     else {
11027                         ascii_source = PL_PosixUpper;
11028                         l1_source = PL_L1PosixUpper;
11029                         Xname = "XPosixUpper";
11030                     }
11031                     if (namedclass == ANYOF_UPPER) {
11032                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11033                                     ascii_source, l1_source, Xname, listsv);
11034                     }
11035                     else {
11036                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11037                         properties, ascii_source, l1_source, Xname, listsv);
11038                     }
11039                     break;
11040                 }
11041                 case ANYOF_ALNUM:   /* Really is 'Word' */
11042                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11043                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11044                     break;
11045                 case ANYOF_NALNUM:
11046                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11047                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11048                     break;
11049                 case ANYOF_VERTWS:
11050                     /* For these, we use the nonbitmap, as /d doesn't make a
11051                      * difference in what these match.  There would be problems
11052                      * if these characters had folds other than themselves, as
11053                      * nonbitmap is subject to folding */
11054                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11055                     break;
11056                 case ANYOF_NVERTWS:
11057                     _invlist_union_complement_2nd(nonbitmap,
11058                                                     PL_VertSpace, &nonbitmap);
11059                     break;
11060                 case ANYOF_XDIGIT:
11061                     DO_POSIX(ret, namedclass, properties,
11062                                             PL_PosixXDigit, PL_XPosixXDigit);
11063                     break;
11064                 case ANYOF_NXDIGIT:
11065                     DO_N_POSIX(ret, namedclass, properties,
11066                                             PL_PosixXDigit, PL_XPosixXDigit);
11067                     break;
11068                 case ANYOF_MAX:
11069                     /* this is to handle \p and \P */
11070                     break;
11071                 default:
11072                     vFAIL("Invalid [::] class");
11073                     break;
11074                 }
11075
11076                 continue;
11077             }
11078         } /* end of namedclass \blah */
11079
11080         if (range) {
11081             if (prevvalue > (IV)value) /* b-a */ {
11082                 const int w = RExC_parse - rangebegin;
11083                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11084                 range = 0; /* not a valid range */
11085             }
11086         }
11087         else {
11088             prevvalue = value; /* save the beginning of the range */
11089             if (RExC_parse+1 < RExC_end
11090                 && *RExC_parse == '-'
11091                 && RExC_parse[1] != ']')
11092             {
11093                 RExC_parse++;
11094
11095                 /* a bad range like \w-, [:word:]- ? */
11096                 if (namedclass > OOB_NAMEDCLASS) {
11097                     if (ckWARN(WARN_REGEXP)) {
11098                         const int w =
11099                             RExC_parse >= rangebegin ?
11100                             RExC_parse - rangebegin : 0;
11101                         vWARN4(RExC_parse,
11102                                "False [] range \"%*.*s\"",
11103                                w, w, rangebegin);
11104                     }
11105                     if (!SIZE_ONLY)
11106                         stored +=
11107                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11108                 } else
11109                     range = 1;  /* yeah, it's a range! */
11110                 continue;       /* but do it the next time */
11111             }
11112         }
11113
11114         /* non-Latin1 code point implies unicode semantics.  Must be set in
11115          * pass1 so is there for the whole of pass 2 */
11116         if (value > 255) {
11117             RExC_uni_semantics = 1;
11118         }
11119
11120         /* now is the next time */
11121         if (!SIZE_ONLY) {
11122             if (prevvalue < 256) {
11123                 const IV ceilvalue = value < 256 ? value : 255;
11124                 IV i;
11125 #ifdef EBCDIC
11126                 /* In EBCDIC [\x89-\x91] should include
11127                  * the \x8e but [i-j] should not. */
11128                 if (literal_endpoint == 2 &&
11129                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11130                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11131                 {
11132                     if (isLOWER(prevvalue)) {
11133                         for (i = prevvalue; i <= ceilvalue; i++)
11134                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11135                                 stored +=
11136                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11137                             }
11138                     } else {
11139                         for (i = prevvalue; i <= ceilvalue; i++)
11140                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11141                                 stored +=
11142                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11143                             }
11144                     }
11145                 }
11146                 else
11147 #endif
11148                       for (i = prevvalue; i <= ceilvalue; i++) {
11149                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11150                       }
11151           }
11152           if (value > 255) {
11153             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11154             const UV natvalue      = NATIVE_TO_UNI(value);
11155             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11156         }
11157 #ifdef EBCDIC
11158             literal_endpoint = 0;
11159 #endif
11160         }
11161
11162         range = 0; /* this range (if it was one) is done now */
11163     }
11164
11165
11166
11167     if (SIZE_ONLY)
11168         return ret;
11169     /****** !SIZE_ONLY AFTER HERE *********/
11170
11171     /* If folding and there are code points above 255, we calculate all
11172      * characters that could fold to or from the ones already on the list */
11173     if (FOLD && nonbitmap) {
11174         UV start, end;  /* End points of code point ranges */
11175
11176         SV* fold_intersection = NULL;
11177
11178         /* This is a list of all the characters that participate in folds
11179             * (except marks, etc in multi-char folds */
11180         if (! PL_utf8_foldable) {
11181             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11182             PL_utf8_foldable = _swash_to_invlist(swash);
11183             SvREFCNT_dec(swash);
11184         }
11185
11186         /* This is a hash that for a particular fold gives all characters
11187             * that are involved in it */
11188         if (! PL_utf8_foldclosures) {
11189
11190             /* If we were unable to find any folds, then we likely won't be
11191              * able to find the closures.  So just create an empty list.
11192              * Folding will effectively be restricted to the non-Unicode rules
11193              * hard-coded into Perl.  (This case happens legitimately during
11194              * compilation of Perl itself before the Unicode tables are
11195              * generated) */
11196             if (invlist_len(PL_utf8_foldable) == 0) {
11197                 PL_utf8_foldclosures = newHV();
11198             } else {
11199                 /* If the folds haven't been read in, call a fold function
11200                     * to force that */
11201                 if (! PL_utf8_tofold) {
11202                     U8 dummy[UTF8_MAXBYTES+1];
11203                     STRLEN dummy_len;
11204
11205                     /* This particular string is above \xff in both UTF-8 and
11206                      * UTFEBCDIC */
11207                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11208                     assert(PL_utf8_tofold); /* Verify that worked */
11209                 }
11210                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11211             }
11212         }
11213
11214         /* Only the characters in this class that participate in folds need be
11215          * checked.  Get the intersection of this class and all the possible
11216          * characters that are foldable.  This can quickly narrow down a large
11217          * class */
11218         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11219
11220         /* Now look at the foldable characters in this class individually */
11221         invlist_iterinit(fold_intersection);
11222         while (invlist_iternext(fold_intersection, &start, &end)) {
11223             UV j;
11224
11225             /* Look at every character in the range */
11226             for (j = start; j <= end; j++) {
11227
11228                 /* Get its fold */
11229                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11230                 STRLEN foldlen;
11231                 const UV f =
11232                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11233
11234                 if (foldlen > (STRLEN)UNISKIP(f)) {
11235
11236                     /* Any multicharacter foldings (disallowed in lookbehind
11237                      * patterns) require the following transform: [ABCDEF] ->
11238                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11239                      * folds into "rst", all other characters fold to single
11240                      * characters.  We save away these multicharacter foldings,
11241                      * to be later saved as part of the additional "s" data. */
11242                     if (! RExC_in_lookbehind) {
11243                         U8* loc = foldbuf;
11244                         U8* e = foldbuf + foldlen;
11245
11246                         /* If any of the folded characters of this are in the
11247                          * Latin1 range, tell the regex engine that this can
11248                          * match a non-utf8 target string.  The only multi-byte
11249                          * fold whose source is in the Latin1 range (U+00DF)
11250                          * applies only when the target string is utf8, or
11251                          * under unicode rules */
11252                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11253                             while (loc < e) {
11254
11255                                 /* Can't mix ascii with non- under /aa */
11256                                 if (MORE_ASCII_RESTRICTED
11257                                     && (isASCII(*loc) != isASCII(j)))
11258                                 {
11259                                     goto end_multi_fold;
11260                                 }
11261                                 if (UTF8_IS_INVARIANT(*loc)
11262                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11263                                 {
11264                                     /* Can't mix above and below 256 under LOC
11265                                      */
11266                                     if (LOC) {
11267                                         goto end_multi_fold;
11268                                     }
11269                                     ANYOF_FLAGS(ret)
11270                                             |= ANYOF_NONBITMAP_NON_UTF8;
11271                                     break;
11272                                 }
11273                                 loc += UTF8SKIP(loc);
11274                             }
11275                         }
11276
11277                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11278                     end_multi_fold: ;
11279                     }
11280
11281                     /* This is special-cased, as it is the only letter which
11282                      * has both a multi-fold and single-fold in Latin1.  All
11283                      * the other chars that have single and multi-folds are
11284                      * always in utf8, and the utf8 folding algorithm catches
11285                      * them */
11286                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11287                         stored += set_regclass_bit(pRExC_state,
11288                                         ret,
11289                                         LATIN_SMALL_LETTER_SHARP_S,
11290                                         &l1_fold_invlist, &unicode_alternate);
11291                     }
11292                 }
11293                 else {
11294                     /* Single character fold.  Add everything in its fold
11295                      * closure to the list that this node should match */
11296                     SV** listp;
11297
11298                     /* The fold closures data structure is a hash with the keys
11299                      * being every character that is folded to, like 'k', and
11300                      * the values each an array of everything that folds to its
11301                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11302                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11303                                     (char *) foldbuf, foldlen, FALSE)))
11304                     {
11305                         AV* list = (AV*) *listp;
11306                         IV k;
11307                         for (k = 0; k <= av_len(list); k++) {
11308                             SV** c_p = av_fetch(list, k, FALSE);
11309                             UV c;
11310                             if (c_p == NULL) {
11311                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11312                             }
11313                             c = SvUV(*c_p);
11314
11315                             /* /aa doesn't allow folds between ASCII and non-;
11316                              * /l doesn't allow them between above and below
11317                              * 256 */
11318                             if ((MORE_ASCII_RESTRICTED
11319                                  && (isASCII(c) != isASCII(j)))
11320                                     || (LOC && ((c < 256) != (j < 256))))
11321                             {
11322                                 continue;
11323                             }
11324
11325                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11326                                 stored += set_regclass_bit(pRExC_state,
11327                                         ret,
11328                                         (U8) c,
11329                                         &l1_fold_invlist, &unicode_alternate);
11330                             }
11331                                 /* It may be that the code point is already in
11332                                  * this range or already in the bitmap, in
11333                                  * which case we need do nothing */
11334                             else if ((c < start || c > end)
11335                                         && (c > 255
11336                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11337                             {
11338                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11339                             }
11340                         }
11341                     }
11342                 }
11343             }
11344         }
11345         SvREFCNT_dec(fold_intersection);
11346     }
11347
11348     /* Combine the two lists into one. */
11349     if (l1_fold_invlist) {
11350         if (nonbitmap) {
11351             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11352             SvREFCNT_dec(l1_fold_invlist);
11353         }
11354         else {
11355             nonbitmap = l1_fold_invlist;
11356         }
11357     }
11358
11359     /* And combine the result (if any) with any inversion list from properties.
11360      * The lists are kept separate up to now because we don't want to fold the
11361      * properties */
11362     if (properties) {
11363         if (nonbitmap) {
11364             _invlist_union(nonbitmap, properties, &nonbitmap);
11365             SvREFCNT_dec(properties);
11366         }
11367         else {
11368             nonbitmap = properties;
11369         }
11370     }
11371
11372     /* Here, <nonbitmap> contains all the code points we can determine at
11373      * compile time that we haven't put into the bitmap.  Go through it, and
11374      * for things that belong in the bitmap, put them there, and delete from
11375      * <nonbitmap> */
11376     if (nonbitmap) {
11377
11378         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11379          * possibly only should match when the target string is UTF-8 */
11380         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11381
11382         /* This gets set if we actually need to modify things */
11383         bool change_invlist = FALSE;
11384
11385         UV start, end;
11386
11387         /* Start looking through <nonbitmap> */
11388         invlist_iterinit(nonbitmap);
11389         while (invlist_iternext(nonbitmap, &start, &end)) {
11390             UV high;
11391             int i;
11392
11393             /* Quit if are above what we should change */
11394             if (start > max_cp_to_set) {
11395                 break;
11396             }
11397
11398             change_invlist = TRUE;
11399
11400             /* Set all the bits in the range, up to the max that we are doing */
11401             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11402             for (i = start; i <= (int) high; i++) {
11403                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11404                     ANYOF_BITMAP_SET(ret, i);
11405                     stored++;
11406                     prevvalue = value;
11407                     value = i;
11408                 }
11409             }
11410         }
11411
11412         /* Done with loop; remove any code points that are in the bitmap from
11413          * <nonbitmap> */
11414         if (change_invlist) {
11415             _invlist_subtract(nonbitmap,
11416                               (DEPENDS_SEMANTICS)
11417                                 ? PL_ASCII
11418                                 : PL_Latin1,
11419                               &nonbitmap);
11420         }
11421
11422         /* If have completely emptied it, remove it completely */
11423         if (invlist_len(nonbitmap) == 0) {
11424             SvREFCNT_dec(nonbitmap);
11425             nonbitmap = NULL;
11426         }
11427     }
11428
11429     /* Here, we have calculated what code points should be in the character
11430      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11431      * case of DEPENDS rules.
11432      *
11433      * Now we can see about various optimizations.  Fold calculation (which we
11434      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11435      * would invert to include K, which under /i would match k, which it
11436      * shouldn't. */
11437
11438     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11439      * set the FOLD flag yet, so this does optimize those.  It doesn't
11440      * optimize locale.  Doing so perhaps could be done as long as there is
11441      * nothing like \w in it; some thought also would have to be given to the
11442      * interaction with above 0x100 chars */
11443     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11444         && ! LOC
11445         && ! unicode_alternate
11446         /* In case of /d, there are some things that should match only when in
11447          * not in the bitmap, i.e., they require UTF8 to match.  These are
11448          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11449          * case, they don't require UTF8, so can invert here */
11450         && (! nonbitmap
11451             || ! DEPENDS_SEMANTICS
11452             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11453         && SvCUR(listsv) == initial_listsv_len)
11454     {
11455         int i;
11456         if (! nonbitmap) {
11457             for (i = 0; i < 256; ++i) {
11458                 if (ANYOF_BITMAP_TEST(ret, i)) {
11459                     ANYOF_BITMAP_CLEAR(ret, i);
11460                 }
11461                 else {
11462                     ANYOF_BITMAP_SET(ret, i);
11463                     prevvalue = value;
11464                     value = i;
11465                 }
11466             }
11467             /* The inversion means that everything above 255 is matched */
11468             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11469         }
11470         else {
11471             /* Here, also has things outside the bitmap that may overlap with
11472              * the bitmap.  We have to sync them up, so that they get inverted
11473              * in both places.  Earlier, we removed all overlaps except in the
11474              * case of /d rules, so no syncing is needed except for this case
11475              */
11476             SV *remove_list = NULL;
11477
11478             if (DEPENDS_SEMANTICS) {
11479                 UV start, end;
11480
11481                 /* Set the bits that correspond to the ones that aren't in the
11482                  * bitmap.  Otherwise, when we invert, we'll miss these.
11483                  * Earlier, we removed from the nonbitmap all code points
11484                  * < 128, so there is no extra work here */
11485                 invlist_iterinit(nonbitmap);
11486                 while (invlist_iternext(nonbitmap, &start, &end)) {
11487                     if (start > 255) {  /* The bit map goes to 255 */
11488                         break;
11489                     }
11490                     if (end > 255) {
11491                         end = 255;
11492                     }
11493                     for (i = start; i <= (int) end; ++i) {
11494                         ANYOF_BITMAP_SET(ret, i);
11495                         prevvalue = value;
11496                         value = i;
11497                     }
11498                 }
11499             }
11500
11501             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11502              * bitmap has to also be removed from the non-bitmap, but again,
11503              * there should not be overlap unless is /d rules. */
11504             _invlist_invert(nonbitmap);
11505
11506             /* Any swash can't be used as-is, because we've inverted things */
11507             if (swash) {
11508                 SvREFCNT_dec(swash);
11509                 swash = NULL;
11510             }
11511
11512             for (i = 0; i < 256; ++i) {
11513                 if (ANYOF_BITMAP_TEST(ret, i)) {
11514                     ANYOF_BITMAP_CLEAR(ret, i);
11515                     if (DEPENDS_SEMANTICS) {
11516                         if (! remove_list) {
11517                             remove_list = _new_invlist(2);
11518                         }
11519                         remove_list = add_cp_to_invlist(remove_list, i);
11520                     }
11521                 }
11522                 else {
11523                     ANYOF_BITMAP_SET(ret, i);
11524                     prevvalue = value;
11525                     value = i;
11526                 }
11527             }
11528
11529             /* And do the removal */
11530             if (DEPENDS_SEMANTICS) {
11531                 if (remove_list) {
11532                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11533                     SvREFCNT_dec(remove_list);
11534                 }
11535             }
11536             else {
11537                 /* There is no overlap for non-/d, so just delete anything
11538                  * below 256 */
11539                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11540             }
11541         }
11542
11543         stored = 256 - stored;
11544
11545         /* Clear the invert flag since have just done it here */
11546         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11547     }
11548
11549     /* Folding in the bitmap is taken care of above, but not for locale (for
11550      * which we have to wait to see what folding is in effect at runtime), and
11551      * for some things not in the bitmap (only the upper latin folds in this
11552      * case, as all other single-char folding has been set above).  Set
11553      * run-time fold flag for these */
11554     if (FOLD && (LOC
11555                 || (DEPENDS_SEMANTICS
11556                     && nonbitmap
11557                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11558                 || unicode_alternate))
11559     {
11560         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11561     }
11562
11563     /* A single character class can be "optimized" into an EXACTish node.
11564      * Note that since we don't currently count how many characters there are
11565      * outside the bitmap, we are XXX missing optimization possibilities for
11566      * them.  This optimization can't happen unless this is a truly single
11567      * character class, which means that it can't be an inversion into a
11568      * many-character class, and there must be no possibility of there being
11569      * things outside the bitmap.  'stored' (only) for locales doesn't include
11570      * \w, etc, so have to make a special test that they aren't present
11571      *
11572      * Similarly A 2-character class of the very special form like [bB] can be
11573      * optimized into an EXACTFish node, but only for non-locales, and for
11574      * characters which only have the two folds; so things like 'fF' and 'Ii'
11575      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11576      * FI'. */
11577     if (! nonbitmap
11578         && ! unicode_alternate
11579         && SvCUR(listsv) == initial_listsv_len
11580         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11581         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11582                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11583             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11584                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11585                                  /* If the latest code point has a fold whose
11586                                   * bit is set, it must be the only other one */
11587                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11588                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11589     {
11590         /* Note that the information needed to decide to do this optimization
11591          * is not currently available until the 2nd pass, and that the actually
11592          * used EXACTish node takes less space than the calculated ANYOF node,
11593          * and hence the amount of space calculated in the first pass is larger
11594          * than actually used, so this optimization doesn't gain us any space.
11595          * But an EXACT node is faster than an ANYOF node, and can be combined
11596          * with any adjacent EXACT nodes later by the optimizer for further
11597          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11598          * node, so the optimization advantage comes from the ability to join
11599          * it to adjacent EXACT nodes */
11600
11601         const char * cur_parse= RExC_parse;
11602         U8 op;
11603         RExC_emit = (regnode *)orig_emit;
11604         RExC_parse = (char *)orig_parse;
11605
11606         if (stored == 1) {
11607
11608             /* A locale node with one point can be folded; all the other cases
11609              * with folding will have two points, since we calculate them above
11610              */
11611             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11612                  op = EXACTFL;
11613             }
11614             else {
11615                 op = EXACT;
11616             }
11617         }
11618         else {   /* else 2 chars in the bit map: the folds of each other */
11619
11620             /* Use the folded value, which for the cases where we get here,
11621              * is just the lower case of the current one (which may resolve to
11622              * itself, or to the other one */
11623             value = toLOWER_LATIN1(value);
11624
11625             /* To join adjacent nodes, they must be the exact EXACTish type.
11626              * Try to use the most likely type, by using EXACTFA if possible,
11627              * then EXACTFU if the regex calls for it, or is required because
11628              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11629              * also ASCII for the cases where we get here.) */
11630             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11631                 op = EXACTFA;
11632             }
11633             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11634                 op = EXACTFU;
11635             }
11636             else {    /* Otherwise, more likely to be EXACTF type */
11637                 op = EXACTF;
11638             }
11639         }
11640
11641         ret = reg_node(pRExC_state, op);
11642         RExC_parse = (char *)cur_parse;
11643         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11644             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11645             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11646             STR_LEN(ret)= 2;
11647             RExC_emit += STR_SZ(2);
11648         }
11649         else {
11650             *STRING(ret)= (char)value;
11651             STR_LEN(ret)= 1;
11652             RExC_emit += STR_SZ(1);
11653         }
11654         SvREFCNT_dec(listsv);
11655         return ret;
11656     }
11657
11658     /* If there is a swash and more than one element, we can't use the swash in
11659      * the optimization below. */
11660     if (swash && element_count > 1) {
11661         SvREFCNT_dec(swash);
11662         swash = NULL;
11663     }
11664     if (! nonbitmap
11665         && SvCUR(listsv) == initial_listsv_len
11666         && ! unicode_alternate)
11667     {
11668         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11669         SvREFCNT_dec(listsv);
11670         SvREFCNT_dec(unicode_alternate);
11671     }
11672     else {
11673         /* av[0] stores the character class description in its textual form:
11674          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11675          *       appropriate swash, and is also useful for dumping the regnode.
11676          * av[1] if NULL, is a placeholder to later contain the swash computed
11677          *       from av[0].  But if no further computation need be done, the
11678          *       swash is stored there now.
11679          * av[2] stores the multicharacter foldings, used later in
11680          *       regexec.c:S_reginclass().
11681          * av[3] stores the nonbitmap inversion list for use in addition or
11682          *       instead of av[0]; not used if av[1] isn't NULL
11683          * av[4] is set if any component of the class is from a user-defined
11684          *       property; not used if av[1] isn't NULL */
11685         AV * const av = newAV();
11686         SV *rv;
11687
11688         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11689                         ? &PL_sv_undef
11690                         : listsv);
11691         if (swash) {
11692             av_store(av, 1, swash);
11693             SvREFCNT_dec(nonbitmap);
11694         }
11695         else {
11696             av_store(av, 1, NULL);
11697             if (nonbitmap) {
11698                 av_store(av, 3, nonbitmap);
11699                 av_store(av, 4, newSVuv(has_user_defined_property));
11700             }
11701         }
11702
11703         /* Store any computed multi-char folds only if we are allowing
11704          * them */
11705         if (allow_full_fold) {
11706             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11707             if (unicode_alternate) { /* This node is variable length */
11708                 OP(ret) = ANYOFV;
11709             }
11710         }
11711         else {
11712             av_store(av, 2, NULL);
11713         }
11714         rv = newRV_noinc(MUTABLE_SV(av));
11715         n = add_data(pRExC_state, 1, "s");
11716         RExC_rxi->data->data[n] = (void*)rv;
11717         ARG_SET(ret, n);
11718     }
11719     return ret;
11720 }
11721
11722
11723 /* reg_skipcomment()
11724
11725    Absorbs an /x style # comments from the input stream.
11726    Returns true if there is more text remaining in the stream.
11727    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11728    terminates the pattern without including a newline.
11729
11730    Note its the callers responsibility to ensure that we are
11731    actually in /x mode
11732
11733 */
11734
11735 STATIC bool
11736 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11737 {
11738     bool ended = 0;
11739
11740     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11741
11742     while (RExC_parse < RExC_end)
11743         if (*RExC_parse++ == '\n') {
11744             ended = 1;
11745             break;
11746         }
11747     if (!ended) {
11748         /* we ran off the end of the pattern without ending
11749            the comment, so we have to add an \n when wrapping */
11750         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11751         return 0;
11752     } else
11753         return 1;
11754 }
11755
11756 /* nextchar()
11757
11758    Advances the parse position, and optionally absorbs
11759    "whitespace" from the inputstream.
11760
11761    Without /x "whitespace" means (?#...) style comments only,
11762    with /x this means (?#...) and # comments and whitespace proper.
11763
11764    Returns the RExC_parse point from BEFORE the scan occurs.
11765
11766    This is the /x friendly way of saying RExC_parse++.
11767 */
11768
11769 STATIC char*
11770 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11771 {
11772     char* const retval = RExC_parse++;
11773
11774     PERL_ARGS_ASSERT_NEXTCHAR;
11775
11776     for (;;) {
11777         if (RExC_end - RExC_parse >= 3
11778             && *RExC_parse == '('
11779             && RExC_parse[1] == '?'
11780             && RExC_parse[2] == '#')
11781         {
11782             while (*RExC_parse != ')') {
11783                 if (RExC_parse == RExC_end)
11784                     FAIL("Sequence (?#... not terminated");
11785                 RExC_parse++;
11786             }
11787             RExC_parse++;
11788             continue;
11789         }
11790         if (RExC_flags & RXf_PMf_EXTENDED) {
11791             if (isSPACE(*RExC_parse)) {
11792                 RExC_parse++;
11793                 continue;
11794             }
11795             else if (*RExC_parse == '#') {
11796                 if ( reg_skipcomment( pRExC_state ) )
11797                     continue;
11798             }
11799         }
11800         return retval;
11801     }
11802 }
11803
11804 /*
11805 - reg_node - emit a node
11806 */
11807 STATIC regnode *                        /* Location. */
11808 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11809 {
11810     dVAR;
11811     register regnode *ptr;
11812     regnode * const ret = RExC_emit;
11813     GET_RE_DEBUG_FLAGS_DECL;
11814
11815     PERL_ARGS_ASSERT_REG_NODE;
11816
11817     if (SIZE_ONLY) {
11818         SIZE_ALIGN(RExC_size);
11819         RExC_size += 1;
11820         return(ret);
11821     }
11822     if (RExC_emit >= RExC_emit_bound)
11823         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11824                    op, RExC_emit, RExC_emit_bound);
11825
11826     NODE_ALIGN_FILL(ret);
11827     ptr = ret;
11828     FILL_ADVANCE_NODE(ptr, op);
11829 #ifdef RE_TRACK_PATTERN_OFFSETS
11830     if (RExC_offsets) {         /* MJD */
11831         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11832               "reg_node", __LINE__,
11833               PL_reg_name[op],
11834               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11835                 ? "Overwriting end of array!\n" : "OK",
11836               (UV)(RExC_emit - RExC_emit_start),
11837               (UV)(RExC_parse - RExC_start),
11838               (UV)RExC_offsets[0]));
11839         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11840     }
11841 #endif
11842     RExC_emit = ptr;
11843     return(ret);
11844 }
11845
11846 /*
11847 - reganode - emit a node with an argument
11848 */
11849 STATIC regnode *                        /* Location. */
11850 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11851 {
11852     dVAR;
11853     register regnode *ptr;
11854     regnode * const ret = RExC_emit;
11855     GET_RE_DEBUG_FLAGS_DECL;
11856
11857     PERL_ARGS_ASSERT_REGANODE;
11858
11859     if (SIZE_ONLY) {
11860         SIZE_ALIGN(RExC_size);
11861         RExC_size += 2;
11862         /*
11863            We can't do this:
11864
11865            assert(2==regarglen[op]+1);
11866
11867            Anything larger than this has to allocate the extra amount.
11868            If we changed this to be:
11869
11870            RExC_size += (1 + regarglen[op]);
11871
11872            then it wouldn't matter. Its not clear what side effect
11873            might come from that so its not done so far.
11874            -- dmq
11875         */
11876         return(ret);
11877     }
11878     if (RExC_emit >= RExC_emit_bound)
11879         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11880                    op, RExC_emit, RExC_emit_bound);
11881
11882     NODE_ALIGN_FILL(ret);
11883     ptr = ret;
11884     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11885 #ifdef RE_TRACK_PATTERN_OFFSETS
11886     if (RExC_offsets) {         /* MJD */
11887         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11888               "reganode",
11889               __LINE__,
11890               PL_reg_name[op],
11891               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11892               "Overwriting end of array!\n" : "OK",
11893               (UV)(RExC_emit - RExC_emit_start),
11894               (UV)(RExC_parse - RExC_start),
11895               (UV)RExC_offsets[0]));
11896         Set_Cur_Node_Offset;
11897     }
11898 #endif
11899     RExC_emit = ptr;
11900     return(ret);
11901 }
11902
11903 /*
11904 - reguni - emit (if appropriate) a Unicode character
11905 */
11906 STATIC STRLEN
11907 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11908 {
11909     dVAR;
11910
11911     PERL_ARGS_ASSERT_REGUNI;
11912
11913     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11914 }
11915
11916 /*
11917 - reginsert - insert an operator in front of already-emitted operand
11918 *
11919 * Means relocating the operand.
11920 */
11921 STATIC void
11922 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11923 {
11924     dVAR;
11925     register regnode *src;
11926     register regnode *dst;
11927     register regnode *place;
11928     const int offset = regarglen[(U8)op];
11929     const int size = NODE_STEP_REGNODE + offset;
11930     GET_RE_DEBUG_FLAGS_DECL;
11931
11932     PERL_ARGS_ASSERT_REGINSERT;
11933     PERL_UNUSED_ARG(depth);
11934 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11935     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11936     if (SIZE_ONLY) {
11937         RExC_size += size;
11938         return;
11939     }
11940
11941     src = RExC_emit;
11942     RExC_emit += size;
11943     dst = RExC_emit;
11944     if (RExC_open_parens) {
11945         int paren;
11946         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11947         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11948             if ( RExC_open_parens[paren] >= opnd ) {
11949                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11950                 RExC_open_parens[paren] += size;
11951             } else {
11952                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11953             }
11954             if ( RExC_close_parens[paren] >= opnd ) {
11955                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11956                 RExC_close_parens[paren] += size;
11957             } else {
11958                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11959             }
11960         }
11961     }
11962
11963     while (src > opnd) {
11964         StructCopy(--src, --dst, regnode);
11965 #ifdef RE_TRACK_PATTERN_OFFSETS
11966         if (RExC_offsets) {     /* MJD 20010112 */
11967             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11968                   "reg_insert",
11969                   __LINE__,
11970                   PL_reg_name[op],
11971                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11972                     ? "Overwriting end of array!\n" : "OK",
11973                   (UV)(src - RExC_emit_start),
11974                   (UV)(dst - RExC_emit_start),
11975                   (UV)RExC_offsets[0]));
11976             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11977             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11978         }
11979 #endif
11980     }
11981
11982
11983     place = opnd;               /* Op node, where operand used to be. */
11984 #ifdef RE_TRACK_PATTERN_OFFSETS
11985     if (RExC_offsets) {         /* MJD */
11986         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11987               "reginsert",
11988               __LINE__,
11989               PL_reg_name[op],
11990               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11991               ? "Overwriting end of array!\n" : "OK",
11992               (UV)(place - RExC_emit_start),
11993               (UV)(RExC_parse - RExC_start),
11994               (UV)RExC_offsets[0]));
11995         Set_Node_Offset(place, RExC_parse);
11996         Set_Node_Length(place, 1);
11997     }
11998 #endif
11999     src = NEXTOPER(place);
12000     FILL_ADVANCE_NODE(place, op);
12001     Zero(src, offset, regnode);
12002 }
12003
12004 /*
12005 - regtail - set the next-pointer at the end of a node chain of p to val.
12006 - SEE ALSO: regtail_study
12007 */
12008 /* TODO: All three parms should be const */
12009 STATIC void
12010 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12011 {
12012     dVAR;
12013     register regnode *scan;
12014     GET_RE_DEBUG_FLAGS_DECL;
12015
12016     PERL_ARGS_ASSERT_REGTAIL;
12017 #ifndef DEBUGGING
12018     PERL_UNUSED_ARG(depth);
12019 #endif
12020
12021     if (SIZE_ONLY)
12022         return;
12023
12024     /* Find last node. */
12025     scan = p;
12026     for (;;) {
12027         regnode * const temp = regnext(scan);
12028         DEBUG_PARSE_r({
12029             SV * const mysv=sv_newmortal();
12030             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12031             regprop(RExC_rx, mysv, scan);
12032             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12033                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12034                     (temp == NULL ? "->" : ""),
12035                     (temp == NULL ? PL_reg_name[OP(val)] : "")
12036             );
12037         });
12038         if (temp == NULL)
12039             break;
12040         scan = temp;
12041     }
12042
12043     if (reg_off_by_arg[OP(scan)]) {
12044         ARG_SET(scan, val - scan);
12045     }
12046     else {
12047         NEXT_OFF(scan) = val - scan;
12048     }
12049 }
12050
12051 #ifdef DEBUGGING
12052 /*
12053 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12054 - Look for optimizable sequences at the same time.
12055 - currently only looks for EXACT chains.
12056
12057 This is experimental code. The idea is to use this routine to perform
12058 in place optimizations on branches and groups as they are constructed,
12059 with the long term intention of removing optimization from study_chunk so
12060 that it is purely analytical.
12061
12062 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12063 to control which is which.
12064
12065 */
12066 /* TODO: All four parms should be const */
12067
12068 STATIC U8
12069 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12070 {
12071     dVAR;
12072     register regnode *scan;
12073     U8 exact = PSEUDO;
12074 #ifdef EXPERIMENTAL_INPLACESCAN
12075     I32 min = 0;
12076 #endif
12077     GET_RE_DEBUG_FLAGS_DECL;
12078
12079     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12080
12081
12082     if (SIZE_ONLY)
12083         return exact;
12084
12085     /* Find last node. */
12086
12087     scan = p;
12088     for (;;) {
12089         regnode * const temp = regnext(scan);
12090 #ifdef EXPERIMENTAL_INPLACESCAN
12091         if (PL_regkind[OP(scan)] == EXACT) {
12092             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12093             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12094                 return EXACT;
12095         }
12096 #endif
12097         if ( exact ) {
12098             switch (OP(scan)) {
12099                 case EXACT:
12100                 case EXACTF:
12101                 case EXACTFA:
12102                 case EXACTFU:
12103                 case EXACTFU_SS:
12104                 case EXACTFU_TRICKYFOLD:
12105                 case EXACTFL:
12106                         if( exact == PSEUDO )
12107                             exact= OP(scan);
12108                         else if ( exact != OP(scan) )
12109                             exact= 0;
12110                 case NOTHING:
12111                     break;
12112                 default:
12113                     exact= 0;
12114             }
12115         }
12116         DEBUG_PARSE_r({
12117             SV * const mysv=sv_newmortal();
12118             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12119             regprop(RExC_rx, mysv, scan);
12120             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12121                 SvPV_nolen_const(mysv),
12122                 REG_NODE_NUM(scan),
12123                 PL_reg_name[exact]);
12124         });
12125         if (temp == NULL)
12126             break;
12127         scan = temp;
12128     }
12129     DEBUG_PARSE_r({
12130         SV * const mysv_val=sv_newmortal();
12131         DEBUG_PARSE_MSG("");
12132         regprop(RExC_rx, mysv_val, val);
12133         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12134                       SvPV_nolen_const(mysv_val),
12135                       (IV)REG_NODE_NUM(val),
12136                       (IV)(val - scan)
12137         );
12138     });
12139     if (reg_off_by_arg[OP(scan)]) {
12140         ARG_SET(scan, val - scan);
12141     }
12142     else {
12143         NEXT_OFF(scan) = val - scan;
12144     }
12145
12146     return exact;
12147 }
12148 #endif
12149
12150 /*
12151  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12152  */
12153 #ifdef DEBUGGING
12154 static void
12155 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12156 {
12157     int bit;
12158     int set=0;
12159     regex_charset cs;
12160
12161     for (bit=0; bit<32; bit++) {
12162         if (flags & (1<<bit)) {
12163             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12164                 continue;
12165             }
12166             if (!set++ && lead)
12167                 PerlIO_printf(Perl_debug_log, "%s",lead);
12168             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12169         }
12170     }
12171     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12172             if (!set++ && lead) {
12173                 PerlIO_printf(Perl_debug_log, "%s",lead);
12174             }
12175             switch (cs) {
12176                 case REGEX_UNICODE_CHARSET:
12177                     PerlIO_printf(Perl_debug_log, "UNICODE");
12178                     break;
12179                 case REGEX_LOCALE_CHARSET:
12180                     PerlIO_printf(Perl_debug_log, "LOCALE");
12181                     break;
12182                 case REGEX_ASCII_RESTRICTED_CHARSET:
12183                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12184                     break;
12185                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12186                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12187                     break;
12188                 default:
12189                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12190                     break;
12191             }
12192     }
12193     if (lead)  {
12194         if (set)
12195             PerlIO_printf(Perl_debug_log, "\n");
12196         else
12197             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12198     }
12199 }
12200 #endif
12201
12202 void
12203 Perl_regdump(pTHX_ const regexp *r)
12204 {
12205 #ifdef DEBUGGING
12206     dVAR;
12207     SV * const sv = sv_newmortal();
12208     SV *dsv= sv_newmortal();
12209     RXi_GET_DECL(r,ri);
12210     GET_RE_DEBUG_FLAGS_DECL;
12211
12212     PERL_ARGS_ASSERT_REGDUMP;
12213
12214     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12215
12216     /* Header fields of interest. */
12217     if (r->anchored_substr) {
12218         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12219             RE_SV_DUMPLEN(r->anchored_substr), 30);
12220         PerlIO_printf(Perl_debug_log,
12221                       "anchored %s%s at %"IVdf" ",
12222                       s, RE_SV_TAIL(r->anchored_substr),
12223                       (IV)r->anchored_offset);
12224     } else if (r->anchored_utf8) {
12225         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12226             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12227         PerlIO_printf(Perl_debug_log,
12228                       "anchored utf8 %s%s at %"IVdf" ",
12229                       s, RE_SV_TAIL(r->anchored_utf8),
12230                       (IV)r->anchored_offset);
12231     }
12232     if (r->float_substr) {
12233         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12234             RE_SV_DUMPLEN(r->float_substr), 30);
12235         PerlIO_printf(Perl_debug_log,
12236                       "floating %s%s at %"IVdf"..%"UVuf" ",
12237                       s, RE_SV_TAIL(r->float_substr),
12238                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12239     } else if (r->float_utf8) {
12240         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12241             RE_SV_DUMPLEN(r->float_utf8), 30);
12242         PerlIO_printf(Perl_debug_log,
12243                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12244                       s, RE_SV_TAIL(r->float_utf8),
12245                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12246     }
12247     if (r->check_substr || r->check_utf8)
12248         PerlIO_printf(Perl_debug_log,
12249                       (const char *)
12250                       (r->check_substr == r->float_substr
12251                        && r->check_utf8 == r->float_utf8
12252                        ? "(checking floating" : "(checking anchored"));
12253     if (r->extflags & RXf_NOSCAN)
12254         PerlIO_printf(Perl_debug_log, " noscan");
12255     if (r->extflags & RXf_CHECK_ALL)
12256         PerlIO_printf(Perl_debug_log, " isall");
12257     if (r->check_substr || r->check_utf8)
12258         PerlIO_printf(Perl_debug_log, ") ");
12259
12260     if (ri->regstclass) {
12261         regprop(r, sv, ri->regstclass);
12262         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12263     }
12264     if (r->extflags & RXf_ANCH) {
12265         PerlIO_printf(Perl_debug_log, "anchored");
12266         if (r->extflags & RXf_ANCH_BOL)
12267             PerlIO_printf(Perl_debug_log, "(BOL)");
12268         if (r->extflags & RXf_ANCH_MBOL)
12269             PerlIO_printf(Perl_debug_log, "(MBOL)");
12270         if (r->extflags & RXf_ANCH_SBOL)
12271             PerlIO_printf(Perl_debug_log, "(SBOL)");
12272         if (r->extflags & RXf_ANCH_GPOS)
12273             PerlIO_printf(Perl_debug_log, "(GPOS)");
12274         PerlIO_putc(Perl_debug_log, ' ');
12275     }
12276     if (r->extflags & RXf_GPOS_SEEN)
12277         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12278     if (r->intflags & PREGf_SKIP)
12279         PerlIO_printf(Perl_debug_log, "plus ");
12280     if (r->intflags & PREGf_IMPLICIT)
12281         PerlIO_printf(Perl_debug_log, "implicit ");
12282     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12283     if (r->extflags & RXf_EVAL_SEEN)
12284         PerlIO_printf(Perl_debug_log, "with eval ");
12285     PerlIO_printf(Perl_debug_log, "\n");
12286     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12287 #else
12288     PERL_ARGS_ASSERT_REGDUMP;
12289     PERL_UNUSED_CONTEXT;
12290     PERL_UNUSED_ARG(r);
12291 #endif  /* DEBUGGING */
12292 }
12293
12294 /*
12295 - regprop - printable representation of opcode
12296 */
12297 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12298 STMT_START { \
12299         if (do_sep) {                           \
12300             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12301             if (flags & ANYOF_INVERT)           \
12302                 /*make sure the invert info is in each */ \
12303                 sv_catpvs(sv, "^");             \
12304             do_sep = 0;                         \
12305         }                                       \
12306 } STMT_END
12307
12308 void
12309 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12310 {
12311 #ifdef DEBUGGING
12312     dVAR;
12313     register int k;
12314     RXi_GET_DECL(prog,progi);
12315     GET_RE_DEBUG_FLAGS_DECL;
12316
12317     PERL_ARGS_ASSERT_REGPROP;
12318
12319     sv_setpvs(sv, "");
12320
12321     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12322         /* It would be nice to FAIL() here, but this may be called from
12323            regexec.c, and it would be hard to supply pRExC_state. */
12324         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12325     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12326
12327     k = PL_regkind[OP(o)];
12328
12329     if (k == EXACT) {
12330         sv_catpvs(sv, " ");
12331         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12332          * is a crude hack but it may be the best for now since
12333          * we have no flag "this EXACTish node was UTF-8"
12334          * --jhi */
12335         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12336                   PERL_PV_ESCAPE_UNI_DETECT |
12337                   PERL_PV_ESCAPE_NONASCII   |
12338                   PERL_PV_PRETTY_ELLIPSES   |
12339                   PERL_PV_PRETTY_LTGT       |
12340                   PERL_PV_PRETTY_NOCLEAR
12341                   );
12342     } else if (k == TRIE) {
12343         /* print the details of the trie in dumpuntil instead, as
12344          * progi->data isn't available here */
12345         const char op = OP(o);
12346         const U32 n = ARG(o);
12347         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12348                (reg_ac_data *)progi->data->data[n] :
12349                NULL;
12350         const reg_trie_data * const trie
12351             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12352
12353         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12354         DEBUG_TRIE_COMPILE_r(
12355             Perl_sv_catpvf(aTHX_ sv,
12356                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12357                 (UV)trie->startstate,
12358                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12359                 (UV)trie->wordcount,
12360                 (UV)trie->minlen,
12361                 (UV)trie->maxlen,
12362                 (UV)TRIE_CHARCOUNT(trie),
12363                 (UV)trie->uniquecharcount
12364             )
12365         );
12366         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12367             int i;
12368             int rangestart = -1;
12369             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12370             sv_catpvs(sv, "[");
12371             for (i = 0; i <= 256; i++) {
12372                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12373                     if (rangestart == -1)
12374                         rangestart = i;
12375                 } else if (rangestart != -1) {
12376                     if (i <= rangestart + 3)
12377                         for (; rangestart < i; rangestart++)
12378                             put_byte(sv, rangestart);
12379                     else {
12380                         put_byte(sv, rangestart);
12381                         sv_catpvs(sv, "-");
12382                         put_byte(sv, i - 1);
12383                     }
12384                     rangestart = -1;
12385                 }
12386             }
12387             sv_catpvs(sv, "]");
12388         }
12389
12390     } else if (k == CURLY) {
12391         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12392             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12393         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12394     }
12395     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12396         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12397     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12398         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12399         if ( RXp_PAREN_NAMES(prog) ) {
12400             if ( k != REF || (OP(o) < NREF)) {
12401                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12402                 SV **name= av_fetch(list, ARG(o), 0 );
12403                 if (name)
12404                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12405             }
12406             else {
12407                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12408                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12409                 I32 *nums=(I32*)SvPVX(sv_dat);
12410                 SV **name= av_fetch(list, nums[0], 0 );
12411                 I32 n;
12412                 if (name) {
12413                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12414                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12415                                     (n ? "," : ""), (IV)nums[n]);
12416                     }
12417                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12418                 }
12419             }
12420         }
12421     } else if (k == GOSUB)
12422         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12423     else if (k == VERB) {
12424         if (!o->flags)
12425             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12426                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12427     } else if (k == LOGICAL)
12428         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12429     else if (k == ANYOF) {
12430         int i, rangestart = -1;
12431         const U8 flags = ANYOF_FLAGS(o);
12432         int do_sep = 0;
12433
12434         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12435         static const char * const anyofs[] = {
12436             "\\w",
12437             "\\W",
12438             "\\s",
12439             "\\S",
12440             "\\d",
12441             "\\D",
12442             "[:alnum:]",
12443             "[:^alnum:]",
12444             "[:alpha:]",
12445             "[:^alpha:]",
12446             "[:ascii:]",
12447             "[:^ascii:]",
12448             "[:cntrl:]",
12449             "[:^cntrl:]",
12450             "[:graph:]",
12451             "[:^graph:]",
12452             "[:lower:]",
12453             "[:^lower:]",
12454             "[:print:]",
12455             "[:^print:]",
12456             "[:punct:]",
12457             "[:^punct:]",
12458             "[:upper:]",
12459             "[:^upper:]",
12460             "[:xdigit:]",
12461             "[:^xdigit:]",
12462             "[:space:]",
12463             "[:^space:]",
12464             "[:blank:]",
12465             "[:^blank:]"
12466         };
12467
12468         if (flags & ANYOF_LOCALE)
12469             sv_catpvs(sv, "{loc}");
12470         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12471             sv_catpvs(sv, "{i}");
12472         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12473         if (flags & ANYOF_INVERT)
12474             sv_catpvs(sv, "^");
12475
12476         /* output what the standard cp 0-255 bitmap matches */
12477         for (i = 0; i <= 256; i++) {
12478             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12479                 if (rangestart == -1)
12480                     rangestart = i;
12481             } else if (rangestart != -1) {
12482                 if (i <= rangestart + 3)
12483                     for (; rangestart < i; rangestart++)
12484                         put_byte(sv, rangestart);
12485                 else {
12486                     put_byte(sv, rangestart);
12487                     sv_catpvs(sv, "-");
12488                     put_byte(sv, i - 1);
12489                 }
12490                 do_sep = 1;
12491                 rangestart = -1;
12492             }
12493         }
12494
12495         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12496         /* output any special charclass tests (used entirely under use locale) */
12497         if (ANYOF_CLASS_TEST_ANY_SET(o))
12498             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12499                 if (ANYOF_CLASS_TEST(o,i)) {
12500                     sv_catpv(sv, anyofs[i]);
12501                     do_sep = 1;
12502                 }
12503
12504         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12505
12506         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12507             sv_catpvs(sv, "{non-utf8-latin1-all}");
12508         }
12509
12510         /* output information about the unicode matching */
12511         if (flags & ANYOF_UNICODE_ALL)
12512             sv_catpvs(sv, "{unicode_all}");
12513         else if (ANYOF_NONBITMAP(o))
12514             sv_catpvs(sv, "{unicode}");
12515         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12516             sv_catpvs(sv, "{outside bitmap}");
12517
12518         if (ANYOF_NONBITMAP(o)) {
12519             SV *lv; /* Set if there is something outside the bit map */
12520             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12521             bool byte_output = FALSE;   /* If something in the bitmap has been
12522                                            output */
12523
12524             if (lv && lv != &PL_sv_undef) {
12525                 if (sw) {
12526                     U8 s[UTF8_MAXBYTES_CASE+1];
12527
12528                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12529                         uvchr_to_utf8(s, i);
12530
12531                         if (i < 256
12532                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12533                                                                things already
12534                                                                output as part
12535                                                                of the bitmap */
12536                             && swash_fetch(sw, s, TRUE))
12537                         {
12538                             if (rangestart == -1)
12539                                 rangestart = i;
12540                         } else if (rangestart != -1) {
12541                             byte_output = TRUE;
12542                             if (i <= rangestart + 3)
12543                                 for (; rangestart < i; rangestart++) {
12544                                     put_byte(sv, rangestart);
12545                                 }
12546                             else {
12547                                 put_byte(sv, rangestart);
12548                                 sv_catpvs(sv, "-");
12549                                 put_byte(sv, i-1);
12550                             }
12551                             rangestart = -1;
12552                         }
12553                     }
12554                 }
12555
12556                 {
12557                     char *s = savesvpv(lv);
12558                     char * const origs = s;
12559
12560                     while (*s && *s != '\n')
12561                         s++;
12562
12563                     if (*s == '\n') {
12564                         const char * const t = ++s;
12565
12566                         if (byte_output) {
12567                             sv_catpvs(sv, " ");
12568                         }
12569
12570                         while (*s) {
12571                             if (*s == '\n') {
12572
12573                                 /* Truncate very long output */
12574                                 if (s - origs > 256) {
12575                                     Perl_sv_catpvf(aTHX_ sv,
12576                                                    "%.*s...",
12577                                                    (int) (s - origs - 1),
12578                                                    t);
12579                                     goto out_dump;
12580                                 }
12581                                 *s = ' ';
12582                             }
12583                             else if (*s == '\t') {
12584                                 *s = '-';
12585                             }
12586                             s++;
12587                         }
12588                         if (s[-1] == ' ')
12589                             s[-1] = 0;
12590
12591                         sv_catpv(sv, t);
12592                     }
12593
12594                 out_dump:
12595
12596                     Safefree(origs);
12597                 }
12598                 SvREFCNT_dec(lv);
12599             }
12600         }
12601
12602         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12603     }
12604     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12605         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12606 #else
12607     PERL_UNUSED_CONTEXT;
12608     PERL_UNUSED_ARG(sv);
12609     PERL_UNUSED_ARG(o);
12610     PERL_UNUSED_ARG(prog);
12611 #endif  /* DEBUGGING */
12612 }
12613
12614 SV *
12615 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12616 {                               /* Assume that RE_INTUIT is set */
12617     dVAR;
12618     struct regexp *const prog = (struct regexp *)SvANY(r);
12619     GET_RE_DEBUG_FLAGS_DECL;
12620
12621     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12622     PERL_UNUSED_CONTEXT;
12623
12624     DEBUG_COMPILE_r(
12625         {
12626             const char * const s = SvPV_nolen_const(prog->check_substr
12627                       ? prog->check_substr : prog->check_utf8);
12628
12629             if (!PL_colorset) reginitcolors();
12630             PerlIO_printf(Perl_debug_log,
12631                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12632                       PL_colors[4],
12633                       prog->check_substr ? "" : "utf8 ",
12634                       PL_colors[5],PL_colors[0],
12635                       s,
12636                       PL_colors[1],
12637                       (strlen(s) > 60 ? "..." : ""));
12638         } );
12639
12640     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12641 }
12642
12643 /*
12644    pregfree()
12645
12646    handles refcounting and freeing the perl core regexp structure. When
12647    it is necessary to actually free the structure the first thing it
12648    does is call the 'free' method of the regexp_engine associated to
12649    the regexp, allowing the handling of the void *pprivate; member
12650    first. (This routine is not overridable by extensions, which is why
12651    the extensions free is called first.)
12652
12653    See regdupe and regdupe_internal if you change anything here.
12654 */
12655 #ifndef PERL_IN_XSUB_RE
12656 void
12657 Perl_pregfree(pTHX_ REGEXP *r)
12658 {
12659     SvREFCNT_dec(r);
12660 }
12661
12662 void
12663 Perl_pregfree2(pTHX_ REGEXP *rx)
12664 {
12665     dVAR;
12666     struct regexp *const r = (struct regexp *)SvANY(rx);
12667     GET_RE_DEBUG_FLAGS_DECL;
12668
12669     PERL_ARGS_ASSERT_PREGFREE2;
12670
12671     if (r->mother_re) {
12672         ReREFCNT_dec(r->mother_re);
12673     } else {
12674         CALLREGFREE_PVT(rx); /* free the private data */
12675         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12676     }
12677     if (r->substrs) {
12678         SvREFCNT_dec(r->anchored_substr);
12679         SvREFCNT_dec(r->anchored_utf8);
12680         SvREFCNT_dec(r->float_substr);
12681         SvREFCNT_dec(r->float_utf8);
12682         Safefree(r->substrs);
12683     }
12684     RX_MATCH_COPY_FREE(rx);
12685 #ifdef PERL_OLD_COPY_ON_WRITE
12686     SvREFCNT_dec(r->saved_copy);
12687 #endif
12688     Safefree(r->offs);
12689 }
12690
12691 /*  reg_temp_copy()
12692
12693     This is a hacky workaround to the structural issue of match results
12694     being stored in the regexp structure which is in turn stored in
12695     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12696     could be PL_curpm in multiple contexts, and could require multiple
12697     result sets being associated with the pattern simultaneously, such
12698     as when doing a recursive match with (??{$qr})
12699
12700     The solution is to make a lightweight copy of the regexp structure
12701     when a qr// is returned from the code executed by (??{$qr}) this
12702     lightweight copy doesn't actually own any of its data except for
12703     the starp/end and the actual regexp structure itself.
12704
12705 */
12706
12707
12708 REGEXP *
12709 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12710 {
12711     struct regexp *ret;
12712     struct regexp *const r = (struct regexp *)SvANY(rx);
12713     register const I32 npar = r->nparens+1;
12714
12715     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12716
12717     if (!ret_x)
12718         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12719     ret = (struct regexp *)SvANY(ret_x);
12720
12721     (void)ReREFCNT_inc(rx);
12722     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12723        by pointing directly at the buffer, but flagging that the allocated
12724        space in the copy is zero. As we've just done a struct copy, it's now
12725        a case of zero-ing that, rather than copying the current length.  */
12726     SvPV_set(ret_x, RX_WRAPPED(rx));
12727     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12728     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12729            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12730     SvLEN_set(ret_x, 0);
12731     SvSTASH_set(ret_x, NULL);
12732     SvMAGIC_set(ret_x, NULL);
12733     Newx(ret->offs, npar, regexp_paren_pair);
12734     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12735     if (r->substrs) {
12736         Newx(ret->substrs, 1, struct reg_substr_data);
12737         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12738
12739         SvREFCNT_inc_void(ret->anchored_substr);
12740         SvREFCNT_inc_void(ret->anchored_utf8);
12741         SvREFCNT_inc_void(ret->float_substr);
12742         SvREFCNT_inc_void(ret->float_utf8);
12743
12744         /* check_substr and check_utf8, if non-NULL, point to either their
12745            anchored or float namesakes, and don't hold a second reference.  */
12746     }
12747     RX_MATCH_COPIED_off(ret_x);
12748 #ifdef PERL_OLD_COPY_ON_WRITE
12749     ret->saved_copy = NULL;
12750 #endif
12751     ret->mother_re = rx;
12752
12753     return ret_x;
12754 }
12755 #endif
12756
12757 /* regfree_internal()
12758
12759    Free the private data in a regexp. This is overloadable by
12760    extensions. Perl takes care of the regexp structure in pregfree(),
12761    this covers the *pprivate pointer which technically perl doesn't
12762    know about, however of course we have to handle the
12763    regexp_internal structure when no extension is in use.
12764
12765    Note this is called before freeing anything in the regexp
12766    structure.
12767  */
12768
12769 void
12770 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12771 {
12772     dVAR;
12773     struct regexp *const r = (struct regexp *)SvANY(rx);
12774     RXi_GET_DECL(r,ri);
12775     GET_RE_DEBUG_FLAGS_DECL;
12776
12777     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12778
12779     DEBUG_COMPILE_r({
12780         if (!PL_colorset)
12781             reginitcolors();
12782         {
12783             SV *dsv= sv_newmortal();
12784             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12785                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12786             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12787                 PL_colors[4],PL_colors[5],s);
12788         }
12789     });
12790 #ifdef RE_TRACK_PATTERN_OFFSETS
12791     if (ri->u.offsets)
12792         Safefree(ri->u.offsets);             /* 20010421 MJD */
12793 #endif
12794     if (ri->data) {
12795         int n = ri->data->count;
12796         PAD* new_comppad = NULL;
12797         PAD* old_comppad;
12798         PADOFFSET refcnt;
12799
12800         while (--n >= 0) {
12801           /* If you add a ->what type here, update the comment in regcomp.h */
12802             switch (ri->data->what[n]) {
12803             case 'a':
12804             case 's':
12805             case 'S':
12806             case 'u':
12807                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12808                 break;
12809             case 'f':
12810                 Safefree(ri->data->data[n]);
12811                 break;
12812             case 'p':
12813                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12814                 break;
12815             case 'o':
12816                 if (new_comppad == NULL)
12817                     Perl_croak(aTHX_ "panic: pregfree comppad");
12818                 PAD_SAVE_LOCAL(old_comppad,
12819                     /* Watch out for global destruction's random ordering. */
12820                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12821                 );
12822                 OP_REFCNT_LOCK;
12823                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12824                 OP_REFCNT_UNLOCK;
12825                 if (!refcnt)
12826                     op_free((OP_4tree*)ri->data->data[n]);
12827
12828                 PAD_RESTORE_LOCAL(old_comppad);
12829                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12830                 new_comppad = NULL;
12831                 break;
12832             case 'n':
12833                 break;
12834             case 'T':
12835                 { /* Aho Corasick add-on structure for a trie node.
12836                      Used in stclass optimization only */
12837                     U32 refcount;
12838                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12839                     OP_REFCNT_LOCK;
12840                     refcount = --aho->refcount;
12841                     OP_REFCNT_UNLOCK;
12842                     if ( !refcount ) {
12843                         PerlMemShared_free(aho->states);
12844                         PerlMemShared_free(aho->fail);
12845                          /* do this last!!!! */
12846                         PerlMemShared_free(ri->data->data[n]);
12847                         PerlMemShared_free(ri->regstclass);
12848                     }
12849                 }
12850                 break;
12851             case 't':
12852                 {
12853                     /* trie structure. */
12854                     U32 refcount;
12855                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12856                     OP_REFCNT_LOCK;
12857                     refcount = --trie->refcount;
12858                     OP_REFCNT_UNLOCK;
12859                     if ( !refcount ) {
12860                         PerlMemShared_free(trie->charmap);
12861                         PerlMemShared_free(trie->states);
12862                         PerlMemShared_free(trie->trans);
12863                         if (trie->bitmap)
12864                             PerlMemShared_free(trie->bitmap);
12865                         if (trie->jump)
12866                             PerlMemShared_free(trie->jump);
12867                         PerlMemShared_free(trie->wordinfo);
12868                         /* do this last!!!! */
12869                         PerlMemShared_free(ri->data->data[n]);
12870                     }
12871                 }
12872                 break;
12873             default:
12874                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12875             }
12876         }
12877         Safefree(ri->data->what);
12878         Safefree(ri->data);
12879     }
12880
12881     Safefree(ri);
12882 }
12883
12884 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12885 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12886 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
12887
12888 /*
12889    re_dup - duplicate a regexp.
12890
12891    This routine is expected to clone a given regexp structure. It is only
12892    compiled under USE_ITHREADS.
12893
12894    After all of the core data stored in struct regexp is duplicated
12895    the regexp_engine.dupe method is used to copy any private data
12896    stored in the *pprivate pointer. This allows extensions to handle
12897    any duplication it needs to do.
12898
12899    See pregfree() and regfree_internal() if you change anything here.
12900 */
12901 #if defined(USE_ITHREADS)
12902 #ifndef PERL_IN_XSUB_RE
12903 void
12904 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12905 {
12906     dVAR;
12907     I32 npar;
12908     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12909     struct regexp *ret = (struct regexp *)SvANY(dstr);
12910
12911     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12912
12913     npar = r->nparens+1;
12914     Newx(ret->offs, npar, regexp_paren_pair);
12915     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12916     if(ret->swap) {
12917         /* no need to copy these */
12918         Newx(ret->swap, npar, regexp_paren_pair);
12919     }
12920
12921     if (ret->substrs) {
12922         /* Do it this way to avoid reading from *r after the StructCopy().
12923            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12924            cache, it doesn't matter.  */
12925         const bool anchored = r->check_substr
12926             ? r->check_substr == r->anchored_substr
12927             : r->check_utf8 == r->anchored_utf8;
12928         Newx(ret->substrs, 1, struct reg_substr_data);
12929         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12930
12931         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12932         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12933         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12934         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12935
12936         /* check_substr and check_utf8, if non-NULL, point to either their
12937            anchored or float namesakes, and don't hold a second reference.  */
12938
12939         if (ret->check_substr) {
12940             if (anchored) {
12941                 assert(r->check_utf8 == r->anchored_utf8);
12942                 ret->check_substr = ret->anchored_substr;
12943                 ret->check_utf8 = ret->anchored_utf8;
12944             } else {
12945                 assert(r->check_substr == r->float_substr);
12946                 assert(r->check_utf8 == r->float_utf8);
12947                 ret->check_substr = ret->float_substr;
12948                 ret->check_utf8 = ret->float_utf8;
12949             }
12950         } else if (ret->check_utf8) {
12951             if (anchored) {
12952                 ret->check_utf8 = ret->anchored_utf8;
12953             } else {
12954                 ret->check_utf8 = ret->float_utf8;
12955             }
12956         }
12957     }
12958
12959     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12960
12961     if (ret->pprivate)
12962         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12963
12964     if (RX_MATCH_COPIED(dstr))
12965         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12966     else
12967         ret->subbeg = NULL;
12968 #ifdef PERL_OLD_COPY_ON_WRITE
12969     ret->saved_copy = NULL;
12970 #endif
12971
12972     if (ret->mother_re) {
12973         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12974             /* Our storage points directly to our mother regexp, but that's
12975                1: a buffer in a different thread
12976                2: something we no longer hold a reference on
12977                so we need to copy it locally.  */
12978             /* Note we need to use SvCUR(), rather than
12979                SvLEN(), on our mother_re, because it, in
12980                turn, may well be pointing to its own mother_re.  */
12981             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12982                                    SvCUR(ret->mother_re)+1));
12983             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12984         }
12985         ret->mother_re      = NULL;
12986     }
12987     ret->gofs = 0;
12988 }
12989 #endif /* PERL_IN_XSUB_RE */
12990
12991 /*
12992    regdupe_internal()
12993
12994    This is the internal complement to regdupe() which is used to copy
12995    the structure pointed to by the *pprivate pointer in the regexp.
12996    This is the core version of the extension overridable cloning hook.
12997    The regexp structure being duplicated will be copied by perl prior
12998    to this and will be provided as the regexp *r argument, however
12999    with the /old/ structures pprivate pointer value. Thus this routine
13000    may override any copying normally done by perl.
13001
13002    It returns a pointer to the new regexp_internal structure.
13003 */
13004
13005 void *
13006 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13007 {
13008     dVAR;
13009     struct regexp *const r = (struct regexp *)SvANY(rx);
13010     regexp_internal *reti;
13011     int len;
13012     RXi_GET_DECL(r,ri);
13013
13014     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13015
13016     len = ProgLen(ri);
13017
13018     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13019     Copy(ri->program, reti->program, len+1, regnode);
13020
13021
13022     reti->regstclass = NULL;
13023
13024     if (ri->data) {
13025         struct reg_data *d;
13026         const int count = ri->data->count;
13027         int i;
13028
13029         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13030                 char, struct reg_data);
13031         Newx(d->what, count, U8);
13032
13033         d->count = count;
13034         for (i = 0; i < count; i++) {
13035             d->what[i] = ri->data->what[i];
13036             switch (d->what[i]) {
13037                 /* legal options are one of: sSfpontTua
13038                    see also regcomp.h and pregfree() */
13039             case 'a': /* actually an AV, but the dup function is identical.  */
13040             case 's':
13041             case 'S':
13042             case 'p': /* actually an AV, but the dup function is identical.  */
13043             case 'u': /* actually an HV, but the dup function is identical.  */
13044                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13045                 break;
13046             case 'f':
13047                 /* This is cheating. */
13048                 Newx(d->data[i], 1, struct regnode_charclass_class);
13049                 StructCopy(ri->data->data[i], d->data[i],
13050                             struct regnode_charclass_class);
13051                 reti->regstclass = (regnode*)d->data[i];
13052                 break;
13053             case 'o':
13054                 /* Compiled op trees are readonly and in shared memory,
13055                    and can thus be shared without duplication. */
13056                 OP_REFCNT_LOCK;
13057                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13058                 OP_REFCNT_UNLOCK;
13059                 break;
13060             case 'T':
13061                 /* Trie stclasses are readonly and can thus be shared
13062                  * without duplication. We free the stclass in pregfree
13063                  * when the corresponding reg_ac_data struct is freed.
13064                  */
13065                 reti->regstclass= ri->regstclass;
13066                 /* Fall through */
13067             case 't':
13068                 OP_REFCNT_LOCK;
13069                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13070                 OP_REFCNT_UNLOCK;
13071                 /* Fall through */
13072             case 'n':
13073                 d->data[i] = ri->data->data[i];
13074                 break;
13075             default:
13076                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13077             }
13078         }
13079
13080         reti->data = d;
13081     }
13082     else
13083         reti->data = NULL;
13084
13085     reti->name_list_idx = ri->name_list_idx;
13086
13087 #ifdef RE_TRACK_PATTERN_OFFSETS
13088     if (ri->u.offsets) {
13089         Newx(reti->u.offsets, 2*len+1, U32);
13090         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13091     }
13092 #else
13093     SetProgLen(reti,len);
13094 #endif
13095
13096     return (void*)reti;
13097 }
13098
13099 #endif    /* USE_ITHREADS */
13100
13101 #ifndef PERL_IN_XSUB_RE
13102
13103 /*
13104  - regnext - dig the "next" pointer out of a node
13105  */
13106 regnode *
13107 Perl_regnext(pTHX_ register regnode *p)
13108 {
13109     dVAR;
13110     register I32 offset;
13111
13112     if (!p)
13113         return(NULL);
13114
13115     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13116         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13117     }
13118
13119     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13120     if (offset == 0)
13121         return(NULL);
13122
13123     return(p+offset);
13124 }
13125 #endif
13126
13127 STATIC void
13128 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13129 {
13130     va_list args;
13131     STRLEN l1 = strlen(pat1);
13132     STRLEN l2 = strlen(pat2);
13133     char buf[512];
13134     SV *msv;
13135     const char *message;
13136
13137     PERL_ARGS_ASSERT_RE_CROAK2;
13138
13139     if (l1 > 510)
13140         l1 = 510;
13141     if (l1 + l2 > 510)
13142         l2 = 510 - l1;
13143     Copy(pat1, buf, l1 , char);
13144     Copy(pat2, buf + l1, l2 , char);
13145     buf[l1 + l2] = '\n';
13146     buf[l1 + l2 + 1] = '\0';
13147 #ifdef I_STDARG
13148     /* ANSI variant takes additional second argument */
13149     va_start(args, pat2);
13150 #else
13151     va_start(args);
13152 #endif
13153     msv = vmess(buf, &args);
13154     va_end(args);
13155     message = SvPV_const(msv,l1);
13156     if (l1 > 512)
13157         l1 = 512;
13158     Copy(message, buf, l1 , char);
13159     buf[l1-1] = '\0';                   /* Overwrite \n */
13160     Perl_croak(aTHX_ "%s", buf);
13161 }
13162
13163 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13164
13165 #ifndef PERL_IN_XSUB_RE
13166 void
13167 Perl_save_re_context(pTHX)
13168 {
13169     dVAR;
13170
13171     struct re_save_state *state;
13172
13173     SAVEVPTR(PL_curcop);
13174     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13175
13176     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13177     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13178     SSPUSHUV(SAVEt_RE_STATE);
13179
13180     Copy(&PL_reg_state, state, 1, struct re_save_state);
13181
13182     PL_reg_start_tmp = 0;
13183     PL_reg_start_tmpl = 0;
13184     PL_reg_oldsaved = NULL;
13185     PL_reg_oldsavedlen = 0;
13186     PL_reg_maxiter = 0;
13187     PL_reg_leftiter = 0;
13188     PL_reg_poscache = NULL;
13189     PL_reg_poscache_size = 0;
13190 #ifdef PERL_OLD_COPY_ON_WRITE
13191     PL_nrs = NULL;
13192 #endif
13193
13194     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13195     if (PL_curpm) {
13196         const REGEXP * const rx = PM_GETRE(PL_curpm);
13197         if (rx) {
13198             U32 i;
13199             for (i = 1; i <= RX_NPARENS(rx); i++) {
13200                 char digits[TYPE_CHARS(long)];
13201                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13202                 GV *const *const gvp
13203                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13204
13205                 if (gvp) {
13206                     GV * const gv = *gvp;
13207                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13208                         save_scalar(gv);
13209                 }
13210             }
13211         }
13212     }
13213 }
13214 #endif
13215
13216 static void
13217 clear_re(pTHX_ void *r)
13218 {
13219     dVAR;
13220     ReREFCNT_dec((REGEXP *)r);
13221 }
13222
13223 #ifdef DEBUGGING
13224
13225 STATIC void
13226 S_put_byte(pTHX_ SV *sv, int c)
13227 {
13228     PERL_ARGS_ASSERT_PUT_BYTE;
13229
13230     /* Our definition of isPRINT() ignores locales, so only bytes that are
13231        not part of UTF-8 are considered printable. I assume that the same
13232        holds for UTF-EBCDIC.
13233        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13234        which Wikipedia says:
13235
13236        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13237        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13238        identical, to the ASCII delete (DEL) or rubout control character.
13239        ) So the old condition can be simplified to !isPRINT(c)  */
13240     if (!isPRINT(c)) {
13241         if (c < 256) {
13242             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13243         }
13244         else {
13245             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13246         }
13247     }
13248     else {
13249         const char string = c;
13250         if (c == '-' || c == ']' || c == '\\' || c == '^')
13251             sv_catpvs(sv, "\\");
13252         sv_catpvn(sv, &string, 1);
13253     }
13254 }
13255
13256
13257 #define CLEAR_OPTSTART \
13258     if (optstart) STMT_START { \
13259             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13260             optstart=NULL; \
13261     } STMT_END
13262
13263 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13264
13265 STATIC const regnode *
13266 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13267             const regnode *last, const regnode *plast,
13268             SV* sv, I32 indent, U32 depth)
13269 {
13270     dVAR;
13271     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13272     register const regnode *next;
13273     const regnode *optstart= NULL;
13274
13275     RXi_GET_DECL(r,ri);
13276     GET_RE_DEBUG_FLAGS_DECL;
13277
13278     PERL_ARGS_ASSERT_DUMPUNTIL;
13279
13280 #ifdef DEBUG_DUMPUNTIL
13281     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13282         last ? last-start : 0,plast ? plast-start : 0);
13283 #endif
13284
13285     if (plast && plast < last)
13286         last= plast;
13287
13288     while (PL_regkind[op] != END && (!last || node < last)) {
13289         /* While that wasn't END last time... */
13290         NODE_ALIGN(node);
13291         op = OP(node);
13292         if (op == CLOSE || op == WHILEM)
13293             indent--;
13294         next = regnext((regnode *)node);
13295
13296         /* Where, what. */
13297         if (OP(node) == OPTIMIZED) {
13298             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13299                 optstart = node;
13300             else
13301                 goto after_print;
13302         } else
13303             CLEAR_OPTSTART;
13304
13305         regprop(r, sv, node);
13306         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13307                       (int)(2*indent + 1), "", SvPVX_const(sv));
13308
13309         if (OP(node) != OPTIMIZED) {
13310             if (next == NULL)           /* Next ptr. */
13311                 PerlIO_printf(Perl_debug_log, " (0)");
13312             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13313                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13314             else
13315                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13316             (void)PerlIO_putc(Perl_debug_log, '\n');
13317         }
13318
13319       after_print:
13320         if (PL_regkind[(U8)op] == BRANCHJ) {
13321             assert(next);
13322             {
13323                 register const regnode *nnode = (OP(next) == LONGJMP
13324                                              ? regnext((regnode *)next)
13325                                              : next);
13326                 if (last && nnode > last)
13327                     nnode = last;
13328                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13329             }
13330         }
13331         else if (PL_regkind[(U8)op] == BRANCH) {
13332             assert(next);
13333             DUMPUNTIL(NEXTOPER(node), next);
13334         }
13335         else if ( PL_regkind[(U8)op]  == TRIE ) {
13336             const regnode *this_trie = node;
13337             const char op = OP(node);
13338             const U32 n = ARG(node);
13339             const reg_ac_data * const ac = op>=AHOCORASICK ?
13340                (reg_ac_data *)ri->data->data[n] :
13341                NULL;
13342             const reg_trie_data * const trie =
13343                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13344 #ifdef DEBUGGING
13345             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13346 #endif
13347             const regnode *nextbranch= NULL;
13348             I32 word_idx;
13349             sv_setpvs(sv, "");
13350             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13351                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13352
13353                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13354                    (int)(2*(indent+3)), "",
13355                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13356                             PL_colors[0], PL_colors[1],
13357                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13358                             PERL_PV_PRETTY_ELLIPSES    |
13359                             PERL_PV_PRETTY_LTGT
13360                             )
13361                             : "???"
13362                 );
13363                 if (trie->jump) {
13364                     U16 dist= trie->jump[word_idx+1];
13365                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13366                                   (UV)((dist ? this_trie + dist : next) - start));
13367                     if (dist) {
13368                         if (!nextbranch)
13369                             nextbranch= this_trie + trie->jump[0];
13370                         DUMPUNTIL(this_trie + dist, nextbranch);
13371                     }
13372                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13373                         nextbranch= regnext((regnode *)nextbranch);
13374                 } else {
13375                     PerlIO_printf(Perl_debug_log, "\n");
13376                 }
13377             }
13378             if (last && next > last)
13379                 node= last;
13380             else
13381                 node= next;
13382         }
13383         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13384             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13385                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13386         }
13387         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13388             assert(next);
13389             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13390         }
13391         else if ( op == PLUS || op == STAR) {
13392             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13393         }
13394         else if (PL_regkind[(U8)op] == ANYOF) {
13395             /* arglen 1 + class block */
13396             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13397                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13398             node = NEXTOPER(node);
13399         }
13400         else if (PL_regkind[(U8)op] == EXACT) {
13401             /* Literal string, where present. */
13402             node += NODE_SZ_STR(node) - 1;
13403             node = NEXTOPER(node);
13404         }
13405         else {
13406             node = NEXTOPER(node);
13407             node += regarglen[(U8)op];
13408         }
13409         if (op == CURLYX || op == OPEN)
13410             indent++;
13411     }
13412     CLEAR_OPTSTART;
13413 #ifdef DEBUG_DUMPUNTIL
13414     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13415 #endif
13416     return node;
13417 }
13418
13419 #endif  /* DEBUGGING */
13420
13421 /*
13422  * Local variables:
13423  * c-indentation-style: bsd
13424  * c-basic-offset: 4
13425  * indent-tabs-mode: t
13426  * End:
13427  *
13428  * ex: set ts=8 sts=4 sw=4 noet:
13429  */