This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 #  ifndef PERL_IN_XSUB_RE
35 #    define PERL_IN_XSUB_RE
36 #  endif
37 /* need access to debugger hooks */
38 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 #    define DEBUGGING
40 #  endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 #  define Perl_pregcomp my_regcomp
46 #  define Perl_regdump my_regdump
47 #  define Perl_regprop my_regprop
48 #  define Perl_pregfree my_regfree
49 #  define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 #  define Perl_regnext my_regnext
52 #  define Perl_save_re_context my_save_re_context
53 #  define Perl_reginitcolors my_reginitcolors
54
55 #  define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*
59  * pregcomp and pregexec -- regsub and regerror are not used in perl
60  *
61  *      Copyright (c) 1986 by University of Toronto.
62  *      Written by Henry Spencer.  Not derived from licensed software.
63  *
64  *      Permission is granted to anyone to use this software for any
65  *      purpose on any computer system, and to redistribute it freely,
66  *      subject to the following restrictions:
67  *
68  *      1. The author is not responsible for the consequences of use of
69  *              this software, no matter how awful, even if they arise
70  *              from defects in it.
71  *
72  *      2. The origin of this software must not be misrepresented, either
73  *              by explicit claim or by omission.
74  *
75  *      3. Altered versions must be plainly marked as such, and must not
76  *              be misrepresented as being the original software.
77  *
78  *
79  ****    Alterations to Henry's code are...
80  ****
81  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83  ****
84  ****    You may distribute under the terms of either the GNU General Public
85  ****    License or the Artistic License, as specified in the README file.
86
87  *
88  * Beware that some of this code is subtly aware of the way operator
89  * precedence is structured in regular expressions.  Serious changes in
90  * regular-expression syntax might require a total rethink.
91  */
92 #include "EXTERN.h"
93 #define PERL_IN_REGCOMP_C
94 #include "perl.h"
95
96 #ifndef PERL_IN_XSUB_RE
97 #  include "INTERN.h"
98 #endif
99
100 #define REG_COMP_C
101 #include "regcomp.h"
102
103 #ifdef op
104 #undef op
105 #endif /* op */
106
107 #ifdef MSDOS
108 #  if defined(BUGGY_MSC6)
109  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 #    pragma optimize("a",off)
111  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 #    pragma optimize("w",on )
113 #  endif /* BUGGY_MSC6 */
114 #endif /* MSDOS */
115
116 #ifndef STATIC
117 #define STATIC  static
118 #endif
119
120 typedef struct RExC_state_t {
121     U32         flags;                  /* are we folding, multilining? */
122     char        *precomp;               /* uncompiled string. */
123     regexp      *rx;
124     char        *start;                 /* Start of input for compile */
125     char        *end;                   /* End of input for compile */
126     char        *parse;                 /* Input-scan pointer. */
127     I32         whilem_seen;            /* number of WHILEM in this expr */
128     regnode     *emit_start;            /* Start of emitted-code area */
129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
130     I32         naughty;                /* How bad is this pattern? */
131     I32         sawback;                /* Did we see \1, ...? */
132     U32         seen;
133     I32         size;                   /* Code size. */
134     I32         npar;                   /* () count. */
135     I32         extralen;
136     I32         seen_zerolen;
137     I32         seen_evals;
138     I32         utf8;
139 #if ADD_TO_REGEXEC
140     char        *starttry;              /* -Dr: where regtry was called. */
141 #define RExC_starttry   (pRExC_state->starttry)
142 #endif
143 } RExC_state_t;
144
145 #define RExC_flags      (pRExC_state->flags)
146 #define RExC_precomp    (pRExC_state->precomp)
147 #define RExC_rx         (pRExC_state->rx)
148 #define RExC_start      (pRExC_state->start)
149 #define RExC_end        (pRExC_state->end)
150 #define RExC_parse      (pRExC_state->parse)
151 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
152 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit       (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty    (pRExC_state->naughty)
156 #define RExC_sawback    (pRExC_state->sawback)
157 #define RExC_seen       (pRExC_state->seen)
158 #define RExC_size       (pRExC_state->size)
159 #define RExC_npar       (pRExC_state->npar)
160 #define RExC_extralen   (pRExC_state->extralen)
161 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8       (pRExC_state->utf8)
164
165 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167         ((*s) == '{' && regcurly(s)))
168
169 #ifdef SPSTART
170 #undef SPSTART          /* dratted cpp namespace... */
171 #endif
172 /*
173  * Flags to be passed up and down.
174  */
175 #define WORST           0       /* Worst case. */
176 #define HASWIDTH        0x1     /* Known to match non-null strings. */
177 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART         0x4     /* Starts with * or +. */
179 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
180
181 /* Length of a variant. */
182
183 typedef struct scan_data_t {
184     I32 len_min;
185     I32 len_delta;
186     I32 pos_min;
187     I32 pos_delta;
188     SV *last_found;
189     I32 last_end;                       /* min value, <0 unless valid. */
190     I32 last_start_min;
191     I32 last_start_max;
192     SV **longest;                       /* Either &l_fixed, or &l_float. */
193     SV *longest_fixed;
194     I32 offset_fixed;
195     SV *longest_float;
196     I32 offset_float_min;
197     I32 offset_float_max;
198     I32 flags;
199     I32 whilem_c;
200     I32 *last_closep;
201     struct regnode_charclass_class *start_class;
202 } scan_data_t;
203
204 /*
205  * Forward declarations for pregcomp()'s friends.
206  */
207
208 static const scan_data_t zero_scan_data =
209   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
210
211 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL          0x1
213 #define SF_BEFORE_MEOL          0x2
214 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
217 #ifdef NO_UNARY_PLUS
218 #  define SF_FIX_SHIFT_EOL      (0+2)
219 #  define SF_FL_SHIFT_EOL               (0+4)
220 #else
221 #  define SF_FIX_SHIFT_EOL      (+2)
222 #  define SF_FL_SHIFT_EOL               (+4)
223 #endif
224
225 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF               0x40
231 #define SF_HAS_PAR              0x80
232 #define SF_IN_PAR               0x100
233 #define SF_HAS_EVAL             0x200
234 #define SCF_DO_SUBSTR           0x400
235 #define SCF_DO_STCLASS_AND      0x0800
236 #define SCF_DO_STCLASS_OR       0x1000
237 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS  0x2000
239
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
243
244 #define OOB_UNICODE             12345678
245 #define OOB_NAMEDCLASS          -1
246
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
250
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
253
254 /*
255  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257  * op/pragma/warn/regcomp.
258  */
259 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
261
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
263
264 /*
265  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266  * arg. Show regex, up to a maximum length. If it's too long, chop and add
267  * "...".
268  */
269 #define FAIL(msg) STMT_START {                                          \
270     const char *ellipses = "";                                          \
271     IV len = RExC_end - RExC_precomp;                                   \
272                                                                         \
273     if (!SIZE_ONLY)                                                     \
274         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
275     if (len > RegexLengthToShowInErrorMessages) {                       \
276         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
277         len = RegexLengthToShowInErrorMessages - 10;                    \
278         ellipses = "...";                                               \
279     }                                                                   \
280     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
281             msg, (int)len, RExC_precomp, ellipses);                     \
282 } STMT_END
283
284 /*
285  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286  */
287 #define Simple_vFAIL(m) STMT_START {                                    \
288     const IV offset = RExC_parse - RExC_precomp;                        \
289     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
290             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
291 } STMT_END
292
293 /*
294  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295  */
296 #define vFAIL(m) STMT_START {                           \
297     if (!SIZE_ONLY)                                     \
298         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
299     Simple_vFAIL(m);                                    \
300 } STMT_END
301
302 /*
303  * Like Simple_vFAIL(), but accepts two arguments.
304  */
305 #define Simple_vFAIL2(m,a1) STMT_START {                        \
306     const IV offset = RExC_parse - RExC_precomp;                        \
307     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
308             (int)offset, RExC_precomp, RExC_precomp + offset);  \
309 } STMT_END
310
311 /*
312  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313  */
314 #define vFAIL2(m,a1) STMT_START {                       \
315     if (!SIZE_ONLY)                                     \
316         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
317     Simple_vFAIL2(m, a1);                               \
318 } STMT_END
319
320
321 /*
322  * Like Simple_vFAIL(), but accepts three arguments.
323  */
324 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
325     const IV offset = RExC_parse - RExC_precomp;                \
326     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
327             (int)offset, RExC_precomp, RExC_precomp + offset);  \
328 } STMT_END
329
330 /*
331  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332  */
333 #define vFAIL3(m,a1,a2) STMT_START {                    \
334     if (!SIZE_ONLY)                                     \
335         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
336     Simple_vFAIL3(m, a1, a2);                           \
337 } STMT_END
338
339 /*
340  * Like Simple_vFAIL(), but accepts four arguments.
341  */
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
343     const IV offset = RExC_parse - RExC_precomp;                \
344     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
345             (int)offset, RExC_precomp, RExC_precomp + offset);  \
346 } STMT_END
347
348 #define vWARN(loc,m) STMT_START {                                       \
349     const IV offset = loc - RExC_precomp;                               \
350     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
351             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
352 } STMT_END
353
354 #define vWARNdep(loc,m) STMT_START {                                    \
355     const IV offset = loc - RExC_precomp;                               \
356     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
357             "%s" REPORT_LOCATION,                                       \
358             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
359 } STMT_END
360
361
362 #define vWARN2(loc, m, a1) STMT_START {                                 \
363     const IV offset = loc - RExC_precomp;                               \
364     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
365             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
366 } STMT_END
367
368 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
369     const IV offset = loc - RExC_precomp;                               \
370     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
371             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
372 } STMT_END
373
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
375     const IV offset = loc - RExC_precomp;                               \
376     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
377             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
378 } STMT_END
379
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
381     const IV offset = loc - RExC_precomp;                               \
382     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
383             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 } STMT_END
385
386
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START {                  \
389     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
390 } STMT_END
391
392 /* Macros for recording node offsets.   20001227 mjd@plover.com 
393  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
394  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
395  * Element 0 holds the number n.
396  */
397
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
400
401
402 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
403     if (! SIZE_ONLY) {                                                  \
404         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
405                 __LINE__, (node), (byte)));                             \
406         if((node) < 0) {                                                \
407             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
408         } else {                                                        \
409             RExC_offsets[2*(node)-1] = (byte);                          \
410         }                                                               \
411     }                                                                   \
412 } STMT_END
413
414 #define Set_Node_Offset(node,byte) \
415     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
417
418 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
419     if (! SIZE_ONLY) {                                                  \
420         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
421                 __LINE__, (int)(node), (int)(len)));                    \
422         if((node) < 0) {                                                \
423             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
424         } else {                                                        \
425             RExC_offsets[2*(node)] = (len);                             \
426         }                                                               \
427     }                                                                   \
428 } STMT_END
429
430 #define Set_Node_Length(node,len) \
431     Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434     Set_Node_Length(node, RExC_parse - parse_start)
435
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
439
440 static void clear_re(pTHX_ void *r);
441
442 /* Mark that we cannot extend a found fixed substring at this point.
443    Updata the longest found anchored substring and the longest found
444    floating substrings if needed. */
445
446 STATIC void
447 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
448 {
449     const STRLEN l = CHR_SVLEN(data->last_found);
450     const STRLEN old_l = CHR_SVLEN(*data->longest);
451
452     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453         SvSetMagicSV(*data->longest, data->last_found);
454         if (*data->longest == data->longest_fixed) {
455             data->offset_fixed = l ? data->last_start_min : data->pos_min;
456             if (data->flags & SF_BEFORE_EOL)
457                 data->flags
458                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
459             else
460                 data->flags &= ~SF_FIX_BEFORE_EOL;
461         }
462         else {
463             data->offset_float_min = l ? data->last_start_min : data->pos_min;
464             data->offset_float_max = (l
465                                       ? data->last_start_max
466                                       : data->pos_min + data->pos_delta);
467             if ((U32)data->offset_float_max > (U32)I32_MAX)
468                 data->offset_float_max = I32_MAX;
469             if (data->flags & SF_BEFORE_EOL)
470                 data->flags
471                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
472             else
473                 data->flags &= ~SF_FL_BEFORE_EOL;
474         }
475     }
476     SvCUR_set(data->last_found, 0);
477     {
478         SV * const sv = data->last_found;
479         if (SvUTF8(sv) && SvMAGICAL(sv)) {
480             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
481             if (mg)
482                 mg->mg_len = 0;
483         }
484     }
485     data->last_end = -1;
486     data->flags &= ~SF_BEFORE_EOL;
487 }
488
489 /* Can match anything (initialization) */
490 STATIC void
491 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
492 {
493     ANYOF_CLASS_ZERO(cl);
494     ANYOF_BITMAP_SETALL(cl);
495     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
496     if (LOC)
497         cl->flags |= ANYOF_LOCALE;
498 }
499
500 /* Can match anything (initialization) */
501 STATIC int
502 S_cl_is_anything(const struct regnode_charclass_class *cl)
503 {
504     int value;
505
506     for (value = 0; value <= ANYOF_MAX; value += 2)
507         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508             return 1;
509     if (!(cl->flags & ANYOF_UNICODE_ALL))
510         return 0;
511     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
512         return 0;
513     return 1;
514 }
515
516 /* Can match anything (initialization) */
517 STATIC void
518 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 {
520     Zero(cl, 1, struct regnode_charclass_class);
521     cl->type = ANYOF;
522     cl_anything(pRExC_state, cl);
523 }
524
525 STATIC void
526 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
527 {
528     Zero(cl, 1, struct regnode_charclass_class);
529     cl->type = ANYOF;
530     cl_anything(pRExC_state, cl);
531     if (LOC)
532         cl->flags |= ANYOF_LOCALE;
533 }
534
535 /* 'And' a given class with another one.  Can create false positives */
536 /* We assume that cl is not inverted */
537 STATIC void
538 S_cl_and(struct regnode_charclass_class *cl,
539         const struct regnode_charclass_class *and_with)
540 {
541     if (!(and_with->flags & ANYOF_CLASS)
542         && !(cl->flags & ANYOF_CLASS)
543         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544         && !(and_with->flags & ANYOF_FOLD)
545         && !(cl->flags & ANYOF_FOLD)) {
546         int i;
547
548         if (and_with->flags & ANYOF_INVERT)
549             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550                 cl->bitmap[i] &= ~and_with->bitmap[i];
551         else
552             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553                 cl->bitmap[i] &= and_with->bitmap[i];
554     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555     if (!(and_with->flags & ANYOF_EOS))
556         cl->flags &= ~ANYOF_EOS;
557
558     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559         !(and_with->flags & ANYOF_INVERT)) {
560         cl->flags &= ~ANYOF_UNICODE_ALL;
561         cl->flags |= ANYOF_UNICODE;
562         ARG_SET(cl, ARG(and_with));
563     }
564     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565         !(and_with->flags & ANYOF_INVERT))
566         cl->flags &= ~ANYOF_UNICODE_ALL;
567     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568         !(and_with->flags & ANYOF_INVERT))
569         cl->flags &= ~ANYOF_UNICODE;
570 }
571
572 /* 'OR' a given class with another one.  Can create false positives */
573 /* We assume that cl is not inverted */
574 STATIC void
575 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
576 {
577     if (or_with->flags & ANYOF_INVERT) {
578         /* We do not use
579          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580          *   <= (B1 | !B2) | (CL1 | !CL2)
581          * which is wasteful if CL2 is small, but we ignore CL2:
582          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583          * XXXX Can we handle case-fold?  Unclear:
584          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586          */
587         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588              && !(or_with->flags & ANYOF_FOLD)
589              && !(cl->flags & ANYOF_FOLD) ) {
590             int i;
591
592             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593                 cl->bitmap[i] |= ~or_with->bitmap[i];
594         } /* XXXX: logic is complicated otherwise */
595         else {
596             cl_anything(pRExC_state, cl);
597         }
598     } else {
599         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
601              && (!(or_with->flags & ANYOF_FOLD)
602                  || (cl->flags & ANYOF_FOLD)) ) {
603             int i;
604
605             /* OR char bitmap and class bitmap separately */
606             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607                 cl->bitmap[i] |= or_with->bitmap[i];
608             if (or_with->flags & ANYOF_CLASS) {
609                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610                     cl->classflags[i] |= or_with->classflags[i];
611                 cl->flags |= ANYOF_CLASS;
612             }
613         }
614         else { /* XXXX: logic is complicated, leave it along for a moment. */
615             cl_anything(pRExC_state, cl);
616         }
617     }
618     if (or_with->flags & ANYOF_EOS)
619         cl->flags |= ANYOF_EOS;
620
621     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622         ARG(cl) != ARG(or_with)) {
623         cl->flags |= ANYOF_UNICODE_ALL;
624         cl->flags &= ~ANYOF_UNICODE;
625     }
626     if (or_with->flags & ANYOF_UNICODE_ALL) {
627         cl->flags |= ANYOF_UNICODE_ALL;
628         cl->flags &= ~ANYOF_UNICODE;
629     }
630 }
631
632 /*
633  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
634  * These need to be revisited when a newer toolchain becomes available.
635  */
636 #if defined(__sparc64__) && defined(__GNUC__)
637 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
638 #       undef  SPARC64_GCC_WORKAROUND
639 #       define SPARC64_GCC_WORKAROUND 1
640 #   endif
641 #endif
642
643 /* REx optimizer.  Converts nodes into quickier variants "in place".
644    Finds fixed substrings.  */
645
646 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
647    to the position after last scanned or to NULL. */
648
649 STATIC I32
650 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
651                         /* scanp: Start here (read-write). */
652                         /* deltap: Write maxlen-minlen here. */
653                         /* last: Stop before this one. */
654 {
655     I32 min = 0, pars = 0, code;
656     regnode *scan = *scanp, *next;
657     I32 delta = 0;
658     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
659     int is_inf_internal = 0;            /* The studied chunk is infinite */
660     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
661     scan_data_t data_fake;
662     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
663
664     while (scan && OP(scan) != END && scan < last) {
665         /* Peephole optimizer: */
666
667         if (PL_regkind[(U8)OP(scan)] == EXACT) {
668             /* Merge several consecutive EXACTish nodes into one. */
669             regnode *n = regnext(scan);
670             U32 stringok = 1;
671 #ifdef DEBUGGING
672             regnode *stop = scan;
673 #endif
674
675             next = scan + NODE_SZ_STR(scan);
676             /* Skip NOTHING, merge EXACT*. */
677             while (n &&
678                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
679                      (stringok && (OP(n) == OP(scan))))
680                    && NEXT_OFF(n)
681                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
682                 if (OP(n) == TAIL || n > next)
683                     stringok = 0;
684                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
685                     NEXT_OFF(scan) += NEXT_OFF(n);
686                     next = n + NODE_STEP_REGNODE;
687 #ifdef DEBUGGING
688                     if (stringok)
689                         stop = n;
690 #endif
691                     n = regnext(n);
692                 }
693                 else if (stringok) {
694                     const int oldl = STR_LEN(scan);
695                     regnode * const nnext = regnext(n);
696
697                     if (oldl + STR_LEN(n) > U8_MAX)
698                         break;
699                     NEXT_OFF(scan) += NEXT_OFF(n);
700                     STR_LEN(scan) += STR_LEN(n);
701                     next = n + NODE_SZ_STR(n);
702                     /* Now we can overwrite *n : */
703                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
704 #ifdef DEBUGGING
705                     stop = next - 1;
706 #endif
707                     n = nnext;
708                 }
709             }
710
711             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
712 /*
713   Two problematic code points in Unicode casefolding of EXACT nodes:
714
715    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
716    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
717
718    which casefold to
719
720    Unicode                      UTF-8
721
722    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
723    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
724
725    This means that in case-insensitive matching (or "loose matching",
726    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
727    length of the above casefolded versions) can match a target string
728    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
729    This would rather mess up the minimum length computation.
730
731    What we'll do is to look for the tail four bytes, and then peek
732    at the preceding two bytes to see whether we need to decrease
733    the minimum length by four (six minus two).
734
735    Thanks to the design of UTF-8, there cannot be false matches:
736    A sequence of valid UTF-8 bytes cannot be a subsequence of
737    another valid sequence of UTF-8 bytes.
738
739 */
740                  char * const s0 = STRING(scan), *s, *t;
741                  char * const s1 = s0 + STR_LEN(scan) - 1;
742                  char * const s2 = s1 - 4;
743                  const char t0[] = "\xcc\x88\xcc\x81";
744                  const char * const t1 = t0 + 3;
745
746                  for (s = s0 + 2;
747                       s < s2 && (t = ninstr(s, s1, t0, t1));
748                       s = t + 4) {
749                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
750                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
751                            min -= 4;
752                  }
753             }
754
755 #ifdef DEBUGGING
756             /* Allow dumping */
757             n = scan + NODE_SZ_STR(scan);
758             while (n <= stop) {
759                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
760                     OP(n) = OPTIMIZED;
761                     NEXT_OFF(n) = 0;
762                 }
763                 n++;
764             }
765 #endif
766         }
767         /* Follow the next-chain of the current node and optimize
768            away all the NOTHINGs from it.  */
769         if (OP(scan) != CURLYX) {
770             const int max = (reg_off_by_arg[OP(scan)]
771                        ? I32_MAX
772                        /* I32 may be smaller than U16 on CRAYs! */
773                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
774             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
775             int noff;
776             regnode *n = scan;
777         
778             /* Skip NOTHING and LONGJMP. */
779             while ((n = regnext(n))
780                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
781                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
782                    && off + noff < max)
783                 off += noff;
784             if (reg_off_by_arg[OP(scan)])
785                 ARG(scan) = off;
786             else
787                 NEXT_OFF(scan) = off;
788         }
789         /* The principal pseudo-switch.  Cannot be a switch, since we
790            look into several different things.  */
791         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
792                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
793             next = regnext(scan);
794             code = OP(scan);
795         
796             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
797                 I32 max1 = 0, min1 = I32_MAX, num = 0;
798                 struct regnode_charclass_class accum;
799                 
800                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
801                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
802                 if (flags & SCF_DO_STCLASS)
803                     cl_init_zero(pRExC_state, &accum);
804                 while (OP(scan) == code) {
805                     I32 deltanext, minnext, f = 0, fake;
806                     struct regnode_charclass_class this_class;
807
808                     num++;
809                     data_fake.flags = 0;
810                     if (data) {         
811                         data_fake.whilem_c = data->whilem_c;
812                         data_fake.last_closep = data->last_closep;
813                     }
814                     else
815                         data_fake.last_closep = &fake;
816                     next = regnext(scan);
817                     scan = NEXTOPER(scan);
818                     if (code != BRANCH)
819                         scan = NEXTOPER(scan);
820                     if (flags & SCF_DO_STCLASS) {
821                         cl_init(pRExC_state, &this_class);
822                         data_fake.start_class = &this_class;
823                         f = SCF_DO_STCLASS_AND;
824                     }           
825                     if (flags & SCF_WHILEM_VISITED_POS)
826                         f |= SCF_WHILEM_VISITED_POS;
827                     /* we suppose the run is continuous, last=next...*/
828                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
829                                           next, &data_fake, f);
830                     if (min1 > minnext)
831                         min1 = minnext;
832                     if (max1 < minnext + deltanext)
833                         max1 = minnext + deltanext;
834                     if (deltanext == I32_MAX)
835                         is_inf = is_inf_internal = 1;
836                     scan = next;
837                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
838                         pars++;
839                     if (data && (data_fake.flags & SF_HAS_EVAL))
840                         data->flags |= SF_HAS_EVAL;
841                     if (data)
842                         data->whilem_c = data_fake.whilem_c;
843                     if (flags & SCF_DO_STCLASS)
844                         cl_or(pRExC_state, &accum, &this_class);
845                     if (code == SUSPEND)
846                         break;
847                 }
848                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
849                     min1 = 0;
850                 if (flags & SCF_DO_SUBSTR) {
851                     data->pos_min += min1;
852                     data->pos_delta += max1 - min1;
853                     if (max1 != min1 || is_inf)
854                         data->longest = &(data->longest_float);
855                 }
856                 min += min1;
857                 delta += max1 - min1;
858                 if (flags & SCF_DO_STCLASS_OR) {
859                     cl_or(pRExC_state, data->start_class, &accum);
860                     if (min1) {
861                         cl_and(data->start_class, &and_with);
862                         flags &= ~SCF_DO_STCLASS;
863                     }
864                 }
865                 else if (flags & SCF_DO_STCLASS_AND) {
866                     if (min1) {
867                         cl_and(data->start_class, &accum);
868                         flags &= ~SCF_DO_STCLASS;
869                     }
870                     else {
871                         /* Switch to OR mode: cache the old value of
872                          * data->start_class */
873                         StructCopy(data->start_class, &and_with,
874                                    struct regnode_charclass_class);
875                         flags &= ~SCF_DO_STCLASS_AND;
876                         StructCopy(&accum, data->start_class,
877                                    struct regnode_charclass_class);
878                         flags |= SCF_DO_STCLASS_OR;
879                         data->start_class->flags |= ANYOF_EOS;
880                     }
881                 }
882
883             }
884             else if (code == BRANCHJ)   /* single branch is optimized. */
885                 scan = NEXTOPER(NEXTOPER(scan));
886             else                        /* single branch is optimized. */
887                 scan = NEXTOPER(scan);
888             continue;
889         }
890         else if (OP(scan) == EXACT) {
891             I32 l = STR_LEN(scan);
892             UV uc;
893             if (UTF) {
894                 const U8 * const s = (U8*)STRING(scan);
895                 l = utf8_length((U8 *)s, (U8 *)s + l);
896                 uc = utf8_to_uvchr((U8 *)s, NULL);
897             } else {
898                 uc = *((U8*)STRING(scan));
899             }
900             min += l;
901             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
902                 /* The code below prefers earlier match for fixed
903                    offset, later match for variable offset.  */
904                 if (data->last_end == -1) { /* Update the start info. */
905                     data->last_start_min = data->pos_min;
906                     data->last_start_max = is_inf
907                         ? I32_MAX : data->pos_min + data->pos_delta;
908                 }
909                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
910                 if (UTF)
911                     SvUTF8_on(data->last_found);
912                 {
913                     SV * const sv = data->last_found;
914                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
915                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
916                     if (mg && mg->mg_len >= 0)
917                         mg->mg_len += utf8_length((U8*)STRING(scan),
918                                                   (U8*)STRING(scan)+STR_LEN(scan));
919                 }
920                 data->last_end = data->pos_min + l;
921                 data->pos_min += l; /* As in the first entry. */
922                 data->flags &= ~SF_BEFORE_EOL;
923             }
924             if (flags & SCF_DO_STCLASS_AND) {
925                 /* Check whether it is compatible with what we know already! */
926                 int compat = 1;
927
928                 if (uc >= 0x100 ||
929                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
930                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
931                     && (!(data->start_class->flags & ANYOF_FOLD)
932                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
933                     )
934                     compat = 0;
935                 ANYOF_CLASS_ZERO(data->start_class);
936                 ANYOF_BITMAP_ZERO(data->start_class);
937                 if (compat)
938                     ANYOF_BITMAP_SET(data->start_class, uc);
939                 data->start_class->flags &= ~ANYOF_EOS;
940                 if (uc < 0x100)
941                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
942             }
943             else if (flags & SCF_DO_STCLASS_OR) {
944                 /* false positive possible if the class is case-folded */
945                 if (uc < 0x100)
946                     ANYOF_BITMAP_SET(data->start_class, uc);
947                 else
948                     data->start_class->flags |= ANYOF_UNICODE_ALL;
949                 data->start_class->flags &= ~ANYOF_EOS;
950                 cl_and(data->start_class, &and_with);
951             }
952             flags &= ~SCF_DO_STCLASS;
953         }
954         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
955             I32 l = STR_LEN(scan);
956             UV uc = *((U8*)STRING(scan));
957
958             /* Search for fixed substrings supports EXACT only. */
959             if (flags & SCF_DO_SUBSTR) {
960                 assert(data);
961                 scan_commit(pRExC_state, data);
962             }
963             if (UTF) {
964                 U8 * const s = (U8 *)STRING(scan);
965                 l = utf8_length(s, s + l);
966                 uc = utf8_to_uvchr(s, NULL);
967             }
968             min += l;
969             if (flags & SCF_DO_SUBSTR)
970                 data->pos_min += l;
971             if (flags & SCF_DO_STCLASS_AND) {
972                 /* Check whether it is compatible with what we know already! */
973                 int compat = 1;
974
975                 if (uc >= 0x100 ||
976                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
977                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
978                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
979                     compat = 0;
980                 ANYOF_CLASS_ZERO(data->start_class);
981                 ANYOF_BITMAP_ZERO(data->start_class);
982                 if (compat) {
983                     ANYOF_BITMAP_SET(data->start_class, uc);
984                     data->start_class->flags &= ~ANYOF_EOS;
985                     data->start_class->flags |= ANYOF_FOLD;
986                     if (OP(scan) == EXACTFL)
987                         data->start_class->flags |= ANYOF_LOCALE;
988                 }
989             }
990             else if (flags & SCF_DO_STCLASS_OR) {
991                 if (data->start_class->flags & ANYOF_FOLD) {
992                     /* false positive possible if the class is case-folded.
993                        Assume that the locale settings are the same... */
994                     if (uc < 0x100)
995                         ANYOF_BITMAP_SET(data->start_class, uc);
996                     data->start_class->flags &= ~ANYOF_EOS;
997                 }
998                 cl_and(data->start_class, &and_with);
999             }
1000             flags &= ~SCF_DO_STCLASS;
1001         }
1002         else if (strchr((const char*)PL_varies,OP(scan))) {
1003             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1004             I32 f = flags, pos_before = 0;
1005             regnode * const oscan = scan;
1006             struct regnode_charclass_class this_class;
1007             struct regnode_charclass_class *oclass = NULL;
1008             I32 next_is_eval = 0;
1009
1010             switch (PL_regkind[(U8)OP(scan)]) {
1011             case WHILEM:                /* End of (?:...)* . */
1012                 scan = NEXTOPER(scan);
1013                 goto finish;
1014             case PLUS:
1015                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1016                     next = NEXTOPER(scan);
1017                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1018                         mincount = 1;
1019                         maxcount = REG_INFTY;
1020                         next = regnext(scan);
1021                         scan = NEXTOPER(scan);
1022                         goto do_curly;
1023                     }
1024                 }
1025                 if (flags & SCF_DO_SUBSTR)
1026                     data->pos_min++;
1027                 min++;
1028                 /* Fall through. */
1029             case STAR:
1030                 if (flags & SCF_DO_STCLASS) {
1031                     mincount = 0;
1032                     maxcount = REG_INFTY;
1033                     next = regnext(scan);
1034                     scan = NEXTOPER(scan);
1035                     goto do_curly;
1036                 }
1037                 is_inf = is_inf_internal = 1;
1038                 scan = regnext(scan);
1039                 if (flags & SCF_DO_SUBSTR) {
1040                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1041                     data->longest = &(data->longest_float);
1042                 }
1043                 goto optimize_curly_tail;
1044             case CURLY:
1045                 mincount = ARG1(scan);
1046                 maxcount = ARG2(scan);
1047                 next = regnext(scan);
1048                 if (OP(scan) == CURLYX) {
1049                     I32 lp = (data ? *(data->last_closep) : 0);
1050
1051                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1052                 }
1053                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1054                 next_is_eval = (OP(scan) == EVAL);
1055               do_curly:
1056                 if (flags & SCF_DO_SUBSTR) {
1057                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1058                     pos_before = data->pos_min;
1059                 }
1060                 if (data) {
1061                     fl = data->flags;
1062                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1063                     if (is_inf)
1064                         data->flags |= SF_IS_INF;
1065                 }
1066                 if (flags & SCF_DO_STCLASS) {
1067                     cl_init(pRExC_state, &this_class);
1068                     oclass = data->start_class;
1069                     data->start_class = &this_class;
1070                     f |= SCF_DO_STCLASS_AND;
1071                     f &= ~SCF_DO_STCLASS_OR;
1072                 }
1073                 /* These are the cases when once a subexpression
1074                    fails at a particular position, it cannot succeed
1075                    even after backtracking at the enclosing scope.
1076                 
1077                    XXXX what if minimal match and we are at the
1078                         initial run of {n,m}? */
1079                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1080                     f &= ~SCF_WHILEM_VISITED_POS;
1081
1082                 /* This will finish on WHILEM, setting scan, or on NULL: */
1083                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1084                                       mincount == 0
1085                                         ? (f & ~SCF_DO_SUBSTR) : f);
1086
1087                 if (flags & SCF_DO_STCLASS)
1088                     data->start_class = oclass;
1089                 if (mincount == 0 || minnext == 0) {
1090                     if (flags & SCF_DO_STCLASS_OR) {
1091                         cl_or(pRExC_state, data->start_class, &this_class);
1092                     }
1093                     else if (flags & SCF_DO_STCLASS_AND) {
1094                         /* Switch to OR mode: cache the old value of
1095                          * data->start_class */
1096                         StructCopy(data->start_class, &and_with,
1097                                    struct regnode_charclass_class);
1098                         flags &= ~SCF_DO_STCLASS_AND;
1099                         StructCopy(&this_class, data->start_class,
1100                                    struct regnode_charclass_class);
1101                         flags |= SCF_DO_STCLASS_OR;
1102                         data->start_class->flags |= ANYOF_EOS;
1103                     }
1104                 } else {                /* Non-zero len */
1105                     if (flags & SCF_DO_STCLASS_OR) {
1106                         cl_or(pRExC_state, data->start_class, &this_class);
1107                         cl_and(data->start_class, &and_with);
1108                     }
1109                     else if (flags & SCF_DO_STCLASS_AND)
1110                         cl_and(data->start_class, &this_class);
1111                     flags &= ~SCF_DO_STCLASS;
1112                 }
1113                 if (!scan)              /* It was not CURLYX, but CURLY. */
1114                     scan = next;
1115                 if ( /* ? quantifier ok, except for (?{ ... }) */
1116                     (next_is_eval || !(mincount == 0 && maxcount == 1))
1117                     && (minnext == 0) && (deltanext == 0)
1118                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1119                     && maxcount <= REG_INFTY/3 /* Complement check for big count */
1120                     && ckWARN(WARN_REGEXP))
1121                 {
1122                     vWARN(RExC_parse,
1123                           "Quantifier unexpected on zero-length expression");
1124                 }
1125
1126                 min += minnext * mincount;
1127                 is_inf_internal |= ((maxcount == REG_INFTY
1128                                      && (minnext + deltanext) > 0)
1129                                     || deltanext == I32_MAX);
1130                 is_inf |= is_inf_internal;
1131                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1132
1133                 /* Try powerful optimization CURLYX => CURLYN. */
1134                 if (  OP(oscan) == CURLYX && data
1135                       && data->flags & SF_IN_PAR
1136                       && !(data->flags & SF_HAS_EVAL)
1137                       && !deltanext && minnext == 1 ) {
1138                     /* Try to optimize to CURLYN.  */
1139                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1140                     regnode * const nxt1 = nxt;
1141 #ifdef DEBUGGING
1142                     regnode *nxt2;
1143 #endif
1144
1145                     /* Skip open. */
1146                     nxt = regnext(nxt);
1147                     if (!strchr((const char*)PL_simple,OP(nxt))
1148                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1149                              && STR_LEN(nxt) == 1))
1150                         goto nogo;
1151 #ifdef DEBUGGING
1152                     nxt2 = nxt;
1153 #endif
1154                     nxt = regnext(nxt);
1155                     if (OP(nxt) != CLOSE)
1156                         goto nogo;
1157                     /* Now we know that nxt2 is the only contents: */
1158                     oscan->flags = (U8)ARG(nxt);
1159                     OP(oscan) = CURLYN;
1160                     OP(nxt1) = NOTHING; /* was OPEN. */
1161 #ifdef DEBUGGING
1162                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1163                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1164                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1165                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1166                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1167                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1168 #endif
1169                 }
1170               nogo:
1171
1172                 /* Try optimization CURLYX => CURLYM. */
1173                 if (  OP(oscan) == CURLYX && data
1174                       && !(data->flags & SF_HAS_PAR)
1175                       && !(data->flags & SF_HAS_EVAL)
1176                       && !deltanext     /* atom is fixed width */
1177                       && minnext != 0   /* CURLYM can't handle zero width */
1178                 ) {
1179                     /* XXXX How to optimize if data == 0? */
1180                     /* Optimize to a simpler form.  */
1181                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1182                     regnode *nxt2;
1183
1184                     OP(oscan) = CURLYM;
1185                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1186                             && (OP(nxt2) != WHILEM))
1187                         nxt = nxt2;
1188                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1189                     /* Need to optimize away parenths. */
1190                     if (data->flags & SF_IN_PAR) {
1191                         /* Set the parenth number.  */
1192                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1193
1194                         if (OP(nxt) != CLOSE)
1195                             FAIL("Panic opt close");
1196                         oscan->flags = (U8)ARG(nxt);
1197                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1198                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1199 #ifdef DEBUGGING
1200                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1201                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1202                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1203                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1204 #endif
1205 #if 0
1206                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1207                             regnode *nnxt = regnext(nxt1);
1208                         
1209                             if (nnxt == nxt) {
1210                                 if (reg_off_by_arg[OP(nxt1)])
1211                                     ARG_SET(nxt1, nxt2 - nxt1);
1212                                 else if (nxt2 - nxt1 < U16_MAX)
1213                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1214                                 else
1215                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1216                             }
1217                             nxt1 = nnxt;
1218                         }
1219 #endif
1220                         /* Optimize again: */
1221                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1222                                     NULL, 0);
1223                     }
1224                     else
1225                         oscan->flags = 0;
1226                 }
1227                 else if ((OP(oscan) == CURLYX)
1228                          && (flags & SCF_WHILEM_VISITED_POS)
1229                          /* See the comment on a similar expression above.
1230                             However, this time it not a subexpression
1231                             we care about, but the expression itself. */
1232                          && (maxcount == REG_INFTY)
1233                          && data && ++data->whilem_c < 16) {
1234                     /* This stays as CURLYX, we can put the count/of pair. */
1235                     /* Find WHILEM (as in regexec.c) */
1236                     regnode *nxt = oscan + NEXT_OFF(oscan);
1237
1238                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1239                         nxt += ARG(nxt);
1240                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1241                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1242                 }
1243                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1244                     pars++;
1245                 if (flags & SCF_DO_SUBSTR) {
1246                     SV *last_str = NULL;
1247                     int counted = mincount != 0;
1248
1249                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1250 #if defined(SPARC64_GCC_WORKAROUND)
1251                         I32 b = 0;
1252                         STRLEN l = 0;
1253                         const char *s = NULL;
1254                         I32 old = 0;
1255
1256                         if (pos_before >= data->last_start_min)
1257                             b = pos_before;
1258                         else
1259                             b = data->last_start_min;
1260
1261                         l = 0;
1262                         s = SvPV_const(data->last_found, l);
1263                         old = b - data->last_start_min;
1264
1265 #else
1266                         I32 b = pos_before >= data->last_start_min
1267                             ? pos_before : data->last_start_min;
1268                         STRLEN l;
1269                         const char * const s = SvPV_const(data->last_found, l);
1270                         I32 old = b - data->last_start_min;
1271 #endif
1272
1273                         if (UTF)
1274                             old = utf8_hop((U8*)s, old) - (U8*)s;
1275                         
1276                         l -= old;
1277                         /* Get the added string: */
1278                         last_str = newSVpvn(s  + old, l);
1279                         if (UTF)
1280                             SvUTF8_on(last_str);
1281                         if (deltanext == 0 && pos_before == b) {
1282                             /* What was added is a constant string */
1283                             if (mincount > 1) {
1284                                 SvGROW(last_str, (mincount * l) + 1);
1285                                 repeatcpy(SvPVX(last_str) + l,
1286                                           SvPVX_const(last_str), l, mincount - 1);
1287                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1288                                 /* Add additional parts. */
1289                                 SvCUR_set(data->last_found,
1290                                           SvCUR(data->last_found) - l);
1291                                 sv_catsv(data->last_found, last_str);
1292                                 {
1293                                     SV * sv = data->last_found;
1294                                     MAGIC *mg =
1295                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1296                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1297                                     if (mg && mg->mg_len >= 0)
1298                                         mg->mg_len += CHR_SVLEN(last_str);
1299                                 }
1300                                 data->last_end += l * (mincount - 1);
1301                             }
1302                         } else {
1303                             /* start offset must point into the last copy */
1304                             data->last_start_min += minnext * (mincount - 1);
1305                             data->last_start_max += is_inf ? I32_MAX
1306                                 : (maxcount - 1) * (minnext + data->pos_delta);
1307                         }
1308                     }
1309                     /* It is counted once already... */
1310                     data->pos_min += minnext * (mincount - counted);
1311                     data->pos_delta += - counted * deltanext +
1312                         (minnext + deltanext) * maxcount - minnext * mincount;
1313                     if (mincount != maxcount) {
1314                          /* Cannot extend fixed substrings found inside
1315                             the group.  */
1316                         scan_commit(pRExC_state,data);
1317                         if (mincount && last_str) {
1318                             SV * const sv = data->last_found;
1319                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1320                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1321
1322                             if (mg)
1323                                 mg->mg_len = -1;
1324                             sv_setsv(sv, last_str);
1325                             data->last_end = data->pos_min;
1326                             data->last_start_min =
1327                                 data->pos_min - CHR_SVLEN(last_str);
1328                             data->last_start_max = is_inf
1329                                 ? I32_MAX
1330                                 : data->pos_min + data->pos_delta
1331                                 - CHR_SVLEN(last_str);
1332                         }
1333                         data->longest = &(data->longest_float);
1334                     }
1335                     SvREFCNT_dec(last_str);
1336                 }
1337                 if (data && (fl & SF_HAS_EVAL))
1338                     data->flags |= SF_HAS_EVAL;
1339               optimize_curly_tail:
1340                 if (OP(oscan) != CURLYX) {
1341                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1342                            && NEXT_OFF(next))
1343                         NEXT_OFF(oscan) += NEXT_OFF(next);
1344                 }
1345                 continue;
1346             default:                    /* REF and CLUMP only? */
1347                 if (flags & SCF_DO_SUBSTR) {
1348                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1349                     data->longest = &(data->longest_float);
1350                 }
1351                 is_inf = is_inf_internal = 1;
1352                 if (flags & SCF_DO_STCLASS_OR)
1353                     cl_anything(pRExC_state, data->start_class);
1354                 flags &= ~SCF_DO_STCLASS;
1355                 break;
1356             }
1357         }
1358         else if (strchr((const char*)PL_simple,OP(scan))) {
1359             int value = 0;
1360
1361             if (flags & SCF_DO_SUBSTR) {
1362                 scan_commit(pRExC_state,data);
1363                 data->pos_min++;
1364             }
1365             min++;
1366             if (flags & SCF_DO_STCLASS) {
1367                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1368
1369                 /* Some of the logic below assumes that switching
1370                    locale on will only add false positives. */
1371                 switch (PL_regkind[(U8)OP(scan)]) {
1372                 case SANY:
1373                 default:
1374                   do_default:
1375                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1376                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1377                         cl_anything(pRExC_state, data->start_class);
1378                     break;
1379                 case REG_ANY:
1380                     if (OP(scan) == SANY)
1381                         goto do_default;
1382                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1383                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1384                                  || (data->start_class->flags & ANYOF_CLASS));
1385                         cl_anything(pRExC_state, data->start_class);
1386                     }
1387                     if (flags & SCF_DO_STCLASS_AND || !value)
1388                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1389                     break;
1390                 case ANYOF:
1391                     if (flags & SCF_DO_STCLASS_AND)
1392                         cl_and(data->start_class,
1393                                (struct regnode_charclass_class*)scan);
1394                     else
1395                         cl_or(pRExC_state, data->start_class,
1396                               (struct regnode_charclass_class*)scan);
1397                     break;
1398                 case ALNUM:
1399                     if (flags & SCF_DO_STCLASS_AND) {
1400                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1401                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1402                             for (value = 0; value < 256; value++)
1403                                 if (!isALNUM(value))
1404                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1405                         }
1406                     }
1407                     else {
1408                         if (data->start_class->flags & ANYOF_LOCALE)
1409                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1410                         else {
1411                             for (value = 0; value < 256; value++)
1412                                 if (isALNUM(value))
1413                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1414                         }
1415                     }
1416                     break;
1417                 case ALNUML:
1418                     if (flags & SCF_DO_STCLASS_AND) {
1419                         if (data->start_class->flags & ANYOF_LOCALE)
1420                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421                     }
1422                     else {
1423                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1424                         data->start_class->flags |= ANYOF_LOCALE;
1425                     }
1426                     break;
1427                 case NALNUM:
1428                     if (flags & SCF_DO_STCLASS_AND) {
1429                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1430                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1431                             for (value = 0; value < 256; value++)
1432                                 if (isALNUM(value))
1433                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1434                         }
1435                     }
1436                     else {
1437                         if (data->start_class->flags & ANYOF_LOCALE)
1438                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1439                         else {
1440                             for (value = 0; value < 256; value++)
1441                                 if (!isALNUM(value))
1442                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1443                         }
1444                     }
1445                     break;
1446                 case NALNUML:
1447                     if (flags & SCF_DO_STCLASS_AND) {
1448                         if (data->start_class->flags & ANYOF_LOCALE)
1449                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450                     }
1451                     else {
1452                         data->start_class->flags |= ANYOF_LOCALE;
1453                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1454                     }
1455                     break;
1456                 case SPACE:
1457                     if (flags & SCF_DO_STCLASS_AND) {
1458                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1459                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1460                             for (value = 0; value < 256; value++)
1461                                 if (!isSPACE(value))
1462                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1463                         }
1464                     }
1465                     else {
1466                         if (data->start_class->flags & ANYOF_LOCALE)
1467                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1468                         else {
1469                             for (value = 0; value < 256; value++)
1470                                 if (isSPACE(value))
1471                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1472                         }
1473                     }
1474                     break;
1475                 case SPACEL:
1476                     if (flags & SCF_DO_STCLASS_AND) {
1477                         if (data->start_class->flags & ANYOF_LOCALE)
1478                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479                     }
1480                     else {
1481                         data->start_class->flags |= ANYOF_LOCALE;
1482                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1483                     }
1484                     break;
1485                 case NSPACE:
1486                     if (flags & SCF_DO_STCLASS_AND) {
1487                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1488                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1489                             for (value = 0; value < 256; value++)
1490                                 if (isSPACE(value))
1491                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1492                         }
1493                     }
1494                     else {
1495                         if (data->start_class->flags & ANYOF_LOCALE)
1496                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1497                         else {
1498                             for (value = 0; value < 256; value++)
1499                                 if (!isSPACE(value))
1500                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1501                         }
1502                     }
1503                     break;
1504                 case NSPACEL:
1505                     if (flags & SCF_DO_STCLASS_AND) {
1506                         if (data->start_class->flags & ANYOF_LOCALE) {
1507                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508                             for (value = 0; value < 256; value++)
1509                                 if (!isSPACE(value))
1510                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1511                         }
1512                     }
1513                     else {
1514                         data->start_class->flags |= ANYOF_LOCALE;
1515                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516                     }
1517                     break;
1518                 case DIGIT:
1519                     if (flags & SCF_DO_STCLASS_AND) {
1520                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1521                         for (value = 0; value < 256; value++)
1522                             if (!isDIGIT(value))
1523                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1524                     }
1525                     else {
1526                         if (data->start_class->flags & ANYOF_LOCALE)
1527                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1528                         else {
1529                             for (value = 0; value < 256; value++)
1530                                 if (isDIGIT(value))
1531                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1532                         }
1533                     }
1534                     break;
1535                 case NDIGIT:
1536                     if (flags & SCF_DO_STCLASS_AND) {
1537                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1538                         for (value = 0; value < 256; value++)
1539                             if (isDIGIT(value))
1540                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1541                     }
1542                     else {
1543                         if (data->start_class->flags & ANYOF_LOCALE)
1544                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1545                         else {
1546                             for (value = 0; value < 256; value++)
1547                                 if (!isDIGIT(value))
1548                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1549                         }
1550                     }
1551                     break;
1552                 }
1553                 if (flags & SCF_DO_STCLASS_OR)
1554                     cl_and(data->start_class, &and_with);
1555                 flags &= ~SCF_DO_STCLASS;
1556             }
1557         }
1558         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1559             data->flags |= (OP(scan) == MEOL
1560                             ? SF_BEFORE_MEOL
1561                             : SF_BEFORE_SEOL);
1562         }
1563         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1564                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1565                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1566                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1567             /* Lookahead/lookbehind */
1568             I32 deltanext, minnext, fake = 0;
1569             regnode *nscan;
1570             struct regnode_charclass_class intrnl;
1571             int f = 0;
1572
1573             data_fake.flags = 0;
1574             if (data) {         
1575                 data_fake.whilem_c = data->whilem_c;
1576                 data_fake.last_closep = data->last_closep;
1577             }
1578             else
1579                 data_fake.last_closep = &fake;
1580             if ( flags & SCF_DO_STCLASS && !scan->flags
1581                  && OP(scan) == IFMATCH ) { /* Lookahead */
1582                 cl_init(pRExC_state, &intrnl);
1583                 data_fake.start_class = &intrnl;
1584                 f |= SCF_DO_STCLASS_AND;
1585             }
1586             if (flags & SCF_WHILEM_VISITED_POS)
1587                 f |= SCF_WHILEM_VISITED_POS;
1588             next = regnext(scan);
1589             nscan = NEXTOPER(NEXTOPER(scan));
1590             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1591             if (scan->flags) {
1592                 if (deltanext) {
1593                     vFAIL("Variable length lookbehind not implemented");
1594                 }
1595                 else if (minnext > U8_MAX) {
1596                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1597                 }
1598                 scan->flags = (U8)minnext;
1599             }
1600             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1601                 pars++;
1602             if (data && (data_fake.flags & SF_HAS_EVAL))
1603                 data->flags |= SF_HAS_EVAL;
1604             if (data)
1605                 data->whilem_c = data_fake.whilem_c;
1606             if (f & SCF_DO_STCLASS_AND) {
1607                 const int was = (data->start_class->flags & ANYOF_EOS);
1608
1609                 cl_and(data->start_class, &intrnl);
1610                 if (was)
1611                     data->start_class->flags |= ANYOF_EOS;
1612             }
1613         }
1614         else if (OP(scan) == OPEN) {
1615             pars++;
1616         }
1617         else if (OP(scan) == CLOSE) {
1618             if ((I32)ARG(scan) == is_par) {
1619                 next = regnext(scan);
1620
1621                 if ( next && (OP(next) != WHILEM) && next < last)
1622                     is_par = 0;         /* Disable optimization */
1623             }
1624             if (data)
1625                 *(data->last_closep) = ARG(scan);
1626         }
1627         else if (OP(scan) == EVAL) {
1628                 if (data)
1629                     data->flags |= SF_HAS_EVAL;
1630         }
1631         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1632                 if (flags & SCF_DO_SUBSTR) {
1633                     scan_commit(pRExC_state,data);
1634                     data->longest = &(data->longest_float);
1635                 }
1636                 is_inf = is_inf_internal = 1;
1637                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1638                     cl_anything(pRExC_state, data->start_class);
1639                 flags &= ~SCF_DO_STCLASS;
1640         }
1641         /* Else: zero-length, ignore. */
1642         scan = regnext(scan);
1643     }
1644
1645   finish:
1646     *scanp = scan;
1647     *deltap = is_inf_internal ? I32_MAX : delta;
1648     if (flags & SCF_DO_SUBSTR && is_inf)
1649         data->pos_delta = I32_MAX - data->pos_min;
1650     if (is_par > U8_MAX)
1651         is_par = 0;
1652     if (is_par && pars==1 && data) {
1653         data->flags |= SF_IN_PAR;
1654         data->flags &= ~SF_HAS_PAR;
1655     }
1656     else if (pars && data) {
1657         data->flags |= SF_HAS_PAR;
1658         data->flags &= ~SF_IN_PAR;
1659     }
1660     if (flags & SCF_DO_STCLASS_OR)
1661         cl_and(data->start_class, &and_with);
1662     return min;
1663 }
1664
1665 STATIC U32
1666 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
1667 {
1668     U32 count = RExC_rx->data ? RExC_rx->data->count : 0;
1669
1670     Renewc(RExC_rx->data,
1671            sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
1672            char, struct reg_data);
1673     if(count)
1674         Renew(RExC_rx->data->what, count + n, U8);
1675     else
1676         Newx(RExC_rx->data->what, n, U8);
1677     RExC_rx->data->count = count + n;
1678     Copy(s, RExC_rx->data->what + count, n, U8);
1679     return count;
1680 }
1681
1682 void
1683 Perl_reginitcolors(pTHX)
1684 {
1685     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1686     if (s) {
1687         char *t = savepv(s);
1688         int i = 0;
1689         PL_colors[0] = t;
1690         while (++i < 6) {
1691             t = strchr(t, '\t');
1692             if (t) {
1693                 *t = '\0';
1694                 PL_colors[i] = ++t;
1695             }
1696             else
1697                 PL_colors[i] = t = (char *)"";
1698         }
1699     } else {
1700         int i = 0;
1701         while (i < 6)
1702             PL_colors[i++] = (char *)"";
1703     }
1704     PL_colorset = 1;
1705 }
1706
1707
1708 /*
1709  - pregcomp - compile a regular expression into internal code
1710  *
1711  * We can't allocate space until we know how big the compiled form will be,
1712  * but we can't compile it (and thus know how big it is) until we've got a
1713  * place to put the code.  So we cheat:  we compile it twice, once with code
1714  * generation turned off and size counting turned on, and once "for real".
1715  * This also means that we don't allocate space until we are sure that the
1716  * thing really will compile successfully, and we never have to move the
1717  * code and thus invalidate pointers into it.  (Note that it has to be in
1718  * one piece because free() must be able to free it all.) [NB: not true in perl]
1719  *
1720  * Beware that the optimization-preparation code in here knows about some
1721  * of the structure of the compiled regexp.  [I'll say.]
1722  */
1723 regexp *
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1725 {
1726     register regexp *r;
1727     regnode *scan;
1728     regnode *first;
1729     I32 flags;
1730     I32 minlen = 0;
1731     I32 sawplus = 0;
1732     I32 sawopen = 0;
1733     scan_data_t data;
1734     RExC_state_t RExC_state;
1735     RExC_state_t *pRExC_state = &RExC_state;
1736
1737     if (exp == NULL)
1738         FAIL("NULL regexp argument");
1739
1740     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741
1742     RExC_precomp = exp;
1743     DEBUG_r({
1744          if (!PL_colorset) reginitcolors();
1745          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746                        PL_colors[4],PL_colors[5],PL_colors[0],
1747                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748     });
1749     RExC_flags = pm->op_pmflags;
1750     RExC_sawback = 0;
1751
1752     RExC_seen = 0;
1753     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754     RExC_seen_evals = 0;
1755     RExC_extralen = 0;
1756
1757     /* First pass: determine size, legality. */
1758     RExC_parse = exp;
1759     RExC_start = exp;
1760     RExC_end = xend;
1761     RExC_naughty = 0;
1762     RExC_npar = 1;
1763     RExC_size = 0L;
1764     RExC_emit = &PL_regdummy;
1765     RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767        * Clever compilers notice this and complain. --jhi */
1768     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1769 #endif
1770     if (reg(pRExC_state, 0, &flags) == NULL) {
1771         RExC_precomp = NULL;
1772         return(NULL);
1773     }
1774     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1775
1776     /* Small enough for pointer-storage convention?
1777        If extralen==0, this means that we will not need long jumps. */
1778     if (RExC_size >= 0x10000L && RExC_extralen)
1779         RExC_size += RExC_extralen;
1780     else
1781         RExC_extralen = 0;
1782     if (RExC_whilem_seen > 15)
1783         RExC_whilem_seen = 15;
1784
1785     /* Allocate space and initialize. */
1786     Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787          char, regexp);
1788     if (r == NULL)
1789         FAIL("Regexp out of space");
1790
1791 #ifdef DEBUGGING
1792     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 #endif
1795     r->refcnt = 1;
1796     r->prelen = xend - exp;
1797     r->precomp = savepvn(RExC_precomp, r->prelen);
1798     r->subbeg = NULL;
1799     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801     r->lastparen = 0;                   /* mg.c reads this.  */
1802
1803     r->substrs = 0;                     /* Useful during FAIL. */
1804     r->startp = 0;                      /* Useful during FAIL. */
1805     r->endp = 0;                        /* Useful during FAIL. */
1806
1807     Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1808     if (r->offsets) {
1809         r->offsets[0] = RExC_size;
1810     }
1811     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1812                           "%s %"UVuf" bytes for offset annotations.\n",
1813                           r->offsets ? "Got" : "Couldn't get",
1814                           (UV)((2*RExC_size+1) * sizeof(U32))));
1815
1816     RExC_rx = r;
1817
1818     /* Second pass: emit code. */
1819     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1820     RExC_parse = exp;
1821     RExC_end = xend;
1822     RExC_naughty = 0;
1823     RExC_npar = 1;
1824     RExC_emit_start = r->program;
1825     RExC_emit = r->program;
1826     /* Store the count of eval-groups for security checks: */
1827     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1828     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1829     r->data = 0;
1830     if (reg(pRExC_state, 0, &flags) == NULL)
1831         return(NULL);
1832
1833     /* Dig out information for optimizations. */
1834     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1835     pm->op_pmflags = RExC_flags;
1836     if (UTF)
1837         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1838     r->regstclass = NULL;
1839     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1840         r->reganch |= ROPT_NAUGHTY;
1841     scan = r->program + 1;              /* First BRANCH. */
1842
1843     /* XXXX To minimize changes to RE engine we always allocate
1844        3-units-long substrs field. */
1845     Newxz(r->substrs, 1, struct reg_substr_data);
1846
1847     StructCopy(&zero_scan_data, &data, scan_data_t);
1848     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1849     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1850         I32 fake;
1851         STRLEN longest_float_length, longest_fixed_length;
1852         struct regnode_charclass_class ch_class;
1853         int stclass_flag;
1854         I32 last_close = 0;
1855
1856         first = scan;
1857         /* Skip introductions and multiplicators >= 1. */
1858         while ((OP(first) == OPEN && (sawopen = 1)) ||
1859                /* An OR of *one* alternative - should not happen now. */
1860             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1861             (OP(first) == PLUS) ||
1862             (OP(first) == MINMOD) ||
1863                /* An {n,m} with n>0 */
1864             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1865                 if (OP(first) == PLUS)
1866                     sawplus = 1;
1867                 else
1868                     first += regarglen[(U8)OP(first)];
1869                 first = NEXTOPER(first);
1870         }
1871
1872         /* Starting-point info. */
1873       again:
1874         if (PL_regkind[(U8)OP(first)] == EXACT) {
1875             if (OP(first) == EXACT)
1876                 NOOP;   /* Empty, get anchored substr later. */
1877             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1878                 r->regstclass = first;
1879         }
1880         else if (strchr((const char*)PL_simple,OP(first)))
1881             r->regstclass = first;
1882         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1883                  PL_regkind[(U8)OP(first)] == NBOUND)
1884             r->regstclass = first;
1885         else if (PL_regkind[(U8)OP(first)] == BOL) {
1886             r->reganch |= (OP(first) == MBOL
1887                            ? ROPT_ANCH_MBOL
1888                            : (OP(first) == SBOL
1889                               ? ROPT_ANCH_SBOL
1890                               : ROPT_ANCH_BOL));
1891             first = NEXTOPER(first);
1892             goto again;
1893         }
1894         else if (OP(first) == GPOS) {
1895             r->reganch |= ROPT_ANCH_GPOS;
1896             first = NEXTOPER(first);
1897             goto again;
1898         }
1899         else if ((!sawopen || !RExC_sawback) &&
1900             (OP(first) == STAR &&
1901             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1902             !(r->reganch & ROPT_ANCH) )
1903         {
1904             /* turn .* into ^.* with an implied $*=1 */
1905             const int type =
1906                 (OP(NEXTOPER(first)) == REG_ANY)
1907                     ? ROPT_ANCH_MBOL
1908                     : ROPT_ANCH_SBOL;
1909             r->reganch |= type | ROPT_IMPLICIT;
1910             first = NEXTOPER(first);
1911             goto again;
1912         }
1913         if (sawplus && (!sawopen || !RExC_sawback)
1914             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1915             /* x+ must match at the 1st pos of run of x's */
1916             r->reganch |= ROPT_SKIP;
1917
1918         /* Scan is after the zeroth branch, first is atomic matcher. */
1919         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1920                               (IV)(first - scan + 1)));
1921         /*
1922         * If there's something expensive in the r.e., find the
1923         * longest literal string that must appear and make it the
1924         * regmust.  Resolve ties in favor of later strings, since
1925         * the regstart check works with the beginning of the r.e.
1926         * and avoiding duplication strengthens checking.  Not a
1927         * strong reason, but sufficient in the absence of others.
1928         * [Now we resolve ties in favor of the earlier string if
1929         * it happens that c_offset_min has been invalidated, since the
1930         * earlier string may buy us something the later one won't.]
1931         */
1932         minlen = 0;
1933
1934         data.longest_fixed = newSVpvs("");
1935         data.longest_float = newSVpvs("");
1936         data.last_found = newSVpvs("");
1937         data.longest = &(data.longest_fixed);
1938         first = scan;
1939         if (!r->regstclass) {
1940             cl_init(pRExC_state, &ch_class);
1941             data.start_class = &ch_class;
1942             stclass_flag = SCF_DO_STCLASS_AND;
1943         } else                          /* XXXX Check for BOUND? */
1944             stclass_flag = 0;
1945         data.last_closep = &last_close;
1946
1947         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1948                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1949         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1950              && data.last_start_min == 0 && data.last_end > 0
1951              && !RExC_seen_zerolen
1952              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1953             r->reganch |= ROPT_CHECK_ALL;
1954         scan_commit(pRExC_state, &data);
1955         SvREFCNT_dec(data.last_found);
1956
1957         longest_float_length = CHR_SVLEN(data.longest_float);
1958         if (longest_float_length
1959             || (data.flags & SF_FL_BEFORE_EOL
1960                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1961                     || (RExC_flags & PMf_MULTILINE)))) {
1962             int t;
1963
1964             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1965                 && data.offset_fixed == data.offset_float_min
1966                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1967                     goto remove_float;          /* As in (a)+. */
1968
1969             if (SvUTF8(data.longest_float)) {
1970                 r->float_utf8 = data.longest_float;
1971                 r->float_substr = NULL;
1972             } else {
1973                 r->float_substr = data.longest_float;
1974                 r->float_utf8 = NULL;
1975             }
1976             r->float_min_offset = data.offset_float_min;
1977             r->float_max_offset = data.offset_float_max;
1978             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1979                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1980                            || (RExC_flags & PMf_MULTILINE)));
1981             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1982         }
1983         else {
1984           remove_float:
1985             r->float_substr = r->float_utf8 = NULL;
1986             SvREFCNT_dec(data.longest_float);
1987             longest_float_length = 0;
1988         }
1989
1990         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1991         if (longest_fixed_length
1992             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1993                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1994                     || (RExC_flags & PMf_MULTILINE)))) {
1995             int t;
1996
1997             if (SvUTF8(data.longest_fixed)) {
1998                 r->anchored_utf8 = data.longest_fixed;
1999                 r->anchored_substr = NULL;
2000             } else {
2001                 r->anchored_substr = data.longest_fixed;
2002                 r->anchored_utf8 = NULL;
2003             }
2004             r->anchored_offset = data.offset_fixed;
2005             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2006                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2007                      || (RExC_flags & PMf_MULTILINE)));
2008             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2009         }
2010         else {
2011             r->anchored_substr = r->anchored_utf8 = NULL;
2012             SvREFCNT_dec(data.longest_fixed);
2013             longest_fixed_length = 0;
2014         }
2015         if (r->regstclass
2016             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2017             r->regstclass = NULL;
2018         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2019             && stclass_flag
2020             && !(data.start_class->flags & ANYOF_EOS)
2021             && !cl_is_anything(data.start_class))
2022         {
2023             const U32 n = add_data(pRExC_state, 1, "f");
2024
2025             Newx(RExC_rx->data->data[n], 1,
2026                 struct regnode_charclass_class);
2027             StructCopy(data.start_class,
2028                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2029                        struct regnode_charclass_class);
2030             r->regstclass = (regnode*)RExC_rx->data->data[n];
2031             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2032             PL_regdata = r->data; /* for regprop() */
2033             DEBUG_r({ SV *sv = sv_newmortal();
2034                       regprop(sv, (regnode*)data.start_class);
2035                       PerlIO_printf(Perl_debug_log,
2036                                     "synthetic stclass \"%s\".\n",
2037                                     SvPVX_const(sv));});
2038         }
2039
2040         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2041         if (longest_fixed_length > longest_float_length) {
2042             r->check_substr = r->anchored_substr;
2043             r->check_utf8 = r->anchored_utf8;
2044             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2045             if (r->reganch & ROPT_ANCH_SINGLE)
2046                 r->reganch |= ROPT_NOSCAN;
2047         }
2048         else {
2049             r->check_substr = r->float_substr;
2050             r->check_utf8 = r->float_utf8;
2051             r->check_offset_min = data.offset_float_min;
2052             r->check_offset_max = data.offset_float_max;
2053         }
2054         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2055            This should be changed ASAP!  */
2056         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2057             r->reganch |= RE_USE_INTUIT;
2058             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2059                 r->reganch |= RE_INTUIT_TAIL;
2060         }
2061     }
2062     else {
2063         /* Several toplevels. Best we can is to set minlen. */
2064         I32 fake;
2065         struct regnode_charclass_class ch_class;
2066         I32 last_close = 0;
2067         
2068         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2069         scan = r->program + 1;
2070         cl_init(pRExC_state, &ch_class);
2071         data.start_class = &ch_class;
2072         data.last_closep = &last_close;
2073         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2074         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2075                 = r->float_substr = r->float_utf8 = NULL;
2076         if (!(data.start_class->flags & ANYOF_EOS)
2077             && !cl_is_anything(data.start_class))
2078         {
2079             const U32 n = add_data(pRExC_state, 1, "f");
2080
2081             Newx(RExC_rx->data->data[n], 1,
2082                 struct regnode_charclass_class);
2083             StructCopy(data.start_class,
2084                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2085                        struct regnode_charclass_class);
2086             r->regstclass = (regnode*)RExC_rx->data->data[n];
2087             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2088             DEBUG_r({ SV* sv = sv_newmortal();
2089                       regprop(sv, (regnode*)data.start_class);
2090                       PerlIO_printf(Perl_debug_log,
2091                                     "synthetic stclass \"%s\".\n",
2092                                     SvPVX_const(sv));});
2093         }
2094     }
2095
2096     r->minlen = minlen;
2097     if (RExC_seen & REG_SEEN_GPOS)
2098         r->reganch |= ROPT_GPOS_SEEN;
2099     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2100         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2101     if (RExC_seen & REG_SEEN_EVAL)
2102         r->reganch |= ROPT_EVAL_SEEN;
2103     if (RExC_seen & REG_SEEN_CANY)
2104         r->reganch |= ROPT_CANY_SEEN;
2105     Newxz(r->startp, RExC_npar, I32);
2106     Newxz(r->endp, RExC_npar, I32);
2107     PL_regdata = r->data; /* for regprop() */
2108     DEBUG_r(regdump(r));
2109     return(r);
2110 }
2111
2112 /*
2113  - reg - regular expression, i.e. main body or parenthesized thing
2114  *
2115  * Caller must absorb opening parenthesis.
2116  *
2117  * Combining parenthesis handling with the base level of regular expression
2118  * is a trifle forced, but the need to tie the tails of the branches to what
2119  * follows makes it hard to avoid.
2120  */
2121 STATIC regnode *
2122 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2123     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2124 {
2125     register regnode *ret;              /* Will be the head of the group. */
2126     register regnode *br;
2127     register regnode *lastbr;
2128     register regnode *ender = NULL;
2129     register I32 parno = 0;
2130     I32 flags;
2131     const I32 oregflags = RExC_flags;
2132     bool have_branch = 0;
2133     bool is_open = 0;
2134
2135     /* for (?g), (?gc), and (?o) warnings; warning
2136        about (?c) will warn about (?g) -- japhy    */
2137
2138 #define WASTED_O  0x01
2139 #define WASTED_G  0x02
2140 #define WASTED_C  0x04
2141 #define WASTED_GC (0x02|0x04)
2142     I32 wastedflags = 0x00;
2143
2144     char * parse_start = RExC_parse; /* MJD */
2145     char * const oregcomp_parse = RExC_parse;
2146
2147     *flagp = 0;                         /* Tentatively. */
2148
2149
2150     /* Make an OPEN node, if parenthesized. */
2151     if (paren) {
2152         if (*RExC_parse == '?') { /* (?...) */
2153             U32 posflags = 0, negflags = 0;
2154             U32 *flagsp = &posflags;
2155             bool is_logical = 0;
2156             const char * const seqstart = RExC_parse;
2157
2158             RExC_parse++;
2159             paren = *RExC_parse++;
2160             ret = NULL;                 /* For look-ahead/behind. */
2161             switch (paren) {
2162             case '<':           /* (?<...) */
2163                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2164                 if (*RExC_parse == '!')
2165                     paren = ',';
2166                 if (*RExC_parse != '=' && *RExC_parse != '!')
2167                     goto unknown;
2168                 RExC_parse++;
2169             case '=':           /* (?=...) */
2170             case '!':           /* (?!...) */
2171                 RExC_seen_zerolen++;
2172             case ':':           /* (?:...) */
2173             case '>':           /* (?>...) */
2174                 break;
2175             case '$':           /* (?$...) */
2176             case '@':           /* (?@...) */
2177                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2178                 break;
2179             case '#':           /* (?#...) */
2180                 while (*RExC_parse && *RExC_parse != ')')
2181                     RExC_parse++;
2182                 if (*RExC_parse != ')')
2183                     FAIL("Sequence (?#... not terminated");
2184                 nextchar(pRExC_state);
2185                 *flagp = TRYAGAIN;
2186                 return NULL;
2187             case 'p':           /* (?p...) */
2188                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2189                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2190                 /* FALL THROUGH*/
2191             case '?':           /* (??...) */
2192                 is_logical = 1;
2193                 if (*RExC_parse != '{')
2194                     goto unknown;
2195                 paren = *RExC_parse++;
2196                 /* FALL THROUGH */
2197             case '{':           /* (?{...}) */
2198             {
2199                 I32 count = 1;
2200                 U32 n = 0;
2201                 char c;
2202                 char *s = RExC_parse;
2203
2204                 RExC_seen_zerolen++;
2205                 RExC_seen |= REG_SEEN_EVAL;
2206                 while (count && (c = *RExC_parse)) {
2207                     if (c == '\\') {
2208                         if (RExC_parse[1])
2209                             RExC_parse++;
2210                     }
2211                     else if (c == '{')
2212                         count++;
2213                     else if (c == '}')
2214                         count--;
2215                     RExC_parse++;
2216                 }
2217                 if (*RExC_parse != ')') {
2218                     RExC_parse = s;             
2219                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2220                 }
2221                 if (!SIZE_ONLY) {
2222                     PAD *pad;
2223                     OP_4tree *sop, *rop;
2224                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
2225
2226                     ENTER;
2227                     Perl_save_re_context(aTHX);
2228                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2229                     sop->op_private |= OPpREFCOUNTED;
2230                     /* re_dup will OpREFCNT_inc */
2231                     OpREFCNT_set(sop, 1);
2232                     LEAVE;
2233
2234                     n = add_data(pRExC_state, 3, "nop");
2235                     RExC_rx->data->data[n] = (void*)rop;
2236                     RExC_rx->data->data[n+1] = (void*)sop;
2237                     RExC_rx->data->data[n+2] = (void*)pad;
2238                     SvREFCNT_dec(sv);
2239                 }
2240                 else {                                          /* First pass */
2241                     if (PL_reginterp_cnt < ++RExC_seen_evals
2242                         && IN_PERL_RUNTIME)
2243                         /* No compiled RE interpolated, has runtime
2244                            components ===> unsafe.  */
2245                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2246                     if (PL_tainting && PL_tainted)
2247                         FAIL("Eval-group in insecure regular expression");
2248                 }
2249                 
2250                 nextchar(pRExC_state);
2251                 if (is_logical) {
2252                     ret = reg_node(pRExC_state, LOGICAL);
2253                     if (!SIZE_ONLY)
2254                         ret->flags = 2;
2255                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2256                     /* deal with the length of this later - MJD */
2257                     return ret;
2258                 }
2259                 ret = reganode(pRExC_state, EVAL, n);
2260                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2261                 Set_Node_Offset(ret, parse_start);
2262                 return ret;
2263             }
2264             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2265             {
2266                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2267                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2268                         || RExC_parse[1] == '<'
2269                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2270                         I32 flag;
2271                         
2272                         ret = reg_node(pRExC_state, LOGICAL);
2273                         if (!SIZE_ONLY)
2274                             ret->flags = 1;
2275                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2276                         goto insert_if;
2277                     }
2278                 }
2279                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2280                     /* (?(1)...) */
2281                     char c;
2282                     parno = atoi(RExC_parse++);
2283
2284                     while (isDIGIT(*RExC_parse))
2285                         RExC_parse++;
2286                     ret = reganode(pRExC_state, GROUPP, parno);
2287
2288                     if ((c = *nextchar(pRExC_state)) != ')')
2289                         vFAIL("Switch condition not recognized");
2290                   insert_if:
2291                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2292                     br = regbranch(pRExC_state, &flags, 1);
2293                     if (br == NULL)
2294                         br = reganode(pRExC_state, LONGJMP, 0);
2295                     else
2296                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2297                     c = *nextchar(pRExC_state);
2298                     if (flags&HASWIDTH)
2299                         *flagp |= HASWIDTH;
2300                     if (c == '|') {
2301                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2302                         regbranch(pRExC_state, &flags, 1);
2303                         regtail(pRExC_state, ret, lastbr);
2304                         if (flags&HASWIDTH)
2305                             *flagp |= HASWIDTH;
2306                         c = *nextchar(pRExC_state);
2307                     }
2308                     else
2309                         lastbr = NULL;
2310                     if (c != ')')
2311                         vFAIL("Switch (?(condition)... contains too many branches");
2312                     ender = reg_node(pRExC_state, TAIL);
2313                     regtail(pRExC_state, br, ender);
2314                     if (lastbr) {
2315                         regtail(pRExC_state, lastbr, ender);
2316                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2317                     }
2318                     else
2319                         regtail(pRExC_state, ret, ender);
2320                     return ret;
2321                 }
2322                 else {
2323                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2324                 }
2325             }
2326             case 0:
2327                 RExC_parse--; /* for vFAIL to print correctly */
2328                 vFAIL("Sequence (? incomplete");
2329                 break;
2330             default:
2331                 --RExC_parse;
2332               parse_flags:      /* (?i) */
2333                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2334                     /* (?g), (?gc) and (?o) are useless here
2335                        and must be globally applied -- japhy */
2336
2337                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2338                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2339                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
2340                             if (! (wastedflags & wflagbit) ) {
2341                                 wastedflags |= wflagbit;
2342                                 vWARN5(
2343                                     RExC_parse + 1,
2344                                     "Useless (%s%c) - %suse /%c modifier",
2345                                     flagsp == &negflags ? "?-" : "?",
2346                                     *RExC_parse,
2347                                     flagsp == &negflags ? "don't " : "",
2348                                     *RExC_parse
2349                                 );
2350                             }
2351                         }
2352                     }
2353                     else if (*RExC_parse == 'c') {
2354                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2355                             if (! (wastedflags & WASTED_C) ) {
2356                                 wastedflags |= WASTED_GC;
2357                                 vWARN3(
2358                                     RExC_parse + 1,
2359                                     "Useless (%sc) - %suse /gc modifier",
2360                                     flagsp == &negflags ? "?-" : "?",
2361                                     flagsp == &negflags ? "don't " : ""
2362                                 );
2363                             }
2364                         }
2365                     }
2366                     else { pmflag(flagsp, *RExC_parse); }
2367
2368                     ++RExC_parse;
2369                 }
2370                 if (*RExC_parse == '-') {
2371                     flagsp = &negflags;
2372                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2373                     ++RExC_parse;
2374                     goto parse_flags;
2375                 }
2376                 RExC_flags |= posflags;
2377                 RExC_flags &= ~negflags;
2378                 if (*RExC_parse == ':') {
2379                     RExC_parse++;
2380                     paren = ':';
2381                     break;
2382                 }               
2383               unknown:
2384                 if (*RExC_parse != ')') {
2385                     RExC_parse++;
2386                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2387                 }
2388                 nextchar(pRExC_state);
2389                 *flagp = TRYAGAIN;
2390                 return NULL;
2391             }
2392         }
2393         else {                  /* (...) */
2394             parno = RExC_npar;
2395             RExC_npar++;
2396             ret = reganode(pRExC_state, OPEN, parno);
2397             Set_Node_Length(ret, 1); /* MJD */
2398             Set_Node_Offset(ret, RExC_parse); /* MJD */
2399             is_open = 1;
2400         }
2401     }
2402     else                        /* ! paren */
2403         ret = NULL;
2404
2405     /* Pick up the branches, linking them together. */
2406     parse_start = RExC_parse;   /* MJD */
2407     br = regbranch(pRExC_state, &flags, 1);
2408     /*     branch_len = (paren != 0); */
2409
2410     if (br == NULL)
2411         return(NULL);
2412     if (*RExC_parse == '|') {
2413         if (!SIZE_ONLY && RExC_extralen) {
2414             reginsert(pRExC_state, BRANCHJ, br);
2415         }
2416         else {                  /* MJD */
2417             reginsert(pRExC_state, BRANCH, br);
2418             Set_Node_Length(br, paren != 0);
2419             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2420         }
2421         have_branch = 1;
2422         if (SIZE_ONLY)
2423             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2424     }
2425     else if (paren == ':') {
2426         *flagp |= flags&SIMPLE;
2427     }
2428     if (is_open) {                              /* Starts with OPEN. */
2429         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2430     }
2431     else if (paren != '?')              /* Not Conditional */
2432         ret = br;
2433     *flagp |= flags & (SPSTART | HASWIDTH);
2434     lastbr = br;
2435     while (*RExC_parse == '|') {
2436         if (!SIZE_ONLY && RExC_extralen) {
2437             ender = reganode(pRExC_state, LONGJMP,0);
2438             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2439         }
2440         if (SIZE_ONLY)
2441             RExC_extralen += 2;         /* Account for LONGJMP. */
2442         nextchar(pRExC_state);
2443         br = regbranch(pRExC_state, &flags, 0);
2444
2445         if (br == NULL)
2446             return(NULL);
2447         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2448         lastbr = br;
2449         if (flags&HASWIDTH)
2450             *flagp |= HASWIDTH;
2451         *flagp |= flags&SPSTART;
2452     }
2453
2454     if (have_branch || paren != ':') {
2455         /* Make a closing node, and hook it on the end. */
2456         switch (paren) {
2457         case ':':
2458             ender = reg_node(pRExC_state, TAIL);
2459             break;
2460         case 1:
2461             ender = reganode(pRExC_state, CLOSE, parno);
2462             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2463             Set_Node_Length(ender,1); /* MJD */
2464             break;
2465         case '<':
2466         case ',':
2467         case '=':
2468         case '!':
2469             *flagp &= ~HASWIDTH;
2470             /* FALL THROUGH */
2471         case '>':
2472             ender = reg_node(pRExC_state, SUCCEED);
2473             break;
2474         case 0:
2475             ender = reg_node(pRExC_state, END);
2476             break;
2477         }
2478         regtail(pRExC_state, lastbr, ender);
2479
2480         if (have_branch && !SIZE_ONLY) {
2481             /* Hook the tails of the branches to the closing node. */
2482             for (br = ret; br; br = regnext(br)) {
2483                 const U8 op = PL_regkind[OP(br)];
2484                 if (op == BRANCH) {
2485                     regtail(pRExC_state, NEXTOPER(br), ender);
2486                 }
2487                 else if (op == BRANCHJ) {
2488                     regtail(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
2489                 }
2490             }
2491         }
2492     }
2493
2494     {
2495         const char *p;
2496         static const char parens[] = "=!<,>";
2497
2498         if (paren && (p = strchr(parens, paren))) {
2499             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2500             int flag = (p - parens) > 1;
2501
2502             if (paren == '>')
2503                 node = SUSPEND, flag = 0;
2504             reginsert(pRExC_state, node,ret);
2505             Set_Node_Cur_Length(ret);
2506             Set_Node_Offset(ret, parse_start + 1);
2507             ret->flags = flag;
2508             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2509         }
2510     }
2511
2512     /* Check for proper termination. */
2513     if (paren) {
2514         RExC_flags = oregflags;
2515         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2516             RExC_parse = oregcomp_parse;
2517             vFAIL("Unmatched (");
2518         }
2519     }
2520     else if (!paren && RExC_parse < RExC_end) {
2521         if (*RExC_parse == ')') {
2522             RExC_parse++;
2523             vFAIL("Unmatched )");
2524         }
2525         else
2526             FAIL("Junk on end of regexp");      /* "Can't happen". */
2527         /* NOTREACHED */
2528     }
2529
2530     return(ret);
2531 }
2532
2533 /*
2534  - regbranch - one alternative of an | operator
2535  *
2536  * Implements the concatenation operator.
2537  */
2538 STATIC regnode *
2539 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2540 {
2541     register regnode *ret;
2542     register regnode *chain = NULL;
2543     register regnode *latest;
2544     I32 flags = 0, c = 0;
2545
2546     if (first)
2547         ret = NULL;
2548     else {
2549         if (!SIZE_ONLY && RExC_extralen)
2550             ret = reganode(pRExC_state, BRANCHJ,0);
2551         else {
2552             ret = reg_node(pRExC_state, BRANCH);
2553             Set_Node_Length(ret, 1);
2554         }
2555     }
2556         
2557     if (!first && SIZE_ONLY)
2558         RExC_extralen += 1;                     /* BRANCHJ */
2559
2560     *flagp = WORST;                     /* Tentatively. */
2561
2562     RExC_parse--;
2563     nextchar(pRExC_state);
2564     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2565         flags &= ~TRYAGAIN;
2566         latest = regpiece(pRExC_state, &flags);
2567         if (latest == NULL) {
2568             if (flags & TRYAGAIN)
2569                 continue;
2570             return(NULL);
2571         }
2572         else if (ret == NULL)
2573             ret = latest;
2574         *flagp |= flags&HASWIDTH;
2575         if (chain == NULL)      /* First piece. */
2576             *flagp |= flags&SPSTART;
2577         else {
2578             RExC_naughty++;
2579             regtail(pRExC_state, chain, latest);
2580         }
2581         chain = latest;
2582         c++;
2583     }
2584     if (chain == NULL) {        /* Loop ran zero times. */
2585         chain = reg_node(pRExC_state, NOTHING);
2586         if (ret == NULL)
2587             ret = chain;
2588     }
2589     if (c == 1) {
2590         *flagp |= flags&SIMPLE;
2591     }
2592
2593     return ret;
2594 }
2595
2596 /*
2597  - regpiece - something followed by possible [*+?]
2598  *
2599  * Note that the branching code sequences used for ? and the general cases
2600  * of * and + are somewhat optimized:  they use the same NOTHING node as
2601  * both the endmarker for their branch list and the body of the last branch.
2602  * It might seem that this node could be dispensed with entirely, but the
2603  * endmarker role is not redundant.
2604  */
2605 STATIC regnode *
2606 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2607 {
2608     register regnode *ret;
2609     register char op;
2610     register char *next;
2611     I32 flags;
2612     const char * const origparse = RExC_parse;
2613     I32 min;
2614     I32 max = REG_INFTY;
2615     char *parse_start;
2616     const char *maxpos = NULL;
2617
2618     ret = regatom(pRExC_state, &flags);
2619     if (ret == NULL) {
2620         if (flags & TRYAGAIN)
2621             *flagp |= TRYAGAIN;
2622         return(NULL);
2623     }
2624
2625     op = *RExC_parse;
2626
2627     if (op == '{' && regcurly(RExC_parse)) {
2628         parse_start = RExC_parse; /* MJD */
2629         next = RExC_parse + 1;
2630         maxpos = NULL;
2631         while (isDIGIT(*next) || *next == ',') {
2632             if (*next == ',') {
2633                 if (maxpos)
2634                     break;
2635                 else
2636                     maxpos = next;
2637             }
2638             next++;
2639         }
2640         if (*next == '}') {             /* got one */
2641             if (!maxpos)
2642                 maxpos = next;
2643             RExC_parse++;
2644             min = atoi(RExC_parse);
2645             if (*maxpos == ',')
2646                 maxpos++;
2647             else
2648                 maxpos = RExC_parse;
2649             max = atoi(maxpos);
2650             if (!max && *maxpos != '0')
2651                 max = REG_INFTY;                /* meaning "infinity" */
2652             else if (max >= REG_INFTY)
2653                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2654             RExC_parse = next;
2655             nextchar(pRExC_state);
2656
2657         do_curly:
2658             if ((flags&SIMPLE)) {
2659                 RExC_naughty += 2 + RExC_naughty / 2;
2660                 reginsert(pRExC_state, CURLY, ret);
2661                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2662                 Set_Node_Cur_Length(ret);
2663             }
2664             else {
2665                 regnode *w = reg_node(pRExC_state, WHILEM);
2666
2667                 w->flags = 0;
2668                 regtail(pRExC_state, ret, w);
2669                 if (!SIZE_ONLY && RExC_extralen) {
2670                     reginsert(pRExC_state, LONGJMP,ret);
2671                     reginsert(pRExC_state, NOTHING,ret);
2672                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2673                 }
2674                 reginsert(pRExC_state, CURLYX,ret);
2675                                 /* MJD hk */
2676                 Set_Node_Offset(ret, parse_start+1);
2677                 Set_Node_Length(ret,
2678                                 op == '{' ? (RExC_parse - parse_start) : 1);
2679
2680                 if (!SIZE_ONLY && RExC_extralen)
2681                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2682                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2683                 if (SIZE_ONLY)
2684                     RExC_whilem_seen++, RExC_extralen += 3;
2685                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2686             }
2687             ret->flags = 0;
2688
2689             if (min > 0)
2690                 *flagp = WORST;
2691             if (max > 0)
2692                 *flagp |= HASWIDTH;
2693             if (max && max < min)
2694                 vFAIL("Can't do {n,m} with n > m");
2695             if (!SIZE_ONLY) {
2696                 ARG1_SET(ret, (U16)min);
2697                 ARG2_SET(ret, (U16)max);
2698             }
2699
2700             goto nest_check;
2701         }
2702     }
2703
2704     if (!ISMULT1(op)) {
2705         *flagp = flags;
2706         return(ret);
2707     }
2708
2709 #if 0                           /* Now runtime fix should be reliable. */
2710
2711     /* if this is reinstated, don't forget to put this back into perldiag:
2712
2713             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2714
2715            (F) The part of the regexp subject to either the * or + quantifier
2716            could match an empty string. The {#} shows in the regular
2717            expression about where the problem was discovered.
2718
2719     */
2720
2721     if (!(flags&HASWIDTH) && op != '?')
2722       vFAIL("Regexp *+ operand could be empty");
2723 #endif
2724
2725     parse_start = RExC_parse;
2726     nextchar(pRExC_state);
2727
2728     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2729
2730     if (op == '*' && (flags&SIMPLE)) {
2731         reginsert(pRExC_state, STAR, ret);
2732         ret->flags = 0;
2733         RExC_naughty += 4;
2734     }
2735     else if (op == '*') {
2736         min = 0;
2737         goto do_curly;
2738     }
2739     else if (op == '+' && (flags&SIMPLE)) {
2740         reginsert(pRExC_state, PLUS, ret);
2741         ret->flags = 0;
2742         RExC_naughty += 3;
2743     }
2744     else if (op == '+') {
2745         min = 1;
2746         goto do_curly;
2747     }
2748     else if (op == '?') {
2749         min = 0; max = 1;
2750         goto do_curly;
2751     }
2752   nest_check:
2753     if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2754         vWARN3(RExC_parse,
2755                "%.*s matches null string many times",
2756                (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
2757                origparse);
2758     }
2759
2760     if (*RExC_parse == '?') {
2761         nextchar(pRExC_state);
2762         reginsert(pRExC_state, MINMOD, ret);
2763         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2764     }
2765     if (ISMULT2(RExC_parse)) {
2766         RExC_parse++;
2767         vFAIL("Nested quantifiers");
2768     }
2769
2770     return(ret);
2771 }
2772
2773 /*
2774  - regatom - the lowest level
2775  *
2776  * Optimization:  gobbles an entire sequence of ordinary characters so that
2777  * it can turn them into a single node, which is smaller to store and
2778  * faster to run.  Backslashed characters are exceptions, each becoming a
2779  * separate node; the code is simpler that way and it's not worth fixing.
2780  *
2781  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2782 STATIC regnode *
2783 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2784 {
2785     register regnode *ret = NULL;
2786     I32 flags;
2787     char *parse_start = RExC_parse;
2788
2789     *flagp = WORST;             /* Tentatively. */
2790
2791 tryagain:
2792     switch (*RExC_parse) {
2793     case '^':
2794         RExC_seen_zerolen++;
2795         nextchar(pRExC_state);
2796         if (RExC_flags & PMf_MULTILINE)
2797             ret = reg_node(pRExC_state, MBOL);
2798         else if (RExC_flags & PMf_SINGLELINE)
2799             ret = reg_node(pRExC_state, SBOL);
2800         else
2801             ret = reg_node(pRExC_state, BOL);
2802         Set_Node_Length(ret, 1); /* MJD */
2803         break;
2804     case '$':
2805         nextchar(pRExC_state);
2806         if (*RExC_parse)
2807             RExC_seen_zerolen++;
2808         if (RExC_flags & PMf_MULTILINE)
2809             ret = reg_node(pRExC_state, MEOL);
2810         else if (RExC_flags & PMf_SINGLELINE)
2811             ret = reg_node(pRExC_state, SEOL);
2812         else
2813             ret = reg_node(pRExC_state, EOL);
2814         Set_Node_Length(ret, 1); /* MJD */
2815         break;
2816     case '.':
2817         nextchar(pRExC_state);
2818         if (RExC_flags & PMf_SINGLELINE)
2819             ret = reg_node(pRExC_state, SANY);
2820         else
2821             ret = reg_node(pRExC_state, REG_ANY);
2822         *flagp |= HASWIDTH|SIMPLE;
2823         RExC_naughty++;
2824         Set_Node_Length(ret, 1); /* MJD */
2825         break;
2826     case '[':
2827     {
2828         char *oregcomp_parse = ++RExC_parse;
2829         ret = regclass(pRExC_state);
2830         if (*RExC_parse != ']') {
2831             RExC_parse = oregcomp_parse;
2832             vFAIL("Unmatched [");
2833         }
2834         nextchar(pRExC_state);
2835         *flagp |= HASWIDTH|SIMPLE;
2836         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2837         break;
2838     }
2839     case '(':
2840         nextchar(pRExC_state);
2841         ret = reg(pRExC_state, 1, &flags);
2842         if (ret == NULL) {
2843                 if (flags & TRYAGAIN) {
2844                     if (RExC_parse == RExC_end) {
2845                          /* Make parent create an empty node if needed. */
2846                         *flagp |= TRYAGAIN;
2847                         return(NULL);
2848                     }
2849                     goto tryagain;
2850                 }
2851                 return(NULL);
2852         }
2853         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2854         break;
2855     case '|':
2856     case ')':
2857         if (flags & TRYAGAIN) {
2858             *flagp |= TRYAGAIN;
2859             return NULL;
2860         }
2861         vFAIL("Internal urp");
2862                                 /* Supposed to be caught earlier. */
2863         break;
2864     case '{':
2865         if (!regcurly(RExC_parse)) {
2866             RExC_parse++;
2867             goto defchar;
2868         }
2869         /* FALL THROUGH */
2870     case '?':
2871     case '+':
2872     case '*':
2873         RExC_parse++;
2874         vFAIL("Quantifier follows nothing");
2875         break;
2876     case '\\':
2877         switch (*++RExC_parse) {
2878         case 'A':
2879             RExC_seen_zerolen++;
2880             ret = reg_node(pRExC_state, SBOL);
2881             *flagp |= SIMPLE;
2882             nextchar(pRExC_state);
2883             Set_Node_Length(ret, 2); /* MJD */
2884             break;
2885         case 'G':
2886             ret = reg_node(pRExC_state, GPOS);
2887             RExC_seen |= REG_SEEN_GPOS;
2888             *flagp |= SIMPLE;
2889             nextchar(pRExC_state);
2890             Set_Node_Length(ret, 2); /* MJD */
2891             break;
2892         case 'Z':
2893             ret = reg_node(pRExC_state, SEOL);
2894             *flagp |= SIMPLE;
2895             RExC_seen_zerolen++;                /* Do not optimize RE away */
2896             nextchar(pRExC_state);
2897             break;
2898         case 'z':
2899             ret = reg_node(pRExC_state, EOS);
2900             *flagp |= SIMPLE;
2901             RExC_seen_zerolen++;                /* Do not optimize RE away */
2902             nextchar(pRExC_state);
2903             Set_Node_Length(ret, 2); /* MJD */
2904             break;
2905         case 'C':
2906             ret = reg_node(pRExC_state, CANY);
2907             RExC_seen |= REG_SEEN_CANY;
2908             *flagp |= HASWIDTH|SIMPLE;
2909             nextchar(pRExC_state);
2910             Set_Node_Length(ret, 2); /* MJD */
2911             break;
2912         case 'X':
2913             ret = reg_node(pRExC_state, CLUMP);
2914             *flagp |= HASWIDTH;
2915             nextchar(pRExC_state);
2916             Set_Node_Length(ret, 2); /* MJD */
2917             break;
2918         case 'w':
2919             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2920             *flagp |= HASWIDTH|SIMPLE;
2921             nextchar(pRExC_state);
2922             Set_Node_Length(ret, 2); /* MJD */
2923             break;
2924         case 'W':
2925             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2926             *flagp |= HASWIDTH|SIMPLE;
2927             nextchar(pRExC_state);
2928             Set_Node_Length(ret, 2); /* MJD */
2929             break;
2930         case 'b':
2931             RExC_seen_zerolen++;
2932             RExC_seen |= REG_SEEN_LOOKBEHIND;
2933             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2934             *flagp |= SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'B':
2939             RExC_seen_zerolen++;
2940             RExC_seen |= REG_SEEN_LOOKBEHIND;
2941             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2942             *flagp |= SIMPLE;
2943             nextchar(pRExC_state);
2944             Set_Node_Length(ret, 2); /* MJD */
2945             break;
2946         case 's':
2947             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2948             *flagp |= HASWIDTH|SIMPLE;
2949             nextchar(pRExC_state);
2950             Set_Node_Length(ret, 2); /* MJD */
2951             break;
2952         case 'S':
2953             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2954             *flagp |= HASWIDTH|SIMPLE;
2955             nextchar(pRExC_state);
2956             Set_Node_Length(ret, 2); /* MJD */
2957             break;
2958         case 'd':
2959             ret = reg_node(pRExC_state, DIGIT);
2960             *flagp |= HASWIDTH|SIMPLE;
2961             nextchar(pRExC_state);
2962             Set_Node_Length(ret, 2); /* MJD */
2963             break;
2964         case 'D':
2965             ret = reg_node(pRExC_state, NDIGIT);
2966             *flagp |= HASWIDTH|SIMPLE;
2967             nextchar(pRExC_state);
2968             Set_Node_Length(ret, 2); /* MJD */
2969             break;
2970         case 'p':
2971         case 'P':
2972             {   
2973                 char* oldregxend = RExC_end;
2974                 char* parse_start = RExC_parse - 2;
2975
2976                 if (RExC_parse[1] == '{') {
2977                   /* a lovely hack--pretend we saw [\pX] instead */
2978                     RExC_end = strchr(RExC_parse, '}');
2979                     if (!RExC_end) {
2980                         U8 c = (U8)*RExC_parse;
2981                         RExC_parse += 2;
2982                         RExC_end = oldregxend;
2983                         vFAIL2("Missing right brace on \\%c{}", c);
2984                     }
2985                     RExC_end++;
2986                 }
2987                 else {
2988                     RExC_end = RExC_parse + 2;
2989                     if (RExC_end > oldregxend)
2990                         RExC_end = oldregxend;
2991                 }
2992                 RExC_parse--;
2993
2994                 ret = regclass(pRExC_state);
2995
2996                 RExC_end = oldregxend;
2997                 RExC_parse--;
2998
2999                 Set_Node_Offset(ret, parse_start + 2);
3000                 Set_Node_Cur_Length(ret);
3001                 nextchar(pRExC_state);
3002                 *flagp |= HASWIDTH|SIMPLE;
3003             }
3004             break;
3005         case 'n':
3006         case 'r':
3007         case 't':
3008         case 'f':
3009         case 'e':
3010         case 'a':
3011         case 'x':
3012         case 'c':
3013         case '0':
3014             goto defchar;
3015         case '1': case '2': case '3': case '4':
3016         case '5': case '6': case '7': case '8': case '9':
3017             {
3018                 const I32 num = atoi(RExC_parse);
3019
3020                 if (num > 9 && num >= RExC_npar)
3021                     goto defchar;
3022                 else {
3023                     char * parse_start = RExC_parse - 1; /* MJD */
3024                     while (isDIGIT(*RExC_parse))
3025                         RExC_parse++;
3026
3027                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3028                         vFAIL("Reference to nonexistent group");
3029                     RExC_sawback = 1;
3030                     ret = reganode(pRExC_state,
3031                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3032                                    num);
3033                     *flagp |= HASWIDTH;
3034
3035                     /* override incorrect value set in reganode MJD */
3036                     Set_Node_Offset(ret, parse_start+1);
3037                     Set_Node_Cur_Length(ret); /* MJD */
3038                     RExC_parse--;
3039                     nextchar(pRExC_state);
3040                 }
3041             }
3042             break;
3043         case '\0':
3044             if (RExC_parse >= RExC_end)
3045                 FAIL("Trailing \\");
3046             /* FALL THROUGH */
3047         default:
3048             /* Do not generate "unrecognized" warnings here, we fall
3049                back into the quick-grab loop below */
3050             parse_start--;
3051             goto defchar;
3052         }
3053         break;
3054
3055     case '#':
3056         if (RExC_flags & PMf_EXTENDED) {
3057             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3058             if (RExC_parse < RExC_end)
3059                 goto tryagain;
3060         }
3061         /* FALL THROUGH */
3062
3063     default: {
3064             register STRLEN len;
3065             register UV ender;
3066             register char *p;
3067             char *oldp, *s;
3068             STRLEN foldlen;
3069             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3070
3071             parse_start = RExC_parse - 1;
3072
3073             RExC_parse++;
3074
3075         defchar:
3076             ender = 0;
3077             ret = reg_node(pRExC_state,
3078                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3079             s = STRING(ret);
3080             for (len = 0, p = RExC_parse - 1;
3081               len < 127 && p < RExC_end;
3082               len++)
3083             {
3084                 oldp = p;
3085
3086                 if (RExC_flags & PMf_EXTENDED)
3087                     p = regwhite(p, RExC_end);
3088                 switch (*p) {
3089                 case '^':
3090                 case '$':
3091                 case '.':
3092                 case '[':
3093                 case '(':
3094                 case ')':
3095                 case '|':
3096                     goto loopdone;
3097                 case '\\':
3098                     switch (*++p) {
3099                     case 'A':
3100                     case 'C':
3101                     case 'X':
3102                     case 'G':
3103                     case 'Z':
3104                     case 'z':
3105                     case 'w':
3106                     case 'W':
3107                     case 'b':
3108                     case 'B':
3109                     case 's':
3110                     case 'S':
3111                     case 'd':
3112                     case 'D':
3113                     case 'p':
3114                     case 'P':
3115                         --p;
3116                         goto loopdone;
3117                     case 'n':
3118                         ender = '\n';
3119                         p++;
3120                         break;
3121                     case 'r':
3122                         ender = '\r';
3123                         p++;
3124                         break;
3125                     case 't':
3126                         ender = '\t';
3127                         p++;
3128                         break;
3129                     case 'f':
3130                         ender = '\f';
3131                         p++;
3132                         break;
3133                     case 'e':
3134                           ender = ASCII_TO_NATIVE('\033');
3135                         p++;
3136                         break;
3137                     case 'a':
3138                           ender = ASCII_TO_NATIVE('\007');
3139                         p++;
3140                         break;
3141                     case 'x':
3142                         if (*++p == '{') {
3143                             char* const e = strchr(p, '}');
3144         
3145                             if (!e) {
3146                                 RExC_parse = p + 1;
3147                                 vFAIL("Missing right brace on \\x{}");
3148                             }
3149                             else {
3150                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3151                                     | PERL_SCAN_DISALLOW_PREFIX;
3152                                 STRLEN numlen = e - p - 1;
3153                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3154                                 if (ender > 0xff)
3155                                     RExC_utf8 = 1;
3156                                 p = e + 1;
3157                             }
3158                         }
3159                         else {
3160                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3161                             STRLEN numlen = 2;
3162                             ender = grok_hex(p, &numlen, &flags, NULL);
3163                             p += numlen;
3164                         }
3165                         break;
3166                     case 'c':
3167                         p++;
3168                         ender = UCHARAT(p++);
3169                         ender = toCTRL(ender);
3170                         break;
3171                     case '0': case '1': case '2': case '3':case '4':
3172                     case '5': case '6': case '7': case '8':case '9':
3173                         if (*p == '0' ||
3174                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3175                             I32 flags = 0;
3176                             STRLEN numlen = 3;
3177                             ender = grok_oct(p, &numlen, &flags, NULL);
3178                             p += numlen;
3179                         }
3180                         else {
3181                             --p;
3182                             goto loopdone;
3183                         }
3184                         break;
3185                     case '\0':
3186                         if (p >= RExC_end)
3187                             FAIL("Trailing \\");
3188                         /* FALL THROUGH */
3189                     default:
3190                         if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3191                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3192                         goto normal_default;
3193                     }
3194                     break;
3195                 default:
3196                   normal_default:
3197                     if (UTF8_IS_START(*p) && UTF) {
3198                         STRLEN numlen;
3199                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3200                                                &numlen, 0);
3201                         p += numlen;
3202                     }
3203                     else
3204                         ender = *p++;
3205                     break;
3206                 }
3207                 if (RExC_flags & PMf_EXTENDED)
3208                     p = regwhite(p, RExC_end);
3209                 if (UTF && FOLD) {
3210                     /* Prime the casefolded buffer. */
3211                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3212                 }
3213                 if (ISMULT2(p)) { /* Back off on ?+*. */
3214                     if (len)
3215                         p = oldp;
3216                     else if (UTF) {
3217                          if (FOLD) {
3218                               /* Emit all the Unicode characters. */
3219                               STRLEN numlen;
3220                               for (foldbuf = tmpbuf;
3221                                    foldlen;
3222                                    foldlen -= numlen) {
3223                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3224                                    if (numlen > 0) {
3225                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
3226                                         s       += unilen;
3227                                         len     += unilen;
3228                                         /* In EBCDIC the numlen
3229                                          * and unilen can differ. */
3230                                         foldbuf += numlen;
3231                                         if (numlen >= foldlen)
3232                                              break;
3233                                    }
3234                                    else
3235                                         break; /* "Can't happen." */
3236                               }
3237                          }
3238                          else {
3239                               const STRLEN unilen = reguni(pRExC_state, ender, s);
3240                               if (unilen > 0) {
3241                                    s   += unilen;
3242                                    len += unilen;
3243                               }
3244                          }
3245                     }
3246                     else {
3247                         len++;
3248                         REGC((char)ender, s++);
3249                     }
3250                     break;
3251                 }
3252                 if (UTF) {
3253                      if (FOLD) {
3254                           /* Emit all the Unicode characters. */
3255                           STRLEN numlen;
3256                           for (foldbuf = tmpbuf;
3257                                foldlen;
3258                                foldlen -= numlen) {
3259                                ender = utf8_to_uvchr(foldbuf, &numlen);
3260                                if (numlen > 0) {
3261                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
3262                                     len     += unilen;
3263                                     s       += unilen;
3264                                     /* In EBCDIC the numlen
3265                                      * and unilen can differ. */
3266                                     foldbuf += numlen;
3267                                     if (numlen >= foldlen)
3268                                          break;
3269                                }
3270                                else
3271                                     break;
3272                           }
3273                      }
3274                      else {
3275                           const STRLEN unilen = reguni(pRExC_state, ender, s);
3276                           if (unilen > 0) {
3277                                s   += unilen;
3278                                len += unilen;
3279                           }
3280                      }
3281                      len--;
3282                 }
3283                 else
3284                     REGC((char)ender, s++);
3285             }
3286         loopdone:
3287             RExC_parse = p - 1;
3288             Set_Node_Cur_Length(ret); /* MJD */
3289             nextchar(pRExC_state);
3290             {
3291                 /* len is STRLEN which is unsigned, need to copy to signed */
3292                 IV iv = len;
3293                 if (iv < 0)
3294                     vFAIL("Internal disaster");
3295             }
3296             if (len > 0)
3297                 *flagp |= HASWIDTH;
3298             if (len == 1 && UNI_IS_INVARIANT(ender))
3299                 *flagp |= SIMPLE;
3300             if (!SIZE_ONLY)
3301                 STR_LEN(ret) = len;
3302             if (SIZE_ONLY)
3303                 RExC_size += STR_SZ(len);
3304             else
3305                 RExC_emit += STR_SZ(len);
3306         }
3307         break;
3308     }
3309
3310     /* If the encoding pragma is in effect recode the text of
3311      * any EXACT-kind nodes. */
3312     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3313         STRLEN oldlen = STR_LEN(ret);
3314         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3315
3316         if (RExC_utf8)
3317             SvUTF8_on(sv);
3318         if (sv_utf8_downgrade(sv, TRUE)) {
3319             const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3320             const STRLEN newlen = SvCUR(sv);
3321
3322             if (SvUTF8(sv))
3323                 RExC_utf8 = 1;
3324             if (!SIZE_ONLY) {
3325                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3326                                       (int)oldlen, STRING(ret),
3327                                       (int)newlen, s));
3328                 Copy(s, STRING(ret), newlen, char);
3329                 STR_LEN(ret) += newlen - oldlen;
3330                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3331             } else
3332                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3333         }
3334     }
3335
3336     return(ret);
3337 }
3338
3339 STATIC char *
3340 S_regwhite(char *p, const char *e)
3341 {
3342     while (p < e) {
3343         if (isSPACE(*p))
3344             ++p;
3345         else if (*p == '#') {
3346             do {
3347                 p++;
3348             } while (p < e && *p != '\n');
3349         }
3350         else
3351             break;
3352     }
3353     return p;
3354 }
3355
3356 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3357    Character classes ([:foo:]) can also be negated ([:^foo:]).
3358    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3359    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3360    but trigger failures because they are currently unimplemented. */
3361
3362 #define POSIXCC_DONE(c)   ((c) == ':')
3363 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3364 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3365
3366 STATIC I32
3367 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3368 {
3369     I32 namedclass = OOB_NAMEDCLASS;
3370
3371     if (value == '[' && RExC_parse + 1 < RExC_end &&
3372         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3373         POSIXCC(UCHARAT(RExC_parse))) {
3374         const char c = UCHARAT(RExC_parse);
3375         char* const s = RExC_parse++;
3376         
3377         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3378             RExC_parse++;
3379         if (RExC_parse == RExC_end)
3380             /* Grandfather lone [:, [=, [. */
3381             RExC_parse = s;
3382         else {
3383             const char* t = RExC_parse++; /* skip over the c */
3384             const char *posixcc;
3385
3386             assert(*t == c);
3387
3388             if (UCHARAT(RExC_parse) == ']') {
3389                 RExC_parse++; /* skip over the ending ] */
3390                 posixcc = s + 1;
3391                 if (*s == ':') {
3392                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3393                     const I32 skip = t - posixcc;
3394
3395                     /* Initially switch on the length of the name.  */
3396                     switch (skip) {
3397                     case 4:
3398                         if (memEQ(posixcc, "word", 4)) {
3399                             /* this is not POSIX, this is the Perl \w */;
3400                             namedclass
3401                                 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3402                         }
3403                         break;
3404                     case 5:
3405                         /* Names all of length 5.  */
3406                         /* alnum alpha ascii blank cntrl digit graph lower
3407                            print punct space upper  */
3408                         /* Offset 4 gives the best switch position.  */
3409                         switch (posixcc[4]) {
3410                         case 'a':
3411                             if (memEQ(posixcc, "alph", 4)) {
3412                                 /*                  a     */
3413                                 namedclass
3414                                     = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3415                             }
3416                             break;
3417                         case 'e':
3418                             if (memEQ(posixcc, "spac", 4)) {
3419                                 /*                  e     */
3420                                 namedclass
3421                                     = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3422                             }
3423                             break;
3424                         case 'h':
3425                             if (memEQ(posixcc, "grap", 4)) {
3426                                 /*                  h     */
3427                                 namedclass
3428                                     = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3429                             }
3430                             break;
3431                         case 'i':
3432                             if (memEQ(posixcc, "asci", 4)) {
3433                                 /*                  i     */
3434                                 namedclass
3435                                     = complement ? ANYOF_NASCII : ANYOF_ASCII;
3436                             }
3437                             break;
3438                         case 'k':
3439                             if (memEQ(posixcc, "blan", 4)) {
3440                                 /*                  k     */
3441                                 namedclass
3442                                     = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3443                             }
3444                             break;
3445                         case 'l':
3446                             if (memEQ(posixcc, "cntr", 4)) {
3447                                 /*                  l     */
3448                                 namedclass
3449                                     = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3450                             }
3451                             break;
3452                         case 'm':
3453                             if (memEQ(posixcc, "alnu", 4)) {
3454                                 /*                  m     */
3455                                 namedclass
3456                                     = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3457                             }
3458                             break;
3459                         case 'r':
3460                             if (memEQ(posixcc, "lowe", 4)) {
3461                                 /*                  r     */
3462                                 namedclass
3463                                     = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3464                             }
3465                             if (memEQ(posixcc, "uppe", 4)) {
3466                                 /*                  r     */
3467                                 namedclass
3468                                     = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3469                             }
3470                             break;
3471                         case 't':
3472                             if (memEQ(posixcc, "digi", 4)) {
3473                                 /*                  t     */
3474                                 namedclass
3475                                     = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3476                             }
3477                             if (memEQ(posixcc, "prin", 4)) {
3478                                 /*                  t     */
3479                                 namedclass
3480                                     = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3481                             }
3482                             if (memEQ(posixcc, "punc", 4)) {
3483                                 /*                  t     */
3484                                 namedclass
3485                                     = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3486                             }
3487                             break;
3488                         }
3489                         break;
3490                     case 6:
3491                         if (memEQ(posixcc, "xdigit", 6)) {
3492                             namedclass
3493                                 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3494                         }
3495                         break;
3496                     }
3497
3498                     if (namedclass == OOB_NAMEDCLASS)
3499                     {
3500                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3501                                       t - s - 1, s + 1);
3502                     }
3503                     assert (posixcc[skip] == ':');
3504                     assert (posixcc[skip+1] == ']');
3505                 } else if (!SIZE_ONLY) {
3506                     /* [[=foo=]] and [[.foo.]] are still future. */
3507
3508                     /* adjust RExC_parse so the warning shows after
3509                        the class closes */
3510                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3511                         RExC_parse++;
3512                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3513                 }
3514             } else {
3515                 /* Maternal grandfather:
3516                  * "[:" ending in ":" but not in ":]" */
3517                 RExC_parse = s;
3518             }
3519         }
3520     }
3521
3522     return namedclass;
3523 }
3524
3525 STATIC void
3526 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3527 {
3528     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3529         const char *s = RExC_parse;
3530         const char  c = *s++;
3531
3532         while(*s && isALNUM(*s))
3533             s++;
3534         if (*s && c == *s && s[1] == ']') {
3535             if (ckWARN(WARN_REGEXP))
3536                 vWARN3(s+2,
3537                         "POSIX syntax [%c %c] belongs inside character classes",
3538                         c, c);
3539
3540             /* [[=foo=]] and [[.foo.]] are still future. */
3541             if (POSIXCC_NOTYET(c)) {
3542                 /* adjust RExC_parse so the error shows after
3543                    the class closes */
3544                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3545                     ;
3546                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3547             }
3548         }
3549     }
3550 }
3551
3552 STATIC regnode *
3553 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3554 {
3555     register UV value = 0;
3556     register UV nextvalue;
3557     register IV prevvalue = OOB_UNICODE;
3558     register IV range = 0;
3559     register regnode *ret;
3560     STRLEN numlen;
3561     IV namedclass;
3562     char *rangebegin = NULL;
3563     bool need_class = 0;
3564     SV *listsv = NULL;
3565     register char *e;
3566     UV n;
3567     bool optimize_invert   = TRUE;
3568     AV* unicode_alternate  = NULL;
3569 #ifdef EBCDIC
3570     UV literal_endpoint = 0;
3571 #endif
3572
3573     ret = reganode(pRExC_state, ANYOF, 0);
3574
3575     if (!SIZE_ONLY)
3576         ANYOF_FLAGS(ret) = 0;
3577
3578     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3579         RExC_naughty++;
3580         RExC_parse++;
3581         if (!SIZE_ONLY)
3582             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3583     }
3584
3585     if (SIZE_ONLY) {
3586         RExC_size += ANYOF_SKIP;
3587         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
3588     }
3589     else {
3590         RExC_emit += ANYOF_SKIP;
3591         if (FOLD)
3592             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3593         if (LOC)
3594             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3595         ANYOF_BITMAP_ZERO(ret);
3596         listsv = newSVpvs("# comment\n");
3597     }
3598
3599     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3600
3601     if (!SIZE_ONLY && POSIXCC(nextvalue))
3602         checkposixcc(pRExC_state);
3603
3604     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3605     if (UCHARAT(RExC_parse) == ']')
3606         goto charclassloop;
3607
3608     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3609
3610     charclassloop:
3611
3612         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3613
3614         if (!range)
3615             rangebegin = RExC_parse;
3616         if (UTF) {
3617             value = utf8n_to_uvchr((U8*)RExC_parse,
3618                                    RExC_end - RExC_parse,
3619                                    &numlen, 0);
3620             RExC_parse += numlen;
3621         }
3622         else
3623             value = UCHARAT(RExC_parse++);
3624         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3625         if (value == '[' && POSIXCC(nextvalue))
3626             namedclass = regpposixcc(pRExC_state, value);
3627         else if (value == '\\') {
3628             if (UTF) {
3629                 value = utf8n_to_uvchr((U8*)RExC_parse,
3630                                    RExC_end - RExC_parse,
3631                                    &numlen, 0);
3632                 RExC_parse += numlen;
3633             }
3634             else
3635                 value = UCHARAT(RExC_parse++);
3636             /* Some compilers cannot handle switching on 64-bit integer
3637              * values, therefore value cannot be an UV.  Yes, this will
3638              * be a problem later if we want switch on Unicode.
3639              * A similar issue a little bit later when switching on
3640              * namedclass. --jhi */
3641             switch ((I32)value) {
3642             case 'w':   namedclass = ANYOF_ALNUM;       break;
3643             case 'W':   namedclass = ANYOF_NALNUM;      break;
3644             case 's':   namedclass = ANYOF_SPACE;       break;
3645             case 'S':   namedclass = ANYOF_NSPACE;      break;
3646             case 'd':   namedclass = ANYOF_DIGIT;       break;
3647             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3648             case 'p':
3649             case 'P':
3650                 if (RExC_parse >= RExC_end)
3651                     vFAIL2("Empty \\%c{}", (U8)value);
3652                 if (*RExC_parse == '{') {
3653                     const U8 c = (U8)value;
3654                     e = strchr(RExC_parse++, '}');
3655                     if (!e)
3656                         vFAIL2("Missing right brace on \\%c{}", c);
3657                     while (isSPACE(UCHARAT(RExC_parse)))
3658                         RExC_parse++;
3659                     if (e == RExC_parse)
3660                         vFAIL2("Empty \\%c{}", c);
3661                     n = e - RExC_parse;
3662                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3663                         n--;
3664                 }
3665                 else {
3666                     e = RExC_parse;
3667                     n = 1;
3668                 }
3669                 if (!SIZE_ONLY) {
3670                     if (UCHARAT(RExC_parse) == '^') {
3671                          RExC_parse++;
3672                          n--;
3673                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3674                          while (isSPACE(UCHARAT(RExC_parse))) {
3675                               RExC_parse++;
3676                               n--;
3677                          }
3678                     }
3679                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
3680                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
3681                 }
3682                 RExC_parse = e + 1;
3683                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3684                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3685                 break;
3686             case 'n':   value = '\n';                   break;
3687             case 'r':   value = '\r';                   break;
3688             case 't':   value = '\t';                   break;
3689             case 'f':   value = '\f';                   break;
3690             case 'b':   value = '\b';                   break;
3691             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3692             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3693             case 'x':
3694                 if (*RExC_parse == '{') {
3695                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3696                         | PERL_SCAN_DISALLOW_PREFIX;
3697                     e = strchr(RExC_parse++, '}');
3698                     if (!e)
3699                         vFAIL("Missing right brace on \\x{}");
3700
3701                     numlen = e - RExC_parse;
3702                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3703                     RExC_parse = e + 1;
3704                 }
3705                 else {
3706                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3707                     numlen = 2;
3708                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3709                     RExC_parse += numlen;
3710                 }
3711                 break;
3712             case 'c':
3713                 value = UCHARAT(RExC_parse++);
3714                 value = toCTRL(value);
3715                 break;
3716             case '0': case '1': case '2': case '3': case '4':
3717             case '5': case '6': case '7': case '8': case '9':
3718             {
3719                 I32 flags = 0;
3720                 numlen = 3;
3721                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3722                 RExC_parse += numlen;
3723                 break;
3724             }
3725             default:
3726                 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3727                     vWARN2(RExC_parse,
3728                            "Unrecognized escape \\%c in character class passed through",
3729                            (int)value);
3730                 break;
3731             }
3732         } /* end of \blah */
3733 #ifdef EBCDIC
3734         else
3735             literal_endpoint++;
3736 #endif
3737
3738         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3739
3740             if (!SIZE_ONLY && !need_class)
3741                 ANYOF_CLASS_ZERO(ret);
3742
3743             need_class = 1;
3744
3745             /* a bad range like a-\d, a-[:digit:] ? */
3746             if (range) {
3747                 if (!SIZE_ONLY) {
3748                     if (ckWARN(WARN_REGEXP)) {
3749                         const int w =
3750                             RExC_parse >= rangebegin ?
3751                             RExC_parse - rangebegin : 0;
3752                         vWARN4(RExC_parse,
3753                                "False [] range \"%*.*s\"",
3754                                w, w, rangebegin);
3755                     }
3756                     if (prevvalue < 256) {
3757                         ANYOF_BITMAP_SET(ret, prevvalue);
3758                         ANYOF_BITMAP_SET(ret, '-');
3759                     }
3760                     else {
3761                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3762                         Perl_sv_catpvf(aTHX_ listsv,
3763                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3764                     }
3765                 }
3766
3767                 range = 0; /* this was not a true range */
3768             }
3769
3770             if (!SIZE_ONLY) {
3771                 const char *what = NULL;
3772                 char yesno = 0;
3773
3774                 if (namedclass > OOB_NAMEDCLASS)
3775                     optimize_invert = FALSE;
3776                 /* Possible truncation here but in some 64-bit environments
3777                  * the compiler gets heartburn about switch on 64-bit values.
3778                  * A similar issue a little earlier when switching on value.
3779                  * --jhi */
3780                 switch ((I32)namedclass) {
3781                 case ANYOF_ALNUM:
3782                     if (LOC)
3783                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3784                     else {
3785                         for (value = 0; value < 256; value++)
3786                             if (isALNUM(value))
3787                                 ANYOF_BITMAP_SET(ret, value);
3788                     }
3789                     yesno = '+';
3790                     what = "Word";      
3791                     break;
3792                 case ANYOF_NALNUM:
3793                     if (LOC)
3794                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3795                     else {
3796                         for (value = 0; value < 256; value++)
3797                             if (!isALNUM(value))
3798                                 ANYOF_BITMAP_SET(ret, value);
3799                     }
3800                     yesno = '!';
3801                     what = "Word";
3802                     break;
3803                 case ANYOF_ALNUMC:
3804                     if (LOC)
3805                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3806                     else {
3807                         for (value = 0; value < 256; value++)
3808                             if (isALNUMC(value))
3809                                 ANYOF_BITMAP_SET(ret, value);
3810                     }
3811                     yesno = '+';
3812                     what = "Alnum";
3813                     break;
3814                 case ANYOF_NALNUMC:
3815                     if (LOC)
3816                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3817                     else {
3818                         for (value = 0; value < 256; value++)
3819                             if (!isALNUMC(value))
3820                                 ANYOF_BITMAP_SET(ret, value);
3821                     }
3822                     yesno = '!';
3823                     what = "Alnum";
3824                     break;
3825                 case ANYOF_ALPHA:
3826                     if (LOC)
3827                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3828                     else {
3829                         for (value = 0; value < 256; value++)
3830                             if (isALPHA(value))
3831                                 ANYOF_BITMAP_SET(ret, value);
3832                     }
3833                     yesno = '+';
3834                     what = "Alpha";
3835                     break;
3836                 case ANYOF_NALPHA:
3837                     if (LOC)
3838                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3839                     else {
3840                         for (value = 0; value < 256; value++)
3841                             if (!isALPHA(value))
3842                                 ANYOF_BITMAP_SET(ret, value);
3843                     }
3844                     yesno = '!';
3845                     what = "Alpha";
3846                     break;
3847                 case ANYOF_ASCII:
3848                     if (LOC)
3849                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3850                     else {
3851 #ifndef EBCDIC
3852                         for (value = 0; value < 128; value++)
3853                             ANYOF_BITMAP_SET(ret, value);
3854 #else  /* EBCDIC */
3855                         for (value = 0; value < 256; value++) {
3856                             if (isASCII(value))
3857                                 ANYOF_BITMAP_SET(ret, value);
3858                         }
3859 #endif /* EBCDIC */
3860                     }
3861                     yesno = '+';
3862                     what = "ASCII";
3863                     break;
3864                 case ANYOF_NASCII:
3865                     if (LOC)
3866                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3867                     else {
3868 #ifndef EBCDIC
3869                         for (value = 128; value < 256; value++)
3870                             ANYOF_BITMAP_SET(ret, value);
3871 #else  /* EBCDIC */
3872                         for (value = 0; value < 256; value++) {
3873                             if (!isASCII(value))
3874                                 ANYOF_BITMAP_SET(ret, value);
3875                         }
3876 #endif /* EBCDIC */
3877                     }
3878                     yesno = '!';
3879                     what = "ASCII";
3880                     break;
3881                 case ANYOF_BLANK:
3882                     if (LOC)
3883                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3884                     else {
3885                         for (value = 0; value < 256; value++)
3886                             if (isBLANK(value))
3887                                 ANYOF_BITMAP_SET(ret, value);
3888                     }
3889                     yesno = '+';
3890                     what = "Blank";
3891                     break;
3892                 case ANYOF_NBLANK:
3893                     if (LOC)
3894                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3895                     else {
3896                         for (value = 0; value < 256; value++)
3897                             if (!isBLANK(value))
3898                                 ANYOF_BITMAP_SET(ret, value);
3899                     }
3900                     yesno = '!';
3901                     what = "Blank";
3902                     break;
3903                 case ANYOF_CNTRL:
3904                     if (LOC)
3905                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3906                     else {
3907                         for (value = 0; value < 256; value++)
3908                             if (isCNTRL(value))
3909                                 ANYOF_BITMAP_SET(ret, value);
3910                     }
3911                     yesno = '+';
3912                     what = "Cntrl";
3913                     break;
3914                 case ANYOF_NCNTRL:
3915                     if (LOC)
3916                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3917                     else {
3918                         for (value = 0; value < 256; value++)
3919                             if (!isCNTRL(value))
3920                                 ANYOF_BITMAP_SET(ret, value);
3921                     }
3922                     yesno = '!';
3923                     what = "Cntrl";
3924                     break;
3925                 case ANYOF_DIGIT:
3926                     if (LOC)
3927                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3928                     else {
3929                         /* consecutive digits assumed */
3930                         for (value = '0'; value <= '9'; value++)
3931                             ANYOF_BITMAP_SET(ret, value);
3932                     }
3933                     yesno = '+';
3934                     what = "Digit";
3935                     break;
3936                 case ANYOF_NDIGIT:
3937                     if (LOC)
3938                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3939                     else {
3940                         /* consecutive digits assumed */
3941                         for (value = 0; value < '0'; value++)
3942                             ANYOF_BITMAP_SET(ret, value);
3943                         for (value = '9' + 1; value < 256; value++)
3944                             ANYOF_BITMAP_SET(ret, value);
3945                     }
3946                     yesno = '!';
3947                     what = "Digit";
3948                     break;
3949                 case ANYOF_GRAPH:
3950                     if (LOC)
3951                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3952                     else {
3953                         for (value = 0; value < 256; value++)
3954                             if (isGRAPH(value))
3955                                 ANYOF_BITMAP_SET(ret, value);
3956                     }
3957                     yesno = '+';
3958                     what = "Graph";
3959                     break;
3960                 case ANYOF_NGRAPH:
3961                     if (LOC)
3962                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3963                     else {
3964                         for (value = 0; value < 256; value++)
3965                             if (!isGRAPH(value))
3966                                 ANYOF_BITMAP_SET(ret, value);
3967                     }
3968                     yesno = '!';
3969                     what = "Graph";
3970                     break;
3971                 case ANYOF_LOWER:
3972                     if (LOC)
3973                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3974                     else {
3975                         for (value = 0; value < 256; value++)
3976                             if (isLOWER(value))
3977                                 ANYOF_BITMAP_SET(ret, value);
3978                     }
3979                     yesno = '+';
3980                     what = "Lower";
3981                     break;
3982                 case ANYOF_NLOWER:
3983                     if (LOC)
3984                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3985                     else {
3986                         for (value = 0; value < 256; value++)
3987                             if (!isLOWER(value))
3988                                 ANYOF_BITMAP_SET(ret, value);
3989                     }
3990                     yesno = '!';
3991                     what = "Lower";
3992                     break;
3993                 case ANYOF_PRINT:
3994                     if (LOC)
3995                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3996                     else {
3997                         for (value = 0; value < 256; value++)
3998                             if (isPRINT(value))
3999                                 ANYOF_BITMAP_SET(ret, value);
4000                     }
4001                     yesno = '+';
4002                     what = "Print";
4003                     break;
4004                 case ANYOF_NPRINT:
4005                     if (LOC)
4006                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4007                     else {
4008                         for (value = 0; value < 256; value++)
4009                             if (!isPRINT(value))
4010                                 ANYOF_BITMAP_SET(ret, value);
4011                     }
4012                     yesno = '!';
4013                     what = "Print";
4014                     break;
4015                 case ANYOF_PSXSPC:
4016                     if (LOC)
4017                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4018                     else {
4019                         for (value = 0; value < 256; value++)
4020                             if (isPSXSPC(value))
4021                                 ANYOF_BITMAP_SET(ret, value);
4022                     }
4023                     yesno = '+';
4024                     what = "Space";
4025                     break;
4026                 case ANYOF_NPSXSPC:
4027                     if (LOC)
4028                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4029                     else {
4030                         for (value = 0; value < 256; value++)
4031                             if (!isPSXSPC(value))
4032                                 ANYOF_BITMAP_SET(ret, value);
4033                     }
4034                     yesno = '!';
4035                     what = "Space";
4036                     break;
4037                 case ANYOF_PUNCT:
4038                     if (LOC)
4039                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4040                     else {
4041                         for (value = 0; value < 256; value++)
4042                             if (isPUNCT(value))
4043                                 ANYOF_BITMAP_SET(ret, value);
4044                     }
4045                     yesno = '+';
4046                     what = "Punct";
4047                     break;
4048                 case ANYOF_NPUNCT:
4049                     if (LOC)
4050                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4051                     else {
4052                         for (value = 0; value < 256; value++)
4053                             if (!isPUNCT(value))
4054                                 ANYOF_BITMAP_SET(ret, value);
4055                     }
4056                     yesno = '!';
4057                     what = "Punct";
4058                     break;
4059                 case ANYOF_SPACE:
4060                     if (LOC)
4061                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4062                     else {
4063                         for (value = 0; value < 256; value++)
4064                             if (isSPACE(value))
4065                                 ANYOF_BITMAP_SET(ret, value);
4066                     }
4067                     yesno = '+';
4068                     what = "SpacePerl";
4069                     break;
4070                 case ANYOF_NSPACE:
4071                     if (LOC)
4072                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4073                     else {
4074                         for (value = 0; value < 256; value++)
4075                             if (!isSPACE(value))
4076                                 ANYOF_BITMAP_SET(ret, value);
4077                     }
4078                     yesno = '!';
4079                     what = "SpacePerl";
4080                     break;
4081                 case ANYOF_UPPER:
4082                     if (LOC)
4083                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4084                     else {
4085                         for (value = 0; value < 256; value++)
4086                             if (isUPPER(value))
4087                                 ANYOF_BITMAP_SET(ret, value);
4088                     }
4089                     yesno = '+';
4090                     what = "Upper";
4091                     break;
4092                 case ANYOF_NUPPER:
4093                     if (LOC)
4094                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4095                     else {
4096                         for (value = 0; value < 256; value++)
4097                             if (!isUPPER(value))
4098                                 ANYOF_BITMAP_SET(ret, value);
4099                     }
4100                     yesno = '!';
4101                     what = "Upper";
4102                     break;
4103                 case ANYOF_XDIGIT:
4104                     if (LOC)
4105                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4106                     else {
4107                         for (value = 0; value < 256; value++)
4108                             if (isXDIGIT(value))
4109                                 ANYOF_BITMAP_SET(ret, value);
4110                     }
4111                     yesno = '+';
4112                     what = "XDigit";
4113                     break;
4114                 case ANYOF_NXDIGIT:
4115                     if (LOC)
4116                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4117                     else {
4118                         for (value = 0; value < 256; value++)
4119                             if (!isXDIGIT(value))
4120                                 ANYOF_BITMAP_SET(ret, value);
4121                     }
4122                     yesno = '!';
4123                     what = "XDigit";
4124                     break;
4125                 case ANYOF_MAX:
4126                     /* this is to handle \p and \P */
4127                     break;
4128                 default:
4129                     vFAIL("Invalid [::] class");
4130                     break;
4131                 }
4132                 if (what) {
4133                     /* Strings such as "+utf8::isWord\n" */
4134                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4135                 }
4136                 if (LOC)
4137                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4138                 continue;
4139             }
4140         } /* end of namedclass \blah */
4141
4142         if (range) {
4143             if (prevvalue > (IV)value) /* b-a */ {
4144                 const int w = RExC_parse - rangebegin;
4145                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
4146                 range = 0; /* not a valid range */
4147             }
4148         }
4149         else {
4150             prevvalue = value; /* save the beginning of the range */
4151             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4152                 RExC_parse[1] != ']') {
4153                 RExC_parse++;
4154
4155                 /* a bad range like \w-, [:word:]- ? */
4156                 if (namedclass > OOB_NAMEDCLASS) {
4157                     if (ckWARN(WARN_REGEXP)) {
4158                         const int w =
4159                             RExC_parse >= rangebegin ?
4160                             RExC_parse - rangebegin : 0;
4161                         vWARN4(RExC_parse,
4162                                "False [] range \"%*.*s\"",
4163                                w, w, rangebegin);
4164                     }
4165                     if (!SIZE_ONLY)
4166                         ANYOF_BITMAP_SET(ret, '-');
4167                 } else
4168                     range = 1;  /* yeah, it's a range! */
4169                 continue;       /* but do it the next time */
4170             }
4171         }
4172
4173         /* now is the next time */
4174         if (!SIZE_ONLY) {
4175             IV i;
4176
4177             if (prevvalue < 256) {
4178                 const IV ceilvalue = value < 256 ? value : 255;
4179
4180 #ifdef EBCDIC
4181                 /* In EBCDIC [\x89-\x91] should include
4182                  * the \x8e but [i-j] should not. */
4183                 if (literal_endpoint == 2 &&
4184                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4185                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4186                 {
4187                     if (isLOWER(prevvalue)) {
4188                         for (i = prevvalue; i <= ceilvalue; i++)
4189                             if (isLOWER(i))
4190                                 ANYOF_BITMAP_SET(ret, i);
4191                     } else {
4192                         for (i = prevvalue; i <= ceilvalue; i++)
4193                             if (isUPPER(i))
4194                                 ANYOF_BITMAP_SET(ret, i);
4195                     }
4196                 }
4197                 else
4198 #endif
4199                       for (i = prevvalue; i <= ceilvalue; i++)
4200                           ANYOF_BITMAP_SET(ret, i);
4201           }
4202           if (value > 255 || UTF) {
4203                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4204                 const UV natvalue      = NATIVE_TO_UNI(value);
4205
4206                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4207                 if (prevnatvalue < natvalue) { /* what about > ? */
4208                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4209                                    prevnatvalue, natvalue);
4210                 }
4211                 else if (prevnatvalue == natvalue) {
4212                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4213                     if (FOLD) {
4214                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4215                          STRLEN foldlen;
4216                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4217
4218 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
4219                          if (RExC_precomp[0] == ':' &&
4220                              RExC_precomp[1] == '[' &&
4221                              (f == 0xDF || f == 0x92)) {
4222                              f = NATIVE_TO_UNI(f);
4223                         }
4224 #endif
4225                          /* If folding and foldable and a single
4226                           * character, insert also the folded version
4227                           * to the charclass. */
4228                          if (f != value) {
4229 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
4230                              if ((RExC_precomp[0] == ':' &&
4231                                   RExC_precomp[1] == '[' &&
4232                                   (f == 0xA2 &&
4233                                    (value == 0xFB05 || value == 0xFB06))) ?
4234                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
4235                                  foldlen == (STRLEN)UNISKIP(f) )
4236 #else
4237                               if (foldlen == (STRLEN)UNISKIP(f))
4238 #endif
4239                                   Perl_sv_catpvf(aTHX_ listsv,
4240                                                  "%04"UVxf"\n", f);
4241                               else {
4242                                   /* Any multicharacter foldings
4243                                    * require the following transform:
4244                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4245                                    * where E folds into "pq" and F folds
4246                                    * into "rst", all other characters
4247                                    * fold to single characters.  We save
4248                                    * away these multicharacter foldings,
4249                                    * to be later saved as part of the
4250                                    * additional "s" data. */
4251                                   SV *sv;
4252
4253                                   if (!unicode_alternate)
4254                                       unicode_alternate = newAV();
4255                                   sv = newSVpvn((char*)foldbuf, foldlen);
4256                                   SvUTF8_on(sv);
4257                                   av_push(unicode_alternate, sv);
4258                               }
4259                          }
4260
4261                          /* If folding and the value is one of the Greek
4262                           * sigmas insert a few more sigmas to make the
4263                           * folding rules of the sigmas to work right.
4264                           * Note that not all the possible combinations
4265                           * are handled here: some of them are handled
4266                           * by the standard folding rules, and some of
4267                           * them (literal or EXACTF cases) are handled
4268                           * during runtime in regexec.c:S_find_byclass(). */
4269                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4270                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4271                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4272                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4273                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4274                          }
4275                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4276                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4277                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4278                     }
4279                 }
4280             }
4281 #ifdef EBCDIC
4282             literal_endpoint = 0;
4283 #endif
4284         }
4285
4286         range = 0; /* this range (if it was one) is done now */
4287     }
4288
4289     if (need_class) {
4290         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4291         if (SIZE_ONLY)
4292             RExC_size += ANYOF_CLASS_ADD_SKIP;
4293         else
4294             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4295     }
4296
4297     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4298     if (!SIZE_ONLY &&
4299          /* If the only flag is folding (plus possibly inversion). */
4300         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4301        ) {
4302         for (value = 0; value < 256; ++value) {
4303             if (ANYOF_BITMAP_TEST(ret, value)) {
4304                 UV fold = PL_fold[value];
4305
4306                 if (fold != value)
4307                     ANYOF_BITMAP_SET(ret, fold);
4308             }
4309         }
4310         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4311     }
4312
4313     /* optimize inverted simple patterns (e.g. [^a-z]) */
4314     if (!SIZE_ONLY && optimize_invert &&
4315         /* If the only flag is inversion. */
4316         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4317         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4318             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4319         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4320     }
4321
4322     if (!SIZE_ONLY) {
4323         AV * const av = newAV();
4324         SV *rv;
4325
4326         /* The 0th element stores the character class description
4327          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4328          * to initialize the appropriate swash (which gets stored in
4329          * the 1st element), and also useful for dumping the regnode.
4330          * The 2nd element stores the multicharacter foldings,
4331          * used later (regexec.c:S_reginclass()). */
4332         av_store(av, 0, listsv);
4333         av_store(av, 1, NULL);
4334         av_store(av, 2, (SV*)unicode_alternate);
4335         rv = newRV_noinc((SV*)av);
4336         n = add_data(pRExC_state, 1, "s");
4337         RExC_rx->data->data[n] = (void*)rv;
4338         ARG_SET(ret, n);
4339     }
4340
4341     return ret;
4342 }
4343
4344 STATIC char*
4345 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4346 {
4347     char* const retval = RExC_parse++;
4348
4349     for (;;) {
4350         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4351                 RExC_parse[2] == '#') {
4352             while (*RExC_parse != ')') {
4353                 if (RExC_parse == RExC_end)
4354                     FAIL("Sequence (?#... not terminated");
4355                 RExC_parse++;
4356             }
4357             RExC_parse++;
4358             continue;
4359         }
4360         if (RExC_flags & PMf_EXTENDED) {
4361             if (isSPACE(*RExC_parse)) {
4362                 RExC_parse++;
4363                 continue;
4364             }
4365             else if (*RExC_parse == '#') {
4366                 while (RExC_parse < RExC_end)
4367                     if (*RExC_parse++ == '\n') break;
4368                 continue;
4369             }
4370         }
4371         return retval;
4372     }
4373 }
4374
4375 /*
4376 - reg_node - emit a node
4377 */
4378 STATIC regnode *                        /* Location. */
4379 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4380 {
4381     register regnode *ptr;
4382     regnode * const ret = RExC_emit;
4383
4384     if (SIZE_ONLY) {
4385         SIZE_ALIGN(RExC_size);
4386         RExC_size += 1;
4387         return(ret);
4388     }
4389
4390     NODE_ALIGN_FILL(ret);
4391     ptr = ret;
4392     FILL_ADVANCE_NODE(ptr, op);
4393     if (RExC_offsets) {         /* MJD */
4394         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4395               "reg_node", __LINE__, 
4396               reg_name[op],
4397               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4398               ? "Overwriting end of array!\n" : "OK",
4399               RExC_emit - RExC_emit_start,
4400               RExC_parse - RExC_start,
4401               RExC_offsets[0])); 
4402         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4403     }
4404             
4405     RExC_emit = ptr;
4406
4407     return(ret);
4408 }
4409
4410 /*
4411 - reganode - emit a node with an argument
4412 */
4413 STATIC regnode *                        /* Location. */
4414 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4415 {
4416     register regnode *ptr;
4417     regnode * const ret = RExC_emit;
4418
4419     if (SIZE_ONLY) {
4420         SIZE_ALIGN(RExC_size);
4421         RExC_size += 2;
4422         return(ret);
4423     }
4424
4425     NODE_ALIGN_FILL(ret);
4426     ptr = ret;
4427     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4428     if (RExC_offsets) {         /* MJD */
4429         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4430               "reganode",
4431               __LINE__,
4432               reg_name[op],
4433               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4434               "Overwriting end of array!\n" : "OK",
4435               RExC_emit - RExC_emit_start,
4436               RExC_parse - RExC_start,
4437               RExC_offsets[0])); 
4438         Set_Cur_Node_Offset;
4439     }
4440             
4441     RExC_emit = ptr;
4442
4443     return(ret);
4444 }
4445
4446 /*
4447 - reguni - emit (if appropriate) a Unicode character
4448 */
4449 STATIC STRLEN
4450 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
4451 {
4452     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4453 }
4454
4455 /*
4456 - reginsert - insert an operator in front of already-emitted operand
4457 *
4458 * Means relocating the operand.
4459 */
4460 STATIC void
4461 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4462 {
4463     register regnode *src;
4464     register regnode *dst;
4465     register regnode *place;
4466     const int offset = regarglen[(U8)op];
4467
4468 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4469
4470     if (SIZE_ONLY) {
4471         RExC_size += NODE_STEP_REGNODE + offset;
4472         return;
4473     }
4474
4475     src = RExC_emit;
4476     RExC_emit += NODE_STEP_REGNODE + offset;
4477     dst = RExC_emit;
4478     while (src > opnd) {
4479         StructCopy(--src, --dst, regnode);
4480         if (RExC_offsets) {     /* MJD 20010112 */
4481             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4482                   "reg_insert",
4483                   __LINE__,
4484                   reg_name[op],
4485                   dst - RExC_emit_start > RExC_offsets[0] 
4486                   ? "Overwriting end of array!\n" : "OK",
4487                   src - RExC_emit_start,
4488                   dst - RExC_emit_start,
4489                   RExC_offsets[0])); 
4490             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4491             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4492         }
4493     }
4494     
4495
4496     place = opnd;               /* Op node, where operand used to be. */
4497     if (RExC_offsets) {         /* MJD */
4498         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4499               "reginsert",
4500               __LINE__,
4501               reg_name[op],
4502               place - RExC_emit_start > RExC_offsets[0] 
4503               ? "Overwriting end of array!\n" : "OK",
4504               place - RExC_emit_start,
4505               RExC_parse - RExC_start,
4506               RExC_offsets[0])); 
4507         Set_Node_Offset(place, RExC_parse);
4508         Set_Node_Length(place, 1);
4509     }
4510     src = NEXTOPER(place);
4511     FILL_ADVANCE_NODE(place, op);
4512     Zero(src, offset, regnode);
4513 }
4514
4515 /*
4516 - regtail - set the next-pointer at the end of a node chain of p to val.
4517 */
4518 /* TODO: All three parms should be const */
4519 STATIC void
4520 S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
4521 {
4522     register regnode *scan;
4523
4524     if (SIZE_ONLY)
4525         return;
4526
4527     /* Find last node. */
4528     scan = p;
4529     for (;;) {
4530         regnode * const temp = regnext(scan);
4531         if (temp == NULL)
4532             break;
4533         scan = temp;
4534     }
4535
4536     if (reg_off_by_arg[OP(scan)]) {
4537         ARG_SET(scan, val - scan);
4538     }
4539     else {
4540         NEXT_OFF(scan) = val - scan;
4541     }
4542 }
4543
4544 /*
4545  - regcurly - a little FSA that accepts {\d+,?\d*}
4546  */
4547 STATIC I32
4548 S_regcurly(register const char *s)
4549 {
4550     if (*s++ != '{')
4551         return FALSE;
4552     if (!isDIGIT(*s))
4553         return FALSE;
4554     while (isDIGIT(*s))
4555         s++;
4556     if (*s == ',')
4557         s++;
4558     while (isDIGIT(*s))
4559         s++;
4560     if (*s != '}')
4561         return FALSE;
4562     return TRUE;
4563 }
4564
4565 /*
4566  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4567  */
4568 void
4569 Perl_regdump(pTHX_ const regexp *r)
4570 {
4571 #ifdef DEBUGGING
4572     SV * const sv = sv_newmortal();
4573
4574     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4575
4576     /* Header fields of interest. */
4577     if (r->anchored_substr)
4578         PerlIO_printf(Perl_debug_log,
4579                       "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4580                       PL_colors[0],
4581                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4582                       SvPVX_const(r->anchored_substr),
4583                       PL_colors[1],
4584                       SvTAIL(r->anchored_substr) ? "$" : "",
4585                       (IV)r->anchored_offset);
4586     else if (r->anchored_utf8)
4587         PerlIO_printf(Perl_debug_log,
4588                       "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4589                       PL_colors[0],
4590                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4591                       SvPVX_const(r->anchored_utf8),
4592                       PL_colors[1],
4593                       SvTAIL(r->anchored_utf8) ? "$" : "",
4594                       (IV)r->anchored_offset);
4595     if (r->float_substr)
4596         PerlIO_printf(Perl_debug_log,
4597                       "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4598                       PL_colors[0],
4599                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4600                       SvPVX_const(r->float_substr),
4601                       PL_colors[1],
4602                       SvTAIL(r->float_substr) ? "$" : "",
4603                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4604     else if (r->float_utf8)
4605         PerlIO_printf(Perl_debug_log,
4606                       "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4607                       PL_colors[0],
4608                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4609                       SvPVX_const(r->float_utf8),
4610                       PL_colors[1],
4611                       SvTAIL(r->float_utf8) ? "$" : "",
4612                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4613     if (r->check_substr || r->check_utf8)
4614         PerlIO_printf(Perl_debug_log,
4615                       (const char *)
4616                       (r->check_substr == r->float_substr
4617                        && r->check_utf8 == r->float_utf8
4618                        ? "(checking floating" : "(checking anchored"));
4619     if (r->reganch & ROPT_NOSCAN)
4620         PerlIO_printf(Perl_debug_log, " noscan");
4621     if (r->reganch & ROPT_CHECK_ALL)
4622         PerlIO_printf(Perl_debug_log, " isall");
4623     if (r->check_substr || r->check_utf8)
4624         PerlIO_printf(Perl_debug_log, ") ");
4625
4626     if (r->regstclass) {
4627         regprop(sv, r->regstclass);
4628         PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4629     }
4630     if (r->reganch & ROPT_ANCH) {
4631         PerlIO_printf(Perl_debug_log, "anchored");
4632         if (r->reganch & ROPT_ANCH_BOL)
4633             PerlIO_printf(Perl_debug_log, "(BOL)");
4634         if (r->reganch & ROPT_ANCH_MBOL)
4635             PerlIO_printf(Perl_debug_log, "(MBOL)");
4636         if (r->reganch & ROPT_ANCH_SBOL)
4637             PerlIO_printf(Perl_debug_log, "(SBOL)");
4638         if (r->reganch & ROPT_ANCH_GPOS)
4639             PerlIO_printf(Perl_debug_log, "(GPOS)");
4640         PerlIO_putc(Perl_debug_log, ' ');
4641     }
4642     if (r->reganch & ROPT_GPOS_SEEN)
4643         PerlIO_printf(Perl_debug_log, "GPOS ");
4644     if (r->reganch & ROPT_SKIP)
4645         PerlIO_printf(Perl_debug_log, "plus ");
4646     if (r->reganch & ROPT_IMPLICIT)
4647         PerlIO_printf(Perl_debug_log, "implicit ");
4648     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4649     if (r->reganch & ROPT_EVAL_SEEN)
4650         PerlIO_printf(Perl_debug_log, "with eval ");
4651     PerlIO_printf(Perl_debug_log, "\n");
4652     if (r->offsets) {
4653       U32 i;
4654       const U32 len = r->offsets[0];
4655       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4656       for (i = 1; i <= len; i++)
4657         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4658                       (UV)r->offsets[i*2-1], 
4659                       (UV)r->offsets[i*2]);
4660       PerlIO_printf(Perl_debug_log, "\n");
4661     }
4662 #else
4663     PERL_UNUSED_CONTEXT;
4664     PERL_UNUSED_ARG(r);
4665 #endif  /* DEBUGGING */
4666 }
4667
4668 /*
4669 - regprop - printable representation of opcode
4670 */
4671 void
4672 Perl_regprop(pTHX_ SV *sv, regnode *o)
4673 {
4674 #ifdef DEBUGGING
4675     register int k;
4676
4677     sv_setpvn(sv, "", 0);
4678     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4679         /* It would be nice to FAIL() here, but this may be called from
4680            regexec.c, and it would be hard to supply pRExC_state. */
4681         Perl_croak(aTHX_ "Corrupted regexp opcode");
4682     sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4683
4684     k = PL_regkind[(U8)OP(o)];
4685
4686     if (k == EXACT) {
4687         SV * const dsv = sv_2mortal(newSVpvs(""));
4688         /* Using is_utf8_string() is a crude hack but it may
4689          * be the best for now since we have no flag "this EXACTish
4690          * node was UTF-8" --jhi */
4691         const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4692         const char * const s = do_utf8 ?
4693           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4694                          UNI_DISPLAY_REGEX) :
4695           STRING(o);
4696         const int len = do_utf8 ?
4697           strlen(s) :
4698           STR_LEN(o);
4699         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4700                        PL_colors[0],
4701                        len, s,
4702                        PL_colors[1]);
4703     }
4704     else if (k == CURLY) {
4705         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4706             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4707         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4708     }
4709     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4710         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4711     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4712         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4713     else if (k == LOGICAL)
4714         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4715     else if (k == ANYOF) {
4716         int i, rangestart = -1;
4717         const U8 flags = ANYOF_FLAGS(o);
4718
4719         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
4720         static const char * const anyofs[] = {
4721             "\\w",
4722             "\\W",
4723             "\\s",
4724             "\\S",
4725             "\\d",
4726             "\\D",
4727             "[:alnum:]",
4728             "[:^alnum:]",
4729             "[:alpha:]",
4730             "[:^alpha:]",
4731             "[:ascii:]",
4732             "[:^ascii:]",
4733             "[:ctrl:]",
4734             "[:^ctrl:]",
4735             "[:graph:]",
4736             "[:^graph:]",
4737             "[:lower:]",
4738             "[:^lower:]",
4739             "[:print:]",
4740             "[:^print:]",
4741             "[:punct:]",
4742             "[:^punct:]",
4743             "[:upper:]",
4744             "[:^upper:]",
4745             "[:xdigit:]",
4746             "[:^xdigit:]",
4747             "[:space:]",
4748             "[:^space:]",
4749             "[:blank:]",
4750             "[:^blank:]"
4751         };
4752
4753         if (flags & ANYOF_LOCALE)
4754             sv_catpvs(sv, "{loc}");
4755         if (flags & ANYOF_FOLD)
4756             sv_catpvs(sv, "{i}");
4757         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4758         if (flags & ANYOF_INVERT)
4759             sv_catpvs(sv, "^");
4760         for (i = 0; i <= 256; i++) {
4761             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4762                 if (rangestart == -1)
4763                     rangestart = i;
4764             } else if (rangestart != -1) {
4765                 if (i <= rangestart + 3)
4766                     for (; rangestart < i; rangestart++)
4767                         put_byte(sv, rangestart);
4768                 else {
4769                     put_byte(sv, rangestart);
4770                     sv_catpvs(sv, "-");
4771                     put_byte(sv, i - 1);
4772                 }
4773                 rangestart = -1;
4774             }
4775         }
4776
4777         if (o->flags & ANYOF_CLASS)
4778             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
4779                 if (ANYOF_CLASS_TEST(o,i))
4780                     sv_catpv(sv, anyofs[i]);
4781
4782         if (flags & ANYOF_UNICODE)
4783             sv_catpvs(sv, "{unicode}");
4784         else if (flags & ANYOF_UNICODE_ALL)
4785             sv_catpvs(sv, "{unicode_all}");
4786
4787         {
4788             SV *lv;
4789             SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4790         
4791             if (lv) {
4792                 if (sw) {
4793                     U8 s[UTF8_MAXBYTES_CASE+1];
4794                 
4795                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4796                         uvchr_to_utf8(s, i);
4797                         
4798                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4799                             if (rangestart == -1)
4800                                 rangestart = i;
4801                         } else if (rangestart != -1) {
4802                             if (i <= rangestart + 3)
4803                                 for (; rangestart < i; rangestart++) {
4804                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
4805                                     U8 *p;
4806                                     for(p = s; p < e; p++)
4807                                         put_byte(sv, *p);
4808                                 }
4809                             else {
4810                                 const U8 *e = uvchr_to_utf8(s,rangestart);
4811                                 U8 *p;
4812                                 for (p = s; p < e; p++)
4813                                     put_byte(sv, *p);
4814                                 sv_catpvs(sv, "-");
4815                                 e = uvchr_to_utf8(s, i-1);
4816                                 for (p = s; p < e; p++)
4817                                     put_byte(sv, *p);
4818                                 }
4819                                 rangestart = -1;
4820                             }
4821                         }
4822                         
4823                     sv_catpvs(sv, "..."); /* et cetera */
4824                 }
4825
4826                 {
4827                     char *s = savesvpv(lv);
4828                     char * const origs = s;
4829                 
4830                     while(*s && *s != '\n') s++;
4831                 
4832                     if (*s == '\n') {
4833                         const char * const t = ++s;
4834                         
4835                         while (*s) {
4836                             if (*s == '\n')
4837                                 *s = ' ';
4838                             s++;
4839                         }
4840                         if (s[-1] == ' ')
4841                             s[-1] = 0;
4842                         
4843                         sv_catpv(sv, t);
4844                     }
4845                 
4846                     Safefree(origs);
4847                 }
4848             }
4849         }
4850
4851         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4852     }
4853     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4854         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4855 #else
4856     PERL_UNUSED_CONTEXT;
4857     PERL_UNUSED_ARG(sv);
4858     PERL_UNUSED_ARG(o);
4859 #endif  /* DEBUGGING */
4860 }
4861
4862 SV *
4863 Perl_re_intuit_string(pTHX_ regexp *prog)
4864 {                               /* Assume that RE_INTUIT is set */
4865     DEBUG_r(
4866         {
4867             const char * const s = SvPV_nolen_const(prog->check_substr
4868                       ? prog->check_substr : prog->check_utf8);
4869
4870             if (!PL_colorset) reginitcolors();
4871             PerlIO_printf(Perl_debug_log,
4872                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4873                       PL_colors[4],
4874                       prog->check_substr ? "" : "utf8 ",
4875                       PL_colors[5],PL_colors[0],
4876                       s,
4877                       PL_colors[1],
4878                       (strlen(s) > 60 ? "..." : ""));
4879         } );
4880
4881     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4882 }
4883
4884 void
4885 Perl_pregfree(pTHX_ struct regexp *r)
4886 {
4887 #ifdef DEBUGGING
4888     SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
4889 #endif
4890
4891     if (!r || (--r->refcnt > 0))
4892         return;
4893     DEBUG_r({
4894         const char * const s = (r->reganch & ROPT_UTF8)
4895             ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4896             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4897         const int len = SvCUR(dsv);
4898          if (!PL_colorset)
4899               reginitcolors();
4900          PerlIO_printf(Perl_debug_log,
4901                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4902                        PL_colors[4],PL_colors[5],PL_colors[0],
4903                        len, len, s,
4904                        PL_colors[1],
4905                        len > 60 ? "..." : "");
4906     });
4907
4908     /* gcov results gave these as non-null 100% of the time, so there's no
4909        optimisation in checking them before calling Safefree  */
4910     Safefree(r->precomp);
4911     Safefree(r->offsets);             /* 20010421 MJD */
4912     if (RX_MATCH_COPIED(r))
4913         Safefree(r->subbeg);
4914     if (r->substrs) {
4915         if (r->anchored_substr)
4916             SvREFCNT_dec(r->anchored_substr);
4917         if (r->anchored_utf8)
4918             SvREFCNT_dec(r->anchored_utf8);
4919         if (r->float_substr)
4920             SvREFCNT_dec(r->float_substr);
4921         if (r->float_utf8)
4922             SvREFCNT_dec(r->float_utf8);
4923         Safefree(r->substrs);
4924     }
4925     if (r->data) {
4926         int n = r->data->count;
4927         PAD* new_comppad = NULL;
4928         PAD* old_comppad;
4929         PADOFFSET refcnt;
4930
4931         while (--n >= 0) {
4932           /* If you add a ->what type here, update the comment in regcomp.h */
4933             switch (r->data->what[n]) {
4934             case 's':
4935                 SvREFCNT_dec((SV*)r->data->data[n]);
4936                 break;
4937             case 'f':
4938                 Safefree(r->data->data[n]);
4939                 break;
4940             case 'p':
4941                 new_comppad = (AV*)r->data->data[n];
4942                 break;
4943             case 'o':
4944                 if (new_comppad == NULL)
4945                     Perl_croak(aTHX_ "panic: pregfree comppad");
4946                 PAD_SAVE_LOCAL(old_comppad,
4947                     /* Watch out for global destruction's random ordering. */
4948                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
4949                 );
4950                 OP_REFCNT_LOCK;
4951                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4952                 OP_REFCNT_UNLOCK;
4953                 if (!refcnt)
4954                     op_free((OP_4tree*)r->data->data[n]);
4955
4956                 PAD_RESTORE_LOCAL(old_comppad);
4957                 SvREFCNT_dec((SV*)new_comppad);
4958                 new_comppad = NULL;
4959                 break;
4960             case 'n':
4961                 break;
4962             default:
4963                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4964             }
4965         }
4966         Safefree(r->data->what);
4967         Safefree(r->data);
4968     }
4969     Safefree(r->startp);
4970     Safefree(r->endp);
4971     Safefree(r);
4972 }
4973
4974 /*
4975  - regnext - dig the "next" pointer out of a node
4976  *
4977  * [Note, when REGALIGN is defined there are two places in regmatch()
4978  * that bypass this code for speed.]
4979  */
4980 regnode *
4981 Perl_regnext(pTHX_ register regnode *p)
4982 {
4983     register I32 offset;
4984
4985     if (p == &PL_regdummy)
4986         return(NULL);
4987
4988     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4989     if (offset == 0)
4990         return(NULL);
4991
4992     return(p+offset);
4993 }
4994
4995 STATIC void     
4996 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4997 {
4998     va_list args;
4999     STRLEN l1 = strlen(pat1);
5000     STRLEN l2 = strlen(pat2);
5001     char buf[512];
5002     SV *msv;
5003     const char *message;
5004
5005     if (l1 > 510)
5006         l1 = 510;
5007     if (l1 + l2 > 510)
5008         l2 = 510 - l1;
5009     Copy(pat1, buf, l1 , char);
5010     Copy(pat2, buf + l1, l2 , char);
5011     buf[l1 + l2] = '\n';
5012     buf[l1 + l2 + 1] = '\0';
5013 #ifdef I_STDARG
5014     /* ANSI variant takes additional second argument */
5015     va_start(args, pat2);
5016 #else
5017     va_start(args);
5018 #endif
5019     msv = vmess(buf, &args);
5020     va_end(args);
5021     message = SvPV_const(msv,l1);
5022     if (l1 > 512)
5023         l1 = 512;
5024     Copy(message, buf, l1 , char);
5025     buf[l1-1] = '\0';                   /* Overwrite \n */
5026     Perl_croak(aTHX_ "%s", buf);
5027 }
5028
5029 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5030
5031 void
5032 Perl_save_re_context(pTHX)
5033 {
5034
5035     struct re_save_state *state;
5036
5037     SAVEVPTR(PL_curcop);
5038     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
5039
5040     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
5041     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
5042     SSPUSHINT(SAVEt_RE_STATE);
5043
5044     state->re_state_reg_flags = PL_reg_flags;
5045     state->re_state_bostr = PL_bostr;
5046     state->re_state_reginput = PL_reginput;
5047     state->re_state_regbol = PL_regbol;
5048     state->re_state_regeol = PL_regeol;
5049     state->re_state_regstartp = PL_regstartp;
5050     state->re_state_regendp = PL_regendp;
5051     state->re_state_reglastparen = PL_reglastparen;
5052     state->re_state_reglastcloseparen = PL_reglastcloseparen;
5053     state->re_state_regtill = PL_regtill;
5054     state->re_state_reg_start_tmp = PL_reg_start_tmp;
5055     state->re_state_reg_start_tmpl = PL_reg_start_tmpl;
5056     state->re_state_reg_eval_set = PL_reg_eval_set;
5057     state->re_state_regnarrate = PL_regnarrate;
5058     state->re_state_regindent = PL_regindent;
5059     state->re_state_reg_call_cc = PL_reg_call_cc;
5060     state->re_state_reg_re = PL_reg_re;
5061     state->re_state_reg_ganch = PL_reg_ganch;
5062     state->re_state_reg_sv = PL_reg_sv;
5063     state->re_state_reg_match_utf8 = PL_reg_match_utf8;
5064     state->re_state_reg_magic = PL_reg_magic;
5065     state->re_state_reg_oldpos = PL_reg_oldpos;
5066     state->re_state_reg_oldcurpm = PL_reg_oldcurpm;
5067     state->re_state_reg_curpm = PL_reg_curpm;
5068     state->re_state_reg_oldsaved = PL_reg_oldsaved;
5069     state->re_state_reg_oldsavedlen = PL_reg_oldsavedlen;
5070     state->re_state_reg_maxiter = PL_reg_maxiter;
5071     state->re_state_reg_leftiter = PL_reg_leftiter;
5072     state->re_state_reg_poscache = PL_reg_poscache;
5073     state->re_state_reg_poscache_size = PL_reg_poscache_size;
5074     state->re_state_regsize = PL_regsize;
5075     state->re_state_reg_starttry = PL_reg_starttry;
5076
5077     /* These variables have been eliminated from 5.10:  */
5078     state->re_state_regdata = PL_regdata;
5079     state->re_state_regprogram = PL_regprogram;
5080     state->re_state_regcc = PL_regcc;
5081     state->re_state_regprecomp = PL_regprecomp;
5082     state->re_state_regnpar = PL_regnpar;
5083
5084     PL_reg_start_tmp = 0;
5085     PL_reg_start_tmpl = 0;
5086     PL_reg_oldsaved = NULL;
5087     PL_reg_oldsavedlen = 0;
5088     PL_reg_maxiter = 0;
5089     PL_reg_leftiter = 0;
5090     PL_reg_poscache = NULL;
5091     PL_reg_poscache_size = 0;
5092
5093     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5094     if (PL_curpm) {
5095         const REGEXP * const rx = PM_GETRE(PL_curpm);
5096         if (rx) {
5097             U32 i;
5098             for (i = 1; i <= rx->nparens; i++) {
5099                 GV *gv;
5100                 char digits[TYPE_CHARS(long)];
5101                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
5102                 GV *const *const gvp
5103                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
5104
5105                 if (gvp && SvTYPE(gv = *gvp) == SVt_PVGV && GvSV(gv)) {
5106                     save_scalar(gv);
5107                 }
5108             }
5109         }
5110     }
5111 }
5112
5113 static void
5114 clear_re(pTHX_ void *r)
5115 {
5116     ReREFCNT_dec((regexp *)r);
5117 }
5118
5119 #ifdef DEBUGGING
5120
5121 STATIC void
5122 S_put_byte(pTHX_ SV *sv, int c)
5123 {
5124     if (isCNTRL(c) || c == 255 || !isPRINT(c))
5125         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5126     else if (c == '-' || c == ']' || c == '\\' || c == '^')
5127         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5128     else
5129         Perl_sv_catpvf(aTHX_ sv, "%c", c);
5130 }
5131
5132
5133 STATIC const regnode *
5134 S_dumpuntil(pTHX_ const regnode *start, const regnode *node,
5135             const regnode *last, SV* sv, I32 l)
5136 {
5137     register U8 op = EXACT;     /* Arbitrary non-END op. */
5138     register const regnode *next;
5139
5140     while (op != END && (!last || node < last)) {
5141         /* While that wasn't END last time... */
5142
5143         NODE_ALIGN(node);
5144         op = OP(node);
5145         if (op == CLOSE)
5146             l--;        
5147         next = regnext((regnode *)node);
5148         /* Where, what. */
5149         if (OP(node) == OPTIMIZED)
5150             goto after_print;
5151         regprop(sv, (regnode *) node);
5152         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5153                       (int)(2*l + 1), "", SvPVX_const(sv));
5154         if (next == NULL)               /* Next ptr. */
5155             PerlIO_printf(Perl_debug_log, "(0)");
5156         else
5157             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5158         (void)PerlIO_putc(Perl_debug_log, '\n');
5159       after_print:
5160         if (PL_regkind[(U8)op] == BRANCHJ) {
5161             register const regnode *nnode = (OP(next) == LONGJMP
5162                                              ? regnext((regnode *)next)
5163                                              : next);
5164             if (last && nnode > last)
5165                 nnode = last;
5166             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5167         }
5168         else if (PL_regkind[(U8)op] == BRANCH) {
5169             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5170         }
5171         else if ( op == CURLY) {   /* "next" might be very big: optimizer */
5172             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5173                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5174         }
5175         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5176             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5177                              next, sv, l + 1);
5178         }
5179         else if ( op == PLUS || op == STAR) {
5180             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5181         }
5182         else if (op == ANYOF) {
5183             /* arglen 1 + class block */
5184             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5185                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5186             node = NEXTOPER(node);
5187         }
5188         else if (PL_regkind[(U8)op] == EXACT) {
5189             /* Literal string, where present. */
5190             node += NODE_SZ_STR(node) - 1;
5191             node = NEXTOPER(node);
5192         }
5193         else {
5194             node = NEXTOPER(node);
5195             node += regarglen[(U8)op];
5196         }
5197         if (op == CURLYX || op == OPEN)
5198             l++;
5199         else if (op == WHILEM)
5200             l--;
5201     }
5202     return node;
5203 }
5204
5205 #endif  /* DEBUGGING */
5206
5207 /*
5208  * Local variables:
5209  * c-indentation-style: bsd
5210  * c-basic-offset: 4
5211  * indent-tabs-mode: t
5212  * End:
5213  *
5214  * ex: set ts=8 sts=4 sw=4 noet:
5215  */